入门数据挖掘3-建模与调参

3. 建模与调参

3.1 载入数据

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# reduce_mem_usage shrinks the memory footprint of a DataFrame by downcasting column dtypes
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of *df* in place and report the memory saved.

    Integer columns are narrowed to the smallest of int8/int16/int32/int64
    whose range strictly contains the column's min and max; float columns are
    narrowed to float16/float32 (else float64) the same way; object columns
    become ``category``. Columns of any other dtype (bool, datetime,
    category, ...) are left untouched.

    Args:
        df: The DataFrame to shrink. It is modified in place.
        use_float16: When True (the default, matching the historical
            behaviour) float columns may be narrowed to float16, which keeps
            only about 3 significant decimal digits. Pass False to stop at
            float32.

    Returns:
        The same DataFrame object, after downcasting.
    """
    bytes_per_mb = 1024 ** 2

    # memory_usage() reports bytes; convert so the "MB" label is truthful.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        type_name = str(col_type)
        if type_name.startswith('int'):
            candidates, limits = int_types, np.iinfo
        elif type_name.startswith('float'):
            candidates, limits = float_types, np.finfo
        else:
            # bool / datetime / category etc.: comparing their min/max with
            # numeric limits is meaningless (or raises), so leave them alone.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Strict comparisons are kept from the original implementation, so a
        # value sitting exactly on a type's limit selects the next wider type.
        for candidate in candidates:
            if c_min > limits(candidate).min and c_max < limits(candidate).max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No narrower type fits (this includes all-NaN columns, whose
            # comparisons are always False): floats fall back to float64,
            # integers keep their current dtype.
            if candidates is float_types:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))
    return df

# Read the feature table from disk, then downcast its dtypes to save memory.
sample_feature = pd.read_csv('data_for_tree.csv')
sample_feature = reduce_mem_usage(sample_feature)

3.2 线性回归

# Every column except 'price', 'brand' and 'model' is used as a model input.
continuous_feature_names = [
    col for col in sample_feature.columns
    if col not in ('price', 'brand', 'model')
]

# Drop rows with missing values, map the '-' placeholder to 0 and
# renumber the rows from 0 so labels and positions coincide.
sample_feature = sample_feature.dropna()
sample_feature = sample_feature.replace('-', 0)
sample_feature = sample_feature.reset_index(drop=True)
sample_feature

# Convert the non-numeric column to a numeric dtype
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)

# Split into the feature matrix and the label vector
train = sample_feature[[*continuous_feature_names, 'price']]
train_X = train.loc[:, continuous_feature_names]
train_y = train.loc[:, 'price']

使用sklearn简单建模

from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises TypeError. For plain least squares it does not
# change the fitted predictions, so it is simply dropped here.
model = LinearRegression()
model = model.fit(train_X, train_y)

绘制特征v_9的值与标签的散点图。从图中可以发现，模型的预测结果（蓝色点）与真实标签（黑色点）的分布差异较大，且部分预测值出现了小于0的情况，说明我们的模型存在一些问题。

from matplotlib import pyplot as plt
# Draw 50 random row labels (repeats possible) so the scatter plot stays readable.
subsample_index = np.random.randint(0, len(train_y), 50)
# True prices in black, model predictions in blue, both plotted against v_9.
plt.scatter(train_X.loc[subsample_index, 'v_9'], train_y.loc[subsample_index], color='black')
plt.scatter(train_X.loc[subsample_index, 'v_9'], model.predict(train_X.loc[subsample_index]), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
print('The predicted price is obvious different from true price')
plt.show()

这是由于咱们在EDA就发现的情况，即price的分布存在长尾问题，所以我们对price进行log变换后再进行训练。

# Train on log(1 + price) to tame the long tail of the price distribution.
train_y_ln = np.log1p(train_y)
model = model.fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# Coefficients sorted from largest to smallest; printed explicitly because a
# bare expression in the middle of a cell/script is silently discarded.
print(sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x: x[1], reverse=True))
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
# expm1 is the exact inverse of log1p; plain exp would leave every
# prediction one unit too high.
plt.scatter(train_X['v_9'][subsample_index], np.expm1(model.predict(train_X.loc[subsample_index])), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
print('The predicted price seems normal after np.log transforming')
plt.show()

看起来，已经比上面那张图要好很多。

3.3 k折交叉验证

在使用训练集对参数进行训练的时候，经常会发现人们通常会将一整个训练集分为三个部分（比如mnist手写训练集）。一般分为：训练集（train_set），评估集（valid_set），测试集（test_set）这三个部分。这其实是为了保证训练效果而特意设置的。其中测试集很好理解，其实就是完全不参与训练的数据，仅仅用来观测测试效果的数据。而训练集和评估集则牵涉到下面的知识了。

因为在实际的训练中，训练的结果对于训练集的拟合程度通常还是挺好的（初始条件敏感），但是对于训练集之外的数据的拟合程度通常就不那么令人满意了。因此我们通常并不会把所有的数据集都拿来训练，而是分出一部分来（这一部分不参加训练）对训练集生成的参数进行测试，相对客观地判断这些参数对训练集之外的数据的符合程度。这种思想就称为交叉验证（Cross Validation）。k折交叉验证则是把训练数据分成k份，每次取其中一份作为评估集、其余k-1份作为训练集，重复k次后综合各折的评估结果。

%matplotlib inline

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')
# Load the data and downcast its dtypes
data = pd.read_csv('data_for_tree.csv')
data = reduce_mem_usage(data)
# Split the rows into training and test partitions via the 'train' flag column
train_data = data.loc[data['train'] == 1]
test_data = data.loc[data['train'] == 0]
# Features: every column except 'price' and 'SaleID'
features = [col for col in data.columns if col not in ('price', 'SaleID')]
# Disabled alternative target: y = np.log(train_data['price']+1)
# Feature matrix and label vector
X = train_data.loc[:, features]
y = train_data.loc[:, 'price']


# k-fold cross-validation
k = 5
# NOTE(review): StratifiedKFold stratifies on the target as if it were a class
# label; for a regression target such as price a plain KFold is the usual
# choice — confirm whether stratification is really intended here.
fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

models = []
scores = []

for train_idx, val_idx in fold.split(X, y):

    model = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=2000)
    # Early stopping is requested through a callback: the
    # `early_stopping_rounds` fit keyword was removed in LightGBM 4.
    model.fit(
        X.iloc[train_idx], y.iloc[train_idx],
        eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)],
    )

    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])

    # Only the `metrics` module is imported, so the function must be
    # qualified; the bare name raised NameError.
    score = metrics.mean_absolute_error(y.iloc[val_idx], val_pred)
    scores.append(score)
    print('--------------')
    print('val mae:', score)

# Average validation MAE over all folds
print('mean val mae:', np.mean(scores))

3.4 嵌入式特征选择

嵌入式特征选择是将特征选择过程与学习器训练过程融为一体，两者在同一个优化过程中完成，即在学习器训练过程中自动地进行了特征选择。

from sklearn.feature_selection import SelectFromModel
# Fit a default LightGBM regressor inside SelectFromModel, capped at 50 features.
clf = lgb.LGBMRegressor()
embeded_lgb_selector = SelectFromModel(estimator=clf, max_features=50)
embeded_lgb_selector.fit(X, y)
# Boolean mask over X's columns marking the features that were kept.
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.columns[embeded_lgb_support].tolist()
print(len(embeded_lgb_feature), 'selected features')

embeded_lgb_feature

但是我用筛选出来的这27个特征重新做5折交叉验证，效果却比不做特征筛选的要差。

3.5 GridSearchCV 网格搜索调参

GridSearchCV：一种调参的方法，当你算法模型效果不是很好时，可以通过该方法来调整参数，通过循环遍历，尝试每一种参数组合，返回最好的得分值的参数组合。

from sklearn.model_selection import GridSearchCV
# Candidate values for each hyper-parameter; the grid is their full cross
# product (5 x 7 x 7 combinations), each scored with 5-fold CV.
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
parameters = dict(objective=objective, num_leaves=num_leaves, max_depth=max_depth)
model = lgb.LGBMRegressor()
clf = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(X, y)
clf.best_params_

3.6 贝叶斯调参

贝叶斯调参：贝叶斯优化是一种用模型找到函数最小值的方法，已经应用于机器学习问题中的超参数搜索，这种方法性能好，同时比随机搜索省时。贝叶斯优化通过基于目标函数的过去评估结果建立替代函数（概率模型），来找到最小化目标函数的值。贝叶斯方法与随机或网格搜索的不同之处在于，它在尝试下一组超参数时，会参考之前的评估结果，因此可以省去很多无用功。

贝叶斯优化问题的四个部分：

+  **目标函数**：即我们想要最小化的内容。在这里，目标函数是机器学习模型使用该组超参数在验证集上的损失。
+  **域空间**：要搜索的超参数的取值范围。
+  **优化算法**：构造替代函数并选择下一个超参数值进行评估的方法。
+  **结果历史记录**：来自目标函数评估的存储结果，包括超参数和验证集上的损失。