3
【sklearn】一次训练几十个模型
source link: https://www.guofei.site/2019/10/01/all_models.html
Visit the source link to read the article, where you can see the images, any updated content, and better typesetting. If the link is broken, please click the button below to view a snapshot taken at archiving time.
【sklearn】一次训练几十个模型
2019年10月01日Author: Guofei
文章归类: 2-1-有监督学习 ,文章编号: 99
版权声明:本文作者是郭飞。转载随意,但需要标明原文链接,并通知本人
原文链接:https://www.guofei.site/2019/10/01/all_models.html
实现功能:
- score自定义
- cv方法自定义
- 输出最优超参数
- 显示较为美观
- 模型中多个超参数一样好的,并列输出
分类
import numpy as np
import pandas as pd
from sklearn import linear_model, tree, naive_bayes, neighbors, svm, neural_network, ensemble
from sklearn import model_selection, datasets, metrics

# Build a synthetic binary-classification dataset.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_redundant=3,  # features generated as linear combinations of the informative ones
    n_repeated=3,   # features duplicated from the informative + redundant ones
    n_classes=2,
    flip_y=0.1,     # fraction of labels randomly flipped (label noise)
)

# Cross-validation scheme shared by every model below.
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)

# Metric used to rank hyper-parameter candidates and models.
scoring = 'accuracy'

# [display name, estimator, hyper-parameter grid] triples.
# An empty dict() grid means "evaluate the estimator with its defaults only".
all_models = [
    ['LogisticRegression', linear_model.LogisticRegression(), dict()],
    ['DecisionTreeClassifier', tree.DecisionTreeClassifier(), dict()],
    ['GaussianNB', naive_bayes.GaussianNB(), dict()],
    # ['MultinomialNB', naive_bayes.MultinomialNB(), dict()],
    # NOTE(review): MultinomialNB was disabled in the original — presumably because
    # make_classification produces negative feature values it cannot handle; confirm.
    ['BernoulliNB', naive_bayes.BernoulliNB(), dict()],
    ['KNeighborsClassifier', neighbors.KNeighborsClassifier(), dict()],
    ['LinearSVC', svm.LinearSVC(), dict()],
    ['SVC', svm.SVC(), dict()],
    ['MLPClassifier', neural_network.MLPClassifier(), dict()],
    ['AdaBoostClassifier', ensemble.AdaBoostClassifier(), dict()],
    ['GradientBoostingClassifier', ensemble.GradientBoostingClassifier(), dict()],
    ['RandomForestClassifier', ensemble.RandomForestClassifier(), dict()],
]
# Grid-search every model; append its best CV results in place onto each triple.
for models in all_models:
    # Unpack the [name, estimator, grid] triple instead of indexing it piecemeal.
    model_name, model, param_grid = models
    gscv = model_selection.GridSearchCV(
        estimator=model,
        scoring=scoring,          # ranking metric
        param_grid=param_grid,
        cv=cv,                    # CV scheme defined above
        return_train_score=True,  # train scores are not returned by default
        refit=False,              # default True refits the best model on all data
                                  # (best model then available as gscv.best_estimator_)
        n_jobs=-1,
    )
    gscv.fit(X, y)
    cv_results_ = gscv.cv_results_
    # Boolean mask of the top-ranked candidate(s); ties (rank 1 shared) are all kept,
    # so equally good hyper-parameter settings are reported side by side.
    idx_best_test_score = (cv_results_['rank_test_score'] == 1)
    models.extend([
        gscv.best_score_,
        cv_results_['mean_test_score'][idx_best_test_score],
        cv_results_['std_test_score'][idx_best_test_score],
        cv_results_['mean_train_score'][idx_best_test_score],
        cv_results_['std_train_score'][idx_best_test_score],
        np.array(cv_results_['params'])[idx_best_test_score],
        cv_results_,
    ])
    print(model_name, gscv.best_score_)
# %% Output: collect every model's results into a DataFrame, best model first.
result_columns = [
    'model_name', 'model', 'param_grid', 'best_score_',
    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score',
    'best_params', 'cv_results_',
]
df = pd.DataFrame(all_models, columns=result_columns)
df.sort_values('best_score_', ascending=False, inplace=True)

# Compact summary view without the bulky estimator / cv_results_ columns.
summary_columns = [
    'model_name', 'best_score_',
    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score',
    'best_params',
]
df_describe = df[summary_columns]
df_describe
用最好的模型重新全量训练
# Refit the top-ranked model on the full dataset (df is sorted best-first).
best_model = df.iloc[0]['model']
best_model.fit(X, y)
回归
import numpy as np
import pandas as pd
from sklearn import linear_model, tree, naive_bayes, neighbors, svm, neural_network
from sklearn import ensemble
from sklearn import model_selection
from sklearn import datasets  # fixed: `datasets` was imported twice in the original
from sklearn import metrics

# Build a synthetic regression dataset.
X, y, coef = datasets.make_regression(
    n_samples=10000,
    n_features=5,
    n_informative=3,  # only 3 of the 5 features carry signal
    n_targets=1,      # number of regression targets
    bias=1,           # intercept of the underlying linear model
    coef=True,        # when True, the ground-truth coefficients are also returned
    noise=1,
    tail_strength=1,
)

# Cross-validation scheme shared by every model below.
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)

# Metric used to rank hyper-parameter candidates and models.
scoring = 'r2'

# [display name, estimator, hyper-parameter grid] triples.
# An empty dict() grid means "evaluate the estimator with its defaults only".
all_models = [
    ['LinearRegression', linear_model.LinearRegression(), dict()],
    ['Ridge', linear_model.Ridge(), dict()],
    ['Lasso', linear_model.Lasso(), dict()],
    ['ElasticNet', linear_model.ElasticNet(), dict()],
    ['DecisionTreeRegressor', tree.DecisionTreeRegressor(), dict()],
    ['KNeighborsRegressor', neighbors.KNeighborsRegressor(), dict()],
    ['LinearSVR', svm.LinearSVR(), dict()],
    ['SVR', svm.SVR(), dict()],
    # fixed: label was 'neural_network', inconsistent with every other entry,
    # which uses the estimator class name
    ['MLPRegressor', neural_network.MLPRegressor(), dict()],
    ['AdaBoostRegressor', ensemble.AdaBoostRegressor(), dict()],
    ['GradientBoostingRegressor', ensemble.GradientBoostingRegressor(), dict()],
    ['RandomForestRegressor', ensemble.RandomForestRegressor(), dict()],
]
# Grid-search every model; append its best CV results in place onto each triple.
for models in all_models:
    # Unpack the [name, estimator, grid] triple instead of indexing it piecemeal.
    model_name, model, param_grid = models
    gscv = model_selection.GridSearchCV(
        estimator=model,
        scoring=scoring,          # ranking metric
        param_grid=param_grid,
        cv=cv,                    # CV scheme defined above
        return_train_score=True,  # train scores are not returned by default
        refit=False,              # default True refits the best model on all data
                                  # (best model then available as gscv.best_estimator_)
        n_jobs=-1,
    )
    gscv.fit(X, y)
    cv_results_ = gscv.cv_results_
    # Boolean mask of the top-ranked candidate(s); ties (rank 1 shared) are all kept,
    # so equally good hyper-parameter settings are reported side by side.
    idx_best_test_score = (cv_results_['rank_test_score'] == 1)
    models.extend([
        gscv.best_score_,
        cv_results_['mean_test_score'][idx_best_test_score],
        cv_results_['std_test_score'][idx_best_test_score],
        cv_results_['mean_train_score'][idx_best_test_score],
        cv_results_['std_train_score'][idx_best_test_score],
        np.array(cv_results_['params'])[idx_best_test_score],
        cv_results_,
    ])
    print(model_name, gscv.best_score_)
# %% Output: collect every model's results into a DataFrame, best model first.
result_columns = [
    'model_name', 'model', 'param_grid', 'best_score_',
    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score',
    'best_params', 'cv_results_',
]
df = pd.DataFrame(all_models, columns=result_columns)
df.sort_values('best_score_', ascending=False, inplace=True)

# Compact summary view without the bulky estimator / cv_results_ columns.
summary_columns = [
    'model_name', 'best_score_',
    'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score',
    'best_params',
]
df_describe = df[summary_columns]
df_describe
全量数据训练最佳模型
# Refit the top-ranked model on the full dataset (df is sorted best-first).
best_model = df.iloc[0]['model']
best_model.fit(X, y)
您的支持将鼓励我继续创作!
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK