Purpose: demonstrate how to use several common classifiers and how to choose candidate hyperparameters when tuning them.

0. Example Data

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target
iris_X = StandardScaler().fit_transform(iris_X)
train_X, test_X, train_y, test_y = train_test_split(iris_X, iris_y, test_size=0.3)
train_X.shape, test_X.shape, train_y.shape, test_y.shape
# ((105, 4), (45, 4), (105,), (45,))
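
Note that StandardScaler above is fit on the full dataset before the split, so the test rows leak into the scaling statistics. A leak-free variant (a minimal sketch on the same iris data; the names and random_state are illustrative) wraps the scaler and model in a Pipeline, so scaling is re-fit on each training fold during cross-validation:

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# split the *unscaled* data; the scaler lives inside the pipeline
raw_train_X, raw_test_X, raw_train_y, raw_test_y = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)

pipe = Pipeline([("scale", StandardScaler()),
                 ("knn", KNeighborsClassifier())])
# step parameters are addressed as "<step_name>__<param_name>"
pipe_search = GridSearchCV(pipe, {"knn__n_neighbors": [3, 5, 7, 10, 20]},
                           cv=5, scoring="accuracy")
pipe_search.fit(raw_train_X, raw_train_y)

The sections below keep the original pre-scaled split for simplicity.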

1. K-Nearest Neighbors (KNN)

from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()
param_grid = {"n_neighbors": [3, 5, 7, 10, 20],
              "p": [1, 2],    # Minkowski exponent: p=1 Manhattan, p=2 Euclidean
              "weights": ["uniform", "distance"]}
grid_search = GridSearchCV(model_knn, param_grid, cv=5,
                           scoring="accuracy", n_jobs=-1)
grid_search.fit(train_X, train_y)    

print(grid_search.best_params_)
print(grid_search.best_score_)
knn_grid_search = grid_search
# {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}
# 0.9714285714285715
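
best_score_ above is the mean cross-validated accuracy on the training split only. Since GridSearchCV refits the best combination on the whole training set by default (refit=True), the held-out test split gives a quick unbiased check (a small sketch):

# score the refitted best model on the untouched 30% test split
print(knn_grid_search.best_estimator_.score(test_X, test_y))
# equivalently: knn_grid_search.score(test_X, test_y)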

2. Logistic Regression

from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression()
# Not every penalty works with every solver, so the grid is split into
# solver-compatible sub-grids; the no-penalty option is omitted because
# its spelling ('none' vs None) differs across sklearn versions
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [1000]},
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2'],
     'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [1000]},
    {'solver': ['saga'], 'penalty': ['l1', 'l2'],
     'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [1000]},
    # l1_ratio is required for elasticnet; 0.5 is an arbitrary midpoint
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5],
     'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [1000]},
]
grid_search = GridSearchCV(model_logistic, param_grid, cv=5,
                           scoring="accuracy", n_jobs=1)
grid_search.fit(train_X, train_y)          

print(grid_search.best_params_)
print(grid_search.best_score_)
logistic_grid_search = grid_search 
# {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
# 0.980952380952381
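
Besides the single winner, cv_results_ records every combination that was tried; loading it into a DataFrame shows how close the runners-up are (a minimal sketch):

import pandas as pd

# one row per hyperparameter combination, best-ranked first
results = pd.DataFrame(logistic_grid_search.cv_results_)
cols = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
print(results[cols].sort_values("rank_test_score").head(10))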

3. Naive Bayes

from sklearn.naive_bayes import GaussianNB

# GaussianNB has essentially nothing to tune, so a plain
# cross-validated score is reported instead of a grid search
model_bayes = GaussianNB()
scores = cross_val_score(model_bayes, train_X, train_y, scoring="accuracy", cv=10)
print(scores.mean())
# 0.96
bayes_cv = scores
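
If a search is wanted anyway, GaussianNB's one knob is var_smoothing; a sketch with an assumed logarithmic candidate range around its 1e-9 default:

import numpy as np

# var_smoothing adds a fraction of the largest feature variance for numerical stability
param_grid = {"var_smoothing": np.logspace(-12, -6, 7)}
bayes_search = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring="accuracy")
bayes_search.fit(train_X, train_y)
print(bayes_search.best_params_, bayes_search.best_score_)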

4. Support Vector Machine (SVM)

from sklearn.svm import SVC

model_svm = SVC()
param_grid = [
    {'C' : [0.01, 0.1, 1, 10, 100],
     'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
     'gamma'  : ['scale', 'auto']   # ignored by the linear kernel
    }
]
grid_search = GridSearchCV(model_svm, param_grid, cv=5,
                           scoring="accuracy", n_jobs=1)
grid_search.fit(train_X, train_y)      

print(grid_search.best_params_)
print(grid_search.best_score_)
svm_grid_search = grid_search
# {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
# 0.980952380952381

5. Random Forest

from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
param_grid = [    
    {'n_estimators' : [100, 200, 300, 500, 1000],
     'criterion' : ['gini', 'entropy'],
     'max_depth'  : [4, 8, 16, 32],
     'max_features' : ["sqrt", "log2"]
    }
]
grid_search = GridSearchCV(model_rf, param_grid, cv=5,
                           scoring="accuracy", n_jobs=10)
grid_search.fit(train_X, train_y)   

print(grid_search.best_params_)
print(grid_search.best_score_)
rf_grid_search = grid_search
# {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500}
# 0.9523809523809523
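
This grid already has 5 × 2 × 4 × 2 = 80 combinations (400 fits at cv=5). When the grid grows further, RandomizedSearchCV samples a fixed budget of combinations instead of trying all of them; a sketch with an assumed budget of 20 draws:

from sklearn.model_selection import RandomizedSearchCV

# n_iter caps the number of sampled combinations regardless of grid size
random_search = RandomizedSearchCV(
    RandomForestClassifier(), param_grid[0], n_iter=20,
    cv=5, scoring="accuracy", n_jobs=-1, random_state=0)
random_search.fit(train_X, train_y)
print(random_search.best_params_, random_search.best_score_)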

6. Gradient Boosting Machine (GBM)

from sklearn.ensemble import GradientBoostingClassifier

model_gbm = GradientBoostingClassifier()
param_grid = [    
    {'learning_rate' : [0.001, 0.01, 0.1],
     'n_estimators' : [100, 200, 300, 500, 1000],
     'subsample'  : [0.5, 0.7, 1],
     'criterion' : ["friedman_mse", "squared_error"]
    }
]
grid_search = GridSearchCV(model_gbm, param_grid, cv=5,
                           scoring="accuracy", n_jobs=10)
grid_search.fit(train_X, train_y)      

print(grid_search.best_params_)
print(grid_search.best_score_)
gbm_grid_search = grid_search
# 0.9619047619047618
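
Searching n_estimators up to 1000 is expensive; GradientBoostingClassifier can instead stop adding trees once an internal validation score stalls, via its n_iter_no_change and validation_fraction options (a sketch with assumed settings):

# hold out 20% of the training data internally; stop after 10 rounds without improvement
model_gbm_es = GradientBoostingClassifier(
    n_estimators=1000, learning_rate=0.1,
    validation_fraction=0.2, n_iter_no_change=10, random_state=0)
model_gbm_es.fit(train_X, train_y)
print(model_gbm_es.n_estimators_)   # trees actually fitted, usually far below 1000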

7. XGBoost

from xgboost import XGBClassifier

model_xgb = XGBClassifier()
param_grid = [    
    {'n_estimators' : [10, 30, 50],
     'learning_rate' : [0.01, 0.1],
     'subsample'  : [0.5, 0.7, 1],
     'colsample_bytree' : [0.5, 0.7, 1]
    }
]
grid_search = GridSearchCV(model_xgb, param_grid, cv=5,
                           scoring="accuracy", n_jobs=10)
grid_search.fit(train_X, train_y)           

print(grid_search.best_params_)
print(grid_search.best_score_)
xgb_grid_search = grid_search
# {'colsample_bytree': 1, 'learning_rate': 0.01, 'n_estimators': 10, 'subsample': 0.5}
# 0.9619047619047618
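
The refitted best model also exposes per-feature importances, a quick sanity check on what the boosted trees rely on (a small sketch):

best_xgb = xgb_grid_search.best_estimator_
# relative importance of the four iris features
for name, score in zip(iris.feature_names, best_xgb.feature_importances_):
    print(f"{name}: {score:.3f}")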

Quick Comparison

import pandas as pd

# gather each model's cross-validated accuracy into one table and plot it
pd.DataFrame({
    "KNN" : knn_grid_search.best_score_,
    "Logistic" : logistic_grid_search.best_score_,
    "Bayes" : bayes_cv.mean(),
    "SVM" : svm_grid_search.best_score_,
    "RF" : rf_grid_search.best_score_,
    "GBM" : gbm_grid_search.best_score_,
    "XGB" : xgb_grid_search.best_score_
}, index=["score"]).T.plot.line()
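
Two caveats when reading this chart: the Bayes entry is a 10-fold CV mean while the rest are 5-fold best scores, and best_score_ is mildly optimistic because it is the maximum over each grid. Since the x-axis is categorical, .plot.bar() may also read more naturally than .plot.line().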

(Figure: plot comparing the cross-validated accuracy of the seven classifiers)