机器学习基于sklearn(3)--常见分类任务学习器
# Goal: demonstrate how to use several common classifiers and which candidate
# values to try when tuning each one's hyper-parameters.
#
# NOTE: train_test_split below is called without random_state, so every run
# draws a different split. The "recorded output" comments in this file come
# from one run of the original write-up and will not be reproduced exactly.

# --- 0. Example data ---
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

# Split first, then fit the scaler on the training part only. Fitting it on
# the full data set (as the original did) lets test-set statistics leak into
# the preprocessing step.
train_X, test_X, train_y, test_y = train_test_split(iris_X, iris_y, test_size=0.3)
scaler = StandardScaler().fit(train_X)
train_X, test_X = scaler.transform(train_X), scaler.transform(test_X)
# iris_X stays a standardized array, as before, now using training statistics.
iris_X = scaler.transform(iris_X)
# The cross-validation folds used below still share this one scaler; wrap the
# scaler and the model in a Pipeline if per-fold isolation is required.

train_X.shape, test_X.shape, train_y.shape, test_y.shape
# ((105, 4), (45, 4), (105,), (45,))

# --- 1. K-nearest neighbours ---
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()
param_grid = {
    "n_neighbors": [3, 5, 7, 10, 20],
    "p": [1, 2],
    "weights": ["uniform", "distance"],
}
grid_search = GridSearchCV(model_knn, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)
print(grid_search.best_score_)
knn_grid_search = grid_search
# Recorded output (original write-up):
# {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}
# 0.9714285714285715

# --- 2. Logistic regression ---
from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression()
# Not every solver supports every penalty. The original grid crossed all
# penalties with all solvers, so most candidates could only fail to fit.
# The grid is split so that each sub-grid contains supported pairs only.
param_grid = [
    # lbfgs / newton-cg / sag: L2 penalty.
    {
        "solver": ["lbfgs", "newton-cg", "sag"],
        "penalty": ["l2"],
        "C": [0.01, 0.1, 1, 10, 100],
        "max_iter": [1000],
    },
    # liblinear and saga: L1 or L2 penalty.
    {
        "solver": ["liblinear", "saga"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10, 100],
        "max_iter": [1000],
    },
    # Elastic net is only available with saga and needs an explicit l1_ratio
    # (the original grid omitted it, so these candidates never trained).
    {
        "solver": ["saga"],
        "penalty": ["elasticnet"],
        "l1_ratio": [0.5],
        "C": [0.01, 0.1, 1, 10, 100],
        "max_iter": [1000],
    },
    # No penalty: C has no effect, so it is not varied here. liblinear does
    # not support an unpenalized fit.
    # NOTE(review): scikit-learn 1.2 deprecated the string 'none' in favour of
    # None (string removed in 1.4) -- switch the literal if running on >= 1.2.
    {
        "solver": ["lbfgs", "newton-cg", "sag", "saga"],
        "penalty": ["none"],
        "max_iter": [1000],
    },
]
grid_search = GridSearchCV(model_logistic, param_grid, cv=5, scoring="accuracy", n_jobs=1)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)
print(grid_search.best_score_)
logistic_grid_search = grid_search
# Recorded output (original write-up, original grid):
# {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
# 0.980952380952381

# --- 3. Naive Bayes ---
from sklearn.naive_bayes import GaussianNB

model_bayes = GaussianNB()
# NOTE: this uses 10 folds while the grid searches use cv=5, so the mean
# below is not measured under exactly the same protocol as their best_score_.
scores = cross_val_score(model_bayes, train_X, train_y, scoring="accuracy", cv=10)
print(scores.mean())
# Recorded output (original write-up):
# 0.96
bayes_cv = scores

# --- 4. Support vector machine ---
from sklearn.svm import SVC

# Grid search over the SVM regularisation strength, kernel and gamma setting.
model_svm = SVC()
param_grid = [
    {
        "C": [0.01, 0.1, 1, 10, 100],
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "gamma": ["scale", "auto"],
    }
]
grid_search = GridSearchCV(
    model_svm, param_grid, cv=5, scoring="accuracy", n_jobs=1
)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)
print(grid_search.best_score_)
svm_grid_search = grid_search
# Recorded output (original write-up):
# {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
# 0.980952380952381

# --- 5. Random forest ---
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
param_grid = [
    {
        "n_estimators": [100, 200, 300, 500, 1000],
        "criterion": ["gini", "entropy"],
        "max_depth": [4, 8, 16, 32],
        "max_features": ["sqrt", "log2"],
    }
]
grid_search = GridSearchCV(
    model_rf, param_grid, cv=5, scoring="accuracy", n_jobs=10
)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)
print(grid_search.best_score_)
rf_grid_search = grid_search
# Recorded output (original write-up):
# {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500}
# 0.9523809523809523

# --- 6. Gradient boosting machine ---
from sklearn.ensemble import GradientBoostingClassifier

model_gbm = GradientBoostingClassifier()
param_grid = [
    {
        "learning_rate": [0.001, 0.01, 0.1],
        "n_estimators": [100, 200, 300, 500, 1000],
        "subsample": [0.5, 0.7, 1],
        "criterion": ["friedman_mse", "squared_error"],
    }
]
grid_search = GridSearchCV(
    model_gbm, param_grid, cv=5, scoring="accuracy", n_jobs=10
)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)
print(grid_search.best_score_)
gbm_grid_search = grid_search
# Recorded output as printed in the original write-up:
# {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'n_estimators': 10, 'subsample': 0.5}
# 0.9619047619047618
# NOTE(review): 'colsample_bytree' and n_estimators=10 are not part of the
# grid above, so this recorded output cannot come from this search. It looks
# like it was copied from the XGBoost section -- re-run to get real values.

# --- 7. XGBoost ---
from xgboost import XGBClassifier

model_xgb = XGBClassifier()
param_grid = [
    {
        "n_estimators": [10, 30, 50],
        "learning_rate": [0.01, 0.1],
        "subsample": [0.5, 0.7, 1],
        "colsample_bytree": [0.5, 0.7, 1],
    }
]
grid_search = GridSearchCV(
    model_xgb, param_grid, cv=5, scoring="accuracy", n_jobs=10
)
grid_search.fit(train_X, train_y)
print(grid_search.best_params_)
print(grid_search.best_score_)
xgb_grid_search = grid_search
# Recorded output (original write-up):
# {'colsample_bytree': 1, 'learning_rate': 0.01, 'n_estimators': 10, 'subsample': 0.5}
# 0.9619047619047618

# --- Quick comparison ---
import pandas as pd

# One cross-validated accuracy per model, in the order the sections appear.
# The Bayes entry is a plain cross_val_score mean; the others are the best
# mean score found by each grid search.
cv_scores = {
    "KNN": knn_grid_search.best_score_,
    "Logistic": logistic_grid_search.best_score_,
    "Bayes": bayes_cv.mean(),
    "SVM": svm_grid_search.best_score_,
    "RF": rf_grid_search.best_score_,
    "GBM": gbm_grid_search.best_score_,
    "XGB": xgb_grid_search.best_score_,
}
pd.DataFrame(cv_scores, index=["score"]).T.plot.line()