机器学习 | Li's Bioinfo-Blog

机器学习基于sklearn(3)--常见分类任务学习器

目的：演示几种常见分类器的使用方法，以及超参数调优时候选超参数的选择 0、示例数据 1 2 3 4 5 6 7 8 9 10 11 12 from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV iris = datasets.load_iris() iris_X, iris_y = iris.data, iris.target iris_X = StandardScaler().fit_transform(iris_X) train_X, test_X, train_y, test_y = train_test_split(iris_X, iris_y, test_size=0.3) train_X.shape, test_X.shape, train_y.shape, test_y.shape # ((105, 4), (45, 4), (105,), (45,)) 1、K近邻 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 from sklearn.neighbors import KNeighborsClassifier model_knn = KNeighborsClassifier() param_grid = {"n_neighbors": [3, 5, 7, 10, 20], "p": [1, 2], "weights": ["uniform", "distance"]} grid_search = GridSearchCV(model_knn, param_grid, cv=5, scoring="accuracy", n_jobs=-1) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) knn_grid_search = grid_search # {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'} # 0.9714285714285715 2、逻辑回归 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 from sklearn.linear_model import LogisticRegression model_logistic = LogisticRegression() param_grid = [ {'penalty' : ['l1', 'l2', 'elasticnet', 'none'], 'C' : [0.01, 0.1, 1, 10, 100], 'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'], 'max_iter' : [1000] } ] grid_search = GridSearchCV(model_logistic, param_grid, cv=5, scoring="accuracy", n_jobs=1) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) logistic_grid_search = grid_search # {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'} # 0.980952380952381 3、贝叶斯算法 1 2 3 4 5 6 from sklearn.naive_bayes import GaussianNB model_bayes = GaussianNB() scores = cross_val_score(model_bayes, train_X, train_y, scoring="accuracy", cv=10) print(scores.mean()) # 0.96 bayes_cv = scores 4、支持向量机 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
16 17 18 from sklearn.svm import SVC model_svm = SVC() param_grid = [ {'C' : [0.01, 0.1, 1, 10, 100], 'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma' : ['scale','auto'] } ] grid_search = GridSearchCV(model_svm, param_grid, cv=5, scoring="accuracy", n_jobs=1) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) svm_grid_search = grid_search # {'C': 100, 'gamma': 'scale', 'kernel': 'linear'} # 0.980952380952381 5、随机森林 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 from sklearn.ensemble import RandomForestClassifier model_rf = RandomForestClassifier() param_grid = [ {'n_estimators' : [100, 200, 300, 500, 1000], 'criterion' : ['gini', 'entropy'], 'max_depth' : [4, 8, 16, 32], 'max_features' : ["sqrt", "log2"] } ] grid_search = GridSearchCV(model_rf, param_grid, cv=5, scoring="accuracy", n_jobs=10) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) rf_grid_search = grid_search # {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500} # 0.9523809523809523 6、梯度提升机 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 from sklearn.ensemble import GradientBoostingClassifier model_gbm = GradientBoostingClassifier() param_grid = [ {'learning_rate' : [0.001, 0.01, 0.1], 'n_estimators' : [100, 200, 300, 500, 1000], 'subsample' : [0.5, 0.7, 1], 'criterion' : ["friedman_mse", "squared_error"] } ] grid_search = GridSearchCV(model_gbm, param_grid, cv=5, scoring="accuracy", n_jobs=10) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) gbm_grid_search = grid_search # {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'n_estimators': 10, 'subsample': 0.5} # 0.9619047619047618 7、XGBoost 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 from xgboost import XGBClassifier model_xgb = XGBClassifier() param_grid = [ {'n_estimators' : [10, 30, 50], 'learning_rate' : [0.01, 0.1], 'subsample' : [0.5, 0.7, 1], 'colsample_bytree' : 
[0.5, 0.7, 1] } ] grid_search = GridSearchCV(model_xgb, param_grid, cv=5, scoring="accuracy", n_jobs=10) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) xgb_grid_search = grid_search # {'colsample_bytree': 1, 'learning_rate': 0.01, 'n_estimators': 10, 'subsample': 0.5} # 0.9619047619047618 简单比较 1 2 3 4 5 6 7 8 9 10 import pandas as pd pd.DataFrame({ "KNN" : knn_grid_search.best_score_, "Logistic" : logistic_grid_search.best_score_, "Bayes" : bayes_cv.mean(), "SVM" : svm_grid_search.best_score_, "RF" : rf_grid_search.best_score_, "GBM" : gbm_grid_search.best_score_, "XGB" : xgb_grid_search.best_score_ }, index=["score"]).T.plot.line() ...

机器学习基于sklearn(4)--常见回归任务学习器

目的：演示几种常见回归器的使用方法，以及超参数调优时候选超参数的选择 0、示例数据 1 2 3 4 5 6 7 8 9 10 11 12 from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV diabetes = datasets.load_diabetes() diabetes_X, diabetes_y = diabetes.data, diabetes.target diabetes_X = StandardScaler().fit_transform(diabetes_X) train_X, test_X, train_y, test_y = train_test_split(diabetes_X, diabetes_y, test_size=0.3) train_X.shape, test_X.shape, train_y.shape, test_y.shape # ((309, 10), (133, 10), (309,), (133,)) 1、K近邻 1 2 3 4 5 6 7 8 9 10 11 12 13 from sklearn.neighbors import KNeighborsRegressor model_knn = KNeighborsRegressor() param_grid = {"n_neighbors": [3, 5, 7, 10, 20], "p": [1, 2], "weights": ["uniform", "distance"]} grid_search = GridSearchCV(model_knn, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) knn_grid_search = grid_search # {'n_neighbors': 10, 'p': 2, 'weights': 'distance'} # -58.18148180421127 2、线性回归 1 2 3 4 5 6 7 from sklearn.linear_model import LinearRegression model_linear = LinearRegression() scores = cross_val_score(model_linear, train_X, train_y, scoring="neg_root_mean_squared_error", cv=10) print(scores.mean()) linear_cv = scores # -56.297486183914245 3、支持向量机 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 from sklearn.svm import SVR model_svm = SVR() param_grid = [ {'C' : [0.01, 0.1, 1, 10, 100], 'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma' : ['scale','auto'] } ] grid_search = GridSearchCV(model_svm, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=1) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) svm_grid_search = grid_search # {'C': 1, 'gamma': 'scale', 'kernel': 'linear'} # -56.48299266830155 4、随机森林 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 
17 18 ## 随机森林 from sklearn.ensemble import RandomForestRegressor model_rf = RandomForestRegressor() param_grid = [ {'n_estimators' : [100, 200, 300, 500, 1000], 'criterion' : ["squared_error", "absolute_error"], #与分类任务有变化 'max_depth' : [4, 8, 16, 32], 'max_features' : ["sqrt", "log2"] } ] grid_search = GridSearchCV(model_rf, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=10) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) rf_grid_search = grid_search # {'criterion': 'squared_error', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 500} # -56.98892728898064 5、梯度提升机 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 from sklearn.ensemble import GradientBoostingRegressor model_gbm = GradientBoostingRegressor() param_grid = [ {'loss' : ['squared_error', 'absolute_error', 'huber', 'quantile'], #与分类任务有变化 'learning_rate' : [0.001, 0.01, 0.1], 'n_estimators' : [100, 200, 300, 500], 'subsample' : [0.5, 0.7, 1] } ] grid_search = GridSearchCV(model_gbm, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=10) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) gbm_grid_search = grid_search # {'learning_rate': 0.01, 'loss': 'absolute_error', 'n_estimators': 500, 'subsample': 0.5} # -57.07526918837941 6、XGBoost 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 from xgboost import XGBRegressor model_xgb = XGBRegressor() param_grid = [ {'n_estimators' : [10, 30, 50], 'learning_rate' : [0.01, 0.1], 'subsample' : [0.5, 0.7, 1], 'colsample_bytree' : [0.5, 0.7, 1] } ] grid_search = GridSearchCV(model_xgb, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=10) grid_search.fit(train_X, train_y) print(grid_search.best_params_) print(grid_search.best_score_) xgb_grid_search = grid_search # {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 50, 'subsample': 0.5} # -58.57262307535045 简单比较 1 2 3 4 5 6 7 8 9 import pandas as pd pd.DataFrame({ "KNN" : 
knn_grid_search.best_score_, "Linear" : linear_cv.mean(), "SVM" : svm_grid_search.best_score_, "RF" : rf_grid_search.best_score_, "GBM" : gbm_grid_search.best_score_, "XGB" : xgb_grid_search.best_score_ }, index=["score"]).T.plot.line() ...

机器学习--自动机器学习工具autogluon

最早是在李沐大神的B站分享中了解到autogluon。它是一个自动机器学习工具，可用于文本、图片识别以及表格任务等。据说效果非常不错——号称3行代码就能打败99%的机器学习模型，甚至有人说它标志着手动调参时代的结束。 ...