一、数据预处理

1. 数据拆分

sklearn.model_selection.train_test_split()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd

# load_wine() returns a dict-like Bunch object
wine = load_wine()
feats = wine["data"]
feats_name = wine["feature_names"]
feats_df = pd.DataFrame(data=feats, columns=feats_name)
targets = wine["target"].reshape(-1, 1)

# train_test_split() accepts one or several datasets (same number of rows)
# as positional arguments: pandas DataFrames, numpy arrays or even lists.
train_X, test_X = train_test_split(
    feats_df, test_size=0.2, random_state=42
)
train_X.shape, test_X.shape
# ((142, 13), (36, 13))

train_X, test_X, train_y, test_y = train_test_split(
    feats_df, targets, test_size=0.2, random_state=42
)
train_X.shape, test_X.shape, train_y.shape, test_y.shape
# ((142, 13), (36, 13), (142, 1), (36, 1))

# Pass stratify= to split the data stratified by a categorical label column,
# so both subsets keep the same class proportions.
train_X, test_X, train_y, test_y = train_test_split(
    feats_df, targets, test_size=0.2, random_state=42, stratify=targets
)
pd.DataFrame(train_y).value_counts(normalize=True)
pd.DataFrame(test_y).value_counts(normalize=True)

2. 数据转换

fit–transform流程，一般返回数组

2.1 数值变量标准化

1

# Keep only the columns whose dtype is float (the numeric features)
feats_df_num = feats_df.loc[:,feats_df.dtypes=="float"]

均值为0，方差为1的标准化

1
2
3
4
5


from sklearn.preprocessing import StandardScaler
# Two-step form: create the scaler, then fit it and transform the data
scale = StandardScaler()
scale.fit_transform(feats_df_num)

# Equivalent one-liner when the fitted scaler is not needed afterwards
StandardScaler().fit_transform(feats_df_num)

[0,1]的归一化

1
2


from sklearn.preprocessing import MinMaxScaler
# Rescale every column to the [0, 1] range
MinMaxScaler().fit_transform(feats_df_num)

2.2 分类变量编码

1
2
3


# Toy categorical features (two columns) used by the encoder examples
cate_df = pd.DataFrame({"feat-1":["A","A","B","C"],
					    "feat-2":["Red","Blue","Pink","Black"]})
# Toy categorical target (one column, three distinct classes)
target_df = pd.DataFrame({"target":["survive","dead","dead","survive","unknown"]})

标签编码

OrdinalEncoder针对二维数组，所以常用于特征编码

1
2
3
4
5
6


from sklearn.preprocessing import OrdinalEncoder
# Encode each column's categories as integer codes (one code set per column)
OrdinalEncoder().fit_transform(cate_df)
# array([[0., 3.],
#        [0., 1.],
#        [1., 2.],
#        [2., 0.]])

LabelEncoder针对一维数组，所以常用于目标编码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


from sklearn.preprocessing import LabelEncoder
# ravel() flattens the single-column frame into the 1-D array LabelEncoder expects
LabelEncoder().fit_transform(target_df.values.ravel())
# array([1, 0, 0, 1, 2])

# Map the encoded labels back to the original classes:
# keep the fitted encoder so inverse_transform() can be called on it
X = LabelEncoder().fit(target_df.values.ravel())
Xlabel = X.transform(target_df.values.ravel())
# Restore the original labels
X.inverse_transform(Xlabel)
# array(['survive', 'dead', 'dead', 'survive', 'unknown'], dtype=object)

独热编码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19


from sklearn.preprocessing import OneHotEncoder
# toarray() converts the encoder's output into a dense numpy array
OneHotEncoder().fit_transform(cate_df).toarray()
# pandas alternative: pd.get_dummies(cate_df)

OneHotEncoder().fit_transform(target_df).toarray()
# array([[0., 1., 0.],
#        [1., 0., 0.],
#        [1., 0., 0.],
#        [0., 1., 0.],
#        [0., 0., 1.]])
# Keep the fitted encoder so the one-hot matrix can be decoded again
X = OneHotEncoder().fit(target_df)
Xlabel = X.transform(target_df).toarray()
# Restore the original labels
X.inverse_transform(Xlabel)
# array([['survive'],
#        ['dead'],
#        ['dead'],
#        ['survive'],
#        ['unknown']], dtype=object)

2.3 NA值处理

1
2
3
4
5
6
7
8
9


# Keep only the columns whose dtype is float (the numeric features)
feats_df_num = feats_df.loc[:,feats_df.dtypes=="float"]
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")  # options: "mean", "median", "most_frequent", "constant"

# Learn the per-column fill values, inspect them, then fill the missing entries
imputer.fit(feats_df_num)
imputer.statistics_
imputer.transform(feats_df_num)  # returns an array
# Fit and transform in a single step
imputer.fit_transform(feats_df_num)

二、建模基础

下面以红酒数据集的决策树分类预测为例，演示建模的一些基础操作。（决策树等基于树的算法通常不需要对特征进行标准化等预处理。）

1
2
3
4
5
6
7


import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

加载、拆分数据

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


# load_wine() returns a dict-like Bunch object
wine = load_wine()
feats = wine["data"]
feats_name = wine["feature_names"]
feats_df = pd.DataFrame(data=feats, columns=feats_name)
targets = wine["target"].reshape(-1, 1)
feats_df.shape, targets.shape

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
train_X, test_X, train_y, test_y = train_test_split(
    feats_df, targets, test_size=0.2, random_state=42
)
train_X.shape, test_X.shape, train_y.shape, test_y.shape

fit–predict–score三步走

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


from sklearn.tree import DecisionTreeClassifier
tree_classif = DecisionTreeClassifier()  # instantiate the model

# Step 1: fit the model on the training data
tree_classif.fit(train_X, train_y)

# Step 2: predict labels (here for the training samples themselves)
tree_classif.predict(train_X)

# Step 3: score the model; this is the score on the training set,
# not an estimate of performance on unseen data
tree_classif.score(train_X, train_y)

# Inspect the hyperparameters of the estimator
tree_classif.get_params()

交叉验证

https://scikit-learn.org/stable/modules/model_evaluation.html#function-for-prediction-error-metrics

1
2
3


# 10-fold cross-validation on the training set, scored by accuracy
scores = cross_val_score(tree_classif, train_X, train_y, scoring="accuracy", cv=10)
# Mean and spread of the per-fold scores
scores.mean(), scores.std()
# (0.8857142857142858, 0.09689042833036099)

超参数优化

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


# Candidate values per hyperparameter; every combination is evaluated
param_grid = {"max_depth": [3, 5, 10],
              "min_samples_split": [2, 4],
              "min_samples_leaf": [1, 3]}
# Exhaustive search with 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(tree_classif, param_grid, cv=5,
                           scoring="accuracy")

grid_search.fit(train_X, train_y)
# Best combination, its mean CV score, and the mean CV score of every combination
grid_search.best_params_
grid_search.best_score_
grid_search.cv_results_["mean_test_score"]

# Evaluate the best estimator on the held-out test set
best_tree_classif = grid_search.best_estimator_
best_tree_classif.score(test_X, test_y)

保存、加载模型

1
2
3
4


import joblib
# Persist the tuned model to disk
joblib.dump(best_tree_classif, "test_model.pkl")

# Load it back. NOTE: joblib files are pickle-based; only load files you trust.
my_model = joblib.load("test_model.pkl")

三、关于分类问题

机器学习可分为有监督与无监督两大类，而有监督又可分为回归与分类问题。

其中分类问题又可以细分为二分类与多分类问题。

一方面：二分类问题的评价指标更加多样，例如召回率、AUC值等

https://scikit-learn.org/stable/modules/model_evaluation.html#function-for-prediction-error-metrics
另一方面：有些分类器本身仅支持二分类问题(如SVM等；朴素贝叶斯等则天然支持多分类)，但可通过OvO或者OvR策略间接处理多分类问题。

所以理论上sklearn的分类器模型均可以处理二分类/多分类问题

3.1 二分类示例

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


from sklearn.datasets import load_breast_cancer
from collections import Counter
from sklearn.model_selection import train_test_split

# Load the binary breast-cancer dataset and inspect its class balance
bc_set = load_breast_cancer()
Counter(bc_set.target)
# Counter({0: 212, 1: 357})

# Hold out 20% of the samples as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    bc_set.data, bc_set.target, test_size=0.2, random_state=42
)

from sklearn.linear_model import SGDClassifier
# The estimator has to be instantiated before fit() can be called on it;
# random_state makes the stochastic training reproducible.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
y_test_predict = sgd_clf.predict(X_test)

（1）二分类评价指标

相关二分类评价指标含义：https://lishensuo.github.io/posts/basic/005%E7%BB%9F%E8%AE%A1%E5%AD%A6%E5%9F%BA%E7%A1%80–%E4%BA%8C%E5%88%86%E7%B1%BB%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BB%B7%E6%8C%87%E6%A0%87/

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)

# Threshold-based metrics are computed from the hard 0/1 predictions
confusion_matrix(y_test, y_test_predict)

precision_score(y_test, y_test_predict)

recall_score(y_test, y_test_predict)

f1_score(y_test, y_test_predict)

# Ranking metrics (ROC-AUC, average precision) need continuous scores:
# feeding them hard 0/1 predictions collapses the curve to a single threshold.
y_test_score = sgd_clf.decision_function(X_test)

roc_auc_score(y_test, y_test_score)

average_precision_score(y_test, y_test_score)

# Plot the ROC and precision-recall curves.
# plot_roc_curve / plot_precision_recall_curve were removed from scikit-learn;
# the Display.from_estimator() class methods are their replacement.
RocCurveDisplay.from_estimator(sgd_clf, X_test, y_test)
PrecisionRecallDisplay.from_estimator(sgd_clf, X_test, y_test)

（2）类别不均衡

例如在数据集中阳性标签样本数远远低于阴性标签数，此时可以提高阳性样本的权重，降低阴性样本的权重

1
2
3
4
5
6


# Option 1: assign a weight to every class when instantiating the model
sgd_clf = SGDClassifier(class_weight="balanced")

# Option 2: assign a weight to every sample when fitting the model.
# sample_weight is an array with one entry per training sample; here each
# sample gets the "balanced" weight of its class, so rarer classes count more.
from sklearn.utils.class_weight import compute_sample_weight
sgd_clf = SGDClassifier()
sample_weight = compute_sample_weight(class_weight="balanced", y=y_train)
sgd_clf.fit(X_train, y_train, sample_weight=sample_weight)

3.2 多分类问题

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris_set = load_iris()
Counter(iris_set.target)
# Counter({0: 50, 1: 50, 2: 50})

# Split the iris data itself so the model below is trained on the
# three-class problem; stratify keeps the class proportions in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    iris_set.data, iris_set.target,
    test_size=0.2, random_state=42, stratify=iris_set.target
)

from sklearn.svm import SVC
svm_clf = SVC()
# Parameter decision_function_shape : {'ovo', 'ovr'}, default='ovr'

svm_clf.fit(X_train, y_train)
svm_clf.score(X_train, y_train)
svm_clf.predict(X_test)

# Alternative: wrap the binary classifier in an explicit one-vs-one strategy
# from sklearn.multiclass import OneVsOneClassifier
# svm_clf = OneVsOneClassifier(SVC())

一、数据预处理#

1. 数据拆分#

2. 数据转换#

2.1 数值变量标准化#

2.2 分类变量编码#

标签编码#

独热编码#

2.3 NA值处理#

二、建模基础#

三、关于分类问题#

3.1 二分类示例#

（1）二分类评价指标#

（2）类别不均衡#

3.2 多分类问题#