一、数据预处理

1. 数据拆分

  • sklearn.model_selection.train_test_split()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd

# load_wine() returns a dict-like object; build a feature table and a
# column-vector target from it.
wine = load_wine()
feats = wine.data
feats_name = wine.feature_names
feats_df = pd.DataFrame(data=feats, columns=feats_name)
targets = wine.target.reshape(-1, 1)

# The positional arguments of train_test_split() are one or more datasets
# with the same number of rows; each may be a pandas DataFrame, a NumPy
# array, or even a plain list.
train_X, test_X = train_test_split(feats_df, test_size=0.2, random_state=42)
train_X.shape, test_X.shape
# ((142, 13), (36, 13))

train_X, test_X, train_y, test_y = train_test_split(
    feats_df, targets, test_size=0.2, random_state=42
)
train_X.shape, test_X.shape, train_y.shape, test_y.shape
# ((142, 13), (36, 13), (142, 1), (36, 1))

# Pass stratify= to split the data stratified by a categorical label column.
train_X, test_X, train_y, test_y = train_test_split(
    feats_df, targets, test_size=0.2, random_state=42, stratify=targets
)
# Class proportions in the train and test targets after the stratified split.
pd.DataFrame(train_y).value_counts(normalize=True)
pd.DataFrame(test_y).value_counts(normalize=True)

2. 数据转换

fit–transform流程,一般返回数组

2.1 数值变量标准化

1
# Boolean mask over the columns: True where the column dtype is float.
float_cols = feats_df.dtypes == "float"
feats_df_num = feats_df.loc[:, float_cols]
  • 均值为0,方差为1的标准化
1
2
3
4
5
from sklearn.preprocessing import StandardScaler
# Standardize each numeric column to zero mean and unit variance.
# Form 1: keep the scaler object in a variable, then fit and transform in one call.
scale = StandardScaler()
scale.fit_transform(feats_df_num)

# Form 2: the same call on a throwaway scaler instance.
StandardScaler().fit_transform(feats_df_num)
  • [0,1]的归一化
1
2
from sklearn.preprocessing import MinMaxScaler
# Rescale each numeric column to the [0, 1] range.
MinMaxScaler().fit_transform(feats_df_num)

2.2 分类变量编码

1
2
3
# Toy categorical features: four samples, two string-valued columns.
cate_df = pd.DataFrame(
    {
        "feat-1": ["A", "A", "B", "C"],
        "feat-2": ["Red", "Blue", "Pink", "Black"],
    }
)
# Toy target column: five samples, three distinct labels.
target_df = pd.DataFrame(
    {"target": ["survive", "dead", "dead", "survive", "unknown"]}
)

标签编码

OrdinalEncoder针对二维数组,所以常用于特征编码

1
2
3
4
5
6
from sklearn.preprocessing import OrdinalEncoder
# Encode every column of the 2-D categorical table as numeric codes,
# independently per column (see the expected output below).
OrdinalEncoder().fit_transform(cate_df)
# array([[0., 3.],
#        [0., 1.],
#        [1., 2.],
#        [2., 0.]])

LabelEncoder针对一维数组,所以常用于目标编码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
from sklearn.preprocessing import LabelEncoder
# LabelEncoder works on a 1-D array, so flatten the single-column frame first.
LabelEncoder().fit_transform(target_df.values.ravel())
# array([1, 0, 0, 1, 2])

## Map the integer codes back to the original categories
# NOTE: here X is the fitted encoder object, not a feature matrix.
X = LabelEncoder().fit(target_df.values.ravel())
Xlabel = X.transform(target_df.values.ravel())
# Decode
X.inverse_transform(Xlabel)
# array(['survive', 'dead', 'dead', 'survive', 'unknown'], dtype=object)

独热编码

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
from sklearn.preprocessing import OneHotEncoder
# fit_transform() returns a sparse matrix here; .toarray() converts it to a
# dense NumPy array for display.
OneHotEncoder().fit_transform(cate_df).toarray()
# pandas alternative: pd.get_dummies(cate_df)

# One-hot encode the single target column: one output column per category.
OneHotEncoder().fit_transform(target_df).toarray()
# array([[0., 1., 0.],
#        [1., 0., 0.],
#        [1., 0., 0.],
#        [0., 1., 0.],
#        [0., 0., 1.]])
# Keep the fitted encoder (named X here) so the one-hot rows can be decoded.
X = OneHotEncoder().fit(target_df)
Xlabel = X.transform(target_df).toarray()
# Decode back to the original categories
X.inverse_transform(Xlabel)
# array([['survive'],
#        ['dead'],
#        ['dead'],
#        ['survive'],
#        ['unknown']], dtype=object)

2.3 NA值处理

1
2
3
4
5
6
7
8
9
# Keep only the float-typed columns of the feature table.
feats_df_num = feats_df.loc[:,feats_df.dtypes=="float"]
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")  # "mean", "median", "most_frequent", "constant"

# Two-step form: fit learns the per-column fill values, transform applies them.
imputer.fit(feats_df_num)
imputer.statistics_
imputer.transform(feats_df_num)  # returns an array
# Fit and transform in a single step
imputer.fit_transform(feats_df_num)

二、建模基础

如下以红酒数据集进行决策树分类预测为例,演示建模的一些基础操作。(决策树等基于树的算法通常不需要对特征做标准化等预处理。)

1
2
3
4
5
6
7
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
  • 加载、拆分数据
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# load_wine() returns a dict-like object; wrap the features in a DataFrame
# and reshape the target into a column vector.
wine = load_wine()
feats = wine.data
feats_name = wine.feature_names
feats_df = pd.DataFrame(data=feats, columns=feats_name)
targets = wine.target.reshape(-1, 1)
feats_df.shape, targets.shape

# Hold out 20% of the rows as the test split.
train_X, test_X, train_y, test_y = train_test_split(
    feats_df, targets, test_size=0.2, random_state=42
)
train_X.shape, test_X.shape, train_y.shape, test_y.shape
  • fit–predict–score三步走
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
from sklearn.tree import DecisionTreeClassifier
tree_classif = DecisionTreeClassifier() # instantiate the model

# Step 1: fit the model on the training split.
tree_classif.fit(train_X, train_y)

# Step 2: predict labels (here for the same training rows).
tree_classif.predict(train_X)

# Step 3: score the model. This is evaluated on the data it was trained on,
# so it does not measure generalization.
tree_classif.score(train_X, train_y)

# Inspect the hyperparameters the estimator was created with.
tree_classif.get_params()
  • 交叉验证

https://scikit-learn.org/stable/modules/model_evaluation.html#function-for-prediction-error-metrics

1
2
3
# 10-fold cross-validated accuracy on the training split.
scores = cross_val_score(tree_classif, train_X, train_y, scoring="accuracy", cv=10)
# Mean and standard deviation of the per-fold scores.
scores.mean(), scores.std()
# (0.8857142857142858, 0.09689042833036099)
  • 超参数优化
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
# Candidate values per hyperparameter; the grid is their Cartesian product
# (3 * 2 * 2 = 12 combinations), each evaluated with 5-fold cross-validation.
param_grid = {"max_depth": [3, 5, 10],
              "min_samples_split": [2, 4],
              "min_samples_leaf": [1, 3]}
grid_search = GridSearchCV(tree_classif, param_grid, cv=5,
                           scoring="accuracy")

grid_search.fit(train_X, train_y)
# Best combination found, its mean cross-validated score, and the mean score
# of every combination.
grid_search.best_params_
grid_search.best_score_
grid_search.cv_results_["mean_test_score"]

# Evaluate the best estimator on the held-out test split.
best_tree_classif = grid_search.best_estimator_
best_tree_classif.score(test_X, test_y)
  • 保存、加载模型
1
2
3
4
import joblib
# Serialize the tuned model to disk.
joblib.dump(best_tree_classif, "test_model.pkl")

# Load it back. NOTE: joblib files are pickle-based, so only load files
# from a trusted source.
my_model = joblib.load("test_model.pkl")

三、关于分类问题

机器学习可分为有监督与无监督两大类,而有监督又可分为回归与分类问题。

其中分类问题又可以细分为二分类与多分类问题。

Multiclass Classification - Atmosera

3.1 二分类示例

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
from sklearn.datasets import load_breast_cancer
from collections import Counter
from sklearn.model_selection import train_test_split

# Binary-classification demo data: the target holds two classes (0 and 1).
bc_set = load_breast_cancer()
Counter(bc_set.target)
# Counter({0: 212, 1: 357})
# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    bc_set.data, bc_set.target, test_size=0.2, random_state=42
)

from sklearn.linear_model import SGDClassifier
# The estimator has to be instantiated before it can be fitted.
sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y_train)
# Hard 0/1 class predictions for the test split.
y_test_predict = sgd_clf.predict(X_test)

(1)二分类评价指标

  • 相关二分类评价指标含义:https://lishensuo.github.io/posts/basic/005%E7%BB%9F%E8%AE%A1%E5%AD%A6%E5%9F%BA%E7%A1%80–%E4%BA%8C%E5%88%86%E7%B1%BB%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BB%B7%E6%8C%87%E6%A0%87/
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# Threshold-dependent metrics are computed from the hard 0/1 predictions.
confusion_matrix(y_test, y_test_predict)

precision_score(y_test, y_test_predict)

recall_score(y_test, y_test_predict)

f1_score(y_test, y_test_predict)

# Ranking metrics (ROC-AUC, average precision) should be given continuous
# scores rather than thresholded labels, so use the classifier's decision
# values for the test split.
y_test_score = sgd_clf.decision_function(X_test)

roc_auc_score(y_test, y_test_score)

average_precision_score(y_test, y_test_score)

# Plot the ROC and precision-recall curves.
# plot_roc_curve / plot_precision_recall_curve are no longer available in
# current scikit-learn releases; the Display classes replace them.
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
RocCurveDisplay.from_estimator(sgd_clf, X_test, y_test)
PrecisionRecallDisplay.from_estimator(sgd_clf, X_test, y_test)

(2)类别不均衡

例如在数据集中阳性标签样本数远远低于阴性标签数,此时可以提高阳性样本的权重,降低阴性样本的权重

1
2
3
4
5
6
# Option 1: assign a weight to each class when instantiating the model.
sgd_clf = SGDClassifier(class_weight="balanced")

# Option 2: assign a weight to each sample when fitting the model.
# compute_sample_weight("balanced", y) derives per-sample weights from the
# class frequencies in y, so samples of the rarer class weigh more.
from sklearn.utils.class_weight import compute_sample_weight
sgd_clf = SGDClassifier()
sample_weight = compute_sample_weight(class_weight="balanced", y=y_train)
sgd_clf.fit(X_train, y_train, sample_weight=sample_weight)

3.2 多分类问题

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
from sklearn.datasets import load_iris
iris_set = load_iris()
Counter(iris_set.target)
# Counter({0: 50, 1: 50, 2: 50})

# Split the iris data itself. Distinct variable names keep the binary
# (breast-cancer) split from the previous section intact and make sure the
# model below is really trained on the three-class data.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris_set.data, iris_set.target, test_size=0.2, random_state=42
)

from sklearn.svm import SVC
svm_clf = SVC()
# Parameter decision_function_shape : {'ovo', 'ovr'}, default='ovr'

svm_clf.fit(X_train_iris, y_train_iris)
svm_clf.score(X_train_iris, y_train_iris)
svm_clf.predict(X_test_iris)

# Alternative: wrap the estimator in an explicit one-vs-one meta-classifier.
# from sklearn.multiclass import OneVsOneClassifier
# svm_clf = OneVsOneClassifier(SVC())