1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the wine dataset; the returned object is dict-like and indexed by key.
wine = load_wine()
feats = wine["data"]
feats_name = wine["feature_names"]
feats_df = pd.DataFrame(data=feats, columns=feats_name)
# Turn the label vector into a single-column 2-D array.
targets = wine["target"].reshape(-1, 1)

# train_test_split() accepts one dataset or several datasets (all with the
# same number of rows) as its leading positional arguments. Each dataset may
# be a pandas DataFrame, a NumPy array, or even a plain list.

# 1) Split a single dataset (features only).
train_X, test_X = train_test_split(
    feats_df,
    test_size=0.2,
    random_state=42,
)
(train_X.shape, test_X.shape)
# ((142, 13), (36, 13))

# 2) Split features and labels together so their rows stay aligned.
train_X, test_X, train_y, test_y = train_test_split(
    feats_df,
    targets,
    test_size=0.2,
    random_state=42,
)
(train_X.shape, test_X.shape, train_y.shape, test_y.shape)
# ((142, 13), (36, 13), (142, 1), (36, 1))

# 3) Pass `stratify` to perform a stratified split on a categorical column
#    (here the class labels).
train_X, test_X, train_y, test_y = train_test_split(
    feats_df,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)
# Relative frequency of each class label in the training and test labels.
pd.DataFrame(train_y).value_counts(normalize=True)
pd.DataFrame(test_y).value_counts(normalize=True)
|