https://huggingface.co/docs/datasets/index

A Dataset provides fast random access to the rows, and memory-mapping so that loading even large datasets only uses a relatively small amount of device memory.

But for really, really big datasets ( > 100G) that won’t even fit on disk or in memory, an IterableDataset allows you to access and use the dataset without waiting for it to download completely!
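A minimal sketch of the streaming path, assuming some dataset hosted on the Hub (the id below is a placeholder): passing streaming=True to load_dataset returns an IterableDataset that yields examples lazily instead of downloading everything first.

from datasets import load_dataset

# streaming=True returns an IterableDataset instead of a regular Dataset
# ("username/some-large-dataset" is a placeholder Hub id, not a real dataset)
streamed = load_dataset("username/some-large-dataset", split="train", streaming=True)

# examples are fetched lazily as you iterate
print(next(iter(streamed)))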

1. Loading

1.1 Loading a JSON file

## Download and unzip into the local datasets folder beforehand
# wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
# wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
# gzip -dkv SQuAD_it-*.json.gz

from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="./datasets/SQuAD_it-train.json", field="data")
# The field argument is specific to JSON files: it names the key in the JSON file that holds the actual data
squad_it_dataset # loaded as a single "train" split by default
squad_it_dataset.keys()
# dict_keys(['train'])
squad_it_dataset
# DatasetDict({
#     train: Dataset({
#         features: ['title', 'paragraphs'],
#         num_rows: 442
#     })
# })

1.2 Other ways to load JSON

# (1) Load both splits at once
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

# (2) Load gzip-compressed files directly
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

# (3) Load from remote URLs
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

data_files can also map each split to a list of files or a glob pattern, as in the sketch below; see https://huggingface.co/docs/datasets/loading
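A minimal sketch of that, using hypothetical shard file names; data_files also accepts glob patterns given as strings:

from datasets import load_dataset

# a split can point to a list of files (hypothetical shard names)
data_files = {
    "train": ["SQuAD_it-train-part1.json", "SQuAD_it-train-part2.json"],
    "test": "SQuAD_it-test.json",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

# glob patterns work as well, e.g. data_files={"train": "SQuAD_it-train-*.json"}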

1.3 Basic table operations

squad_it_dataset["train"].column_names   # column names
squad_it_dataset["train"].features       # detailed type information for each column

squad_it_dataset["train"][0]             # first row

squad_it_dataset["train"]["title"]       # the "title" column

Other file formats are also supported: CSV, Parquet, Arrow, SQL, etc.

  • CSV here covers delimiter-separated tabular files in general; the delimiter is configurable, e.g. TSV
  • Parquet: Large datasets may be stored in a Parquet file because it is more efficient and faster at returning your query.
  • Arrow: the format the Datasets library itself uses under the hood (see the sketch after this list)
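As a minimal sketch (the file names below are placeholders), the loader is selected by the first argument to load_dataset:

from datasets import load_dataset

# CSV (placeholder file name; the delimiter defaults to ",")
csv_ds = load_dataset("csv", data_files="my_table.csv")

# Parquet (placeholder file name)
parquet_ds = load_dataset("parquet", data_files="my_table.parquet")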

In-memory Python objects are also supported: lists of dicts, dicts of lists, pandas DataFrames, generators, etc. (a generator sketch follows the block below).

from datasets import Dataset
## From a list of dicts
my_list = [{"a": 1}, {"a": 2}, {"a": 3}]
dataset = Dataset.from_list(my_list)
# equivalent, starting from a dict of columns
my_dict = {"a": [1, 2, 3]}
dataset = Dataset.from_dict(my_dict)

## Pandas
import pandas as pd
df = pd.DataFrame({"a": [1, 2, 3]})
dataset = Dataset.from_pandas(df)
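
A minimal sketch of the generator case mentioned above, using Dataset.from_generator with a toy generator:

from datasets import Dataset

# a generator that yields one example (a dict) at a time
def gen():
    for i in range(3):
        yield {"a": i}

dataset = Dataset.from_generator(gen)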

2. Processing

Using a CSV (TSV) file as the running example

## Download and unzip into the local datasets folder beforehand
# wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# unzip drugsCom_raw.zip

from datasets import load_dataset

data_files = {"train": "datasets/drugsComTrain_raw.tsv", "test": "datasets/drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

2.1 Row operations

## Basic inspection
drug_dataset["train"][:3]
drug_dataset["train"].select(range(1000)) # by index

## Shuffle
drug_dataset["train"].shuffle(seed=42)

## Filter rows
drug_dataset.filter(lambda x: x["condition"] is not None)
drug_dataset.filter(lambda x: x["review_length"] > 30)          # "review_length" is added later via map() (section 2.2)
drug_dataset.filter(lambda x: x["sentence1"].startswith("Ar"))  # generic example; this dataset has no "sentence1" column
drug_dataset.filter(lambda example, idx: idx % 2 == 0, with_indices=True)  # with_indices=True also passes the row index

## Sort
drug_dataset["train"].sort("review_length")[:3]

2.2 Column operations

drug_dataset['train']

for col1, col2 in zip(drug_dataset['train']['drugName'][:5], drug_dataset['train']['rating'][:5]):
    print(f"drugName: {col1}, rating: {col2}")

## Remove columns (these operations return new objects rather than modifying in place)
drug_dataset.remove_columns(["drugName", "rating"])
## Select columns
drug_dataset.select_columns(['sentence1', 'sentence2', 'idx'])  # generic example column names
    
## Rename a column
drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)

## Modify column contents with .map()
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}
drug_dataset.map(lowercase_condition)

## Add a new column
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}
drug_dataset = drug_dataset.map(compute_review_length)

By default, map() calls the function once per example. With batched=True, the function receives a whole batch of examples (a dict of lists) at once, which allows vectorized processing and is usually much faster.
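As a minimal sketch of the batched calling convention (reusing the review_length example from above), the function receives a dict of lists and must return columns of the same length:

def compute_review_length_batched(batch):
    # batch["review"] is a list of strings covering the whole batch
    return {"review_length": [len(review.split()) for review in batch["review"]]}

drug_dataset = drug_dataset.map(compute_review_length_batched, batched=True)

The batched tokenization below uses the same mechanism: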

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./datasets/bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)
# the default batch_size is 1000
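
Both the batch size and the number of worker processes can be tuned through standard map() arguments (the values below are arbitrary, a sketch rather than a recommendation):

tokenized_dataset = drug_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=2000,  # number of examples handed to the function per call
    num_proc=4,       # number of parallel worker processes
)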

2.3 Setting the output format

# only changes the *output format* of the dataset
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

# slices of the dataset now come back as a pandas.DataFrame, so pandas operations can be applied
train_df = drug_dataset["train"][:]
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})  # note: pandas >= 2.0 already names these columns "condition" and "count"
)
frequencies.head()

# from datasets import Dataset
# freq_dataset = Dataset.from_pandas(frequencies)

# reset from 'pandas' to 'arrow'
drug_dataset.reset_format()

# set the output format to PyTorch tensors
drug_dataset.set_format("torch")
drug_dataset["train"][:1]
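
As an alternative to set_format(), which changes the output format in place, with_format() returns a new dataset object and leaves the original untouched (a minimal sketch):

# with_format() does not modify drug_dataset itself
torch_train = drug_dataset["train"].with_format("torch")
torch_train[0]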

2.4 Splitting

drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
drug_dataset_clean.keys()
# dict_keys(['train', 'test'])

drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
drug_dataset_clean.keys()
# dict_keys(['train', 'validation'])

drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean.keys()
# dict_keys(['train', 'validation', 'test'])

3. Saving

3.1 Saving in Arrow format

drug_dataset_clean.save_to_disk("datasets/demo-drug-reviews")
# demo-drug-reviews/
# ├── dataset_dict.json
# ├── test
# │   ├── dataset.arrow
# │   ├── dataset_info.json
# │   └── state.json
# ├── train
# │   ├── dataset.arrow
# │   ├── dataset_info.json
# │   ├── indices.arrow
# │   └── state.json
# └── validation
#     ├── dataset.arrow
#     ├── dataset_info.json
#     ├── indices.arrow
#     └── state.json

# Reload
from datasets import load_from_disk
drug_dataset_reloaded = load_from_disk("datasets/demo-drug-reviews")
drug_dataset_reloaded

3.2 Other formats

  • For JSON and other formats (CSV, Parquet, SQL), each split has to be saved separately; a CSV/Parquet sketch follows the JSONL example below
# Save
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")
    
# Reload
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)
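
The same per-split pattern works for the other formats via to_csv() and to_parquet() (a minimal sketch; the file names are arbitrary):

for split, dataset in drug_dataset_clean.items():
    dataset.to_csv(f"drug-reviews-{split}.csv")
    dataset.to_parquet(f"drug-reviews-{split}.parquet")

# reload with the matching builder, e.g. load_dataset("csv", data_files=...) or load_dataset("parquet", data_files=...)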