1. Collator数据处理

目的:将dataset的初始数据进行规范化批量处理,用于后续的前向计算

```
# Start from dataset (sequences may have different lengths)
Dataset({
    features: ['input_ids'],
    num_rows: 5
})

# End with encoded batch input (BatchEncoding格式)
{'input_ids': tensor([[350, 241, 345, 705, 695,   1, 427, 645,  99, 943,   0,   0,   0,   0],
                      [196, 464, 546, 626, 413,   1, 973,  98, 824,   1, 410,   0,   0,   0],
                      [475, 665,   1, 164, 306, 788,  53, 562, 232, 216, 252, 990,   0,   0],
                      [  1, 966, 734, 897, 171, 357, 217, 850, 529, 895, 728, 234, 799,   0],
                      [713,  76,   1, 428, 913, 890, 143, 992, 832, 963, 555,  18, 354, 455]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([[-100, -100, -100, -100, -100,  716, -100, -100, -100, -100, -100, -100, -100, -100],
                   [-100, -100, -100, -100, -100,  665, -100, -100, -100,  686, -100, -100, -100, -100],
                   [-100, -100,   56, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
                   [ 619,  966, -100, -100, -100,  357, -100, -100, -100, -100, -100, -100, -100, -100],
                   [-100, -100,  218, -100, -100, -100, -100, -100, -100,  963, -100, -100, -100, -100]])}
```

常见的关键字段包括:
...