When working with sequence datasets, the samples often have different lengths. Because the sizes are inconsistent, such samples cannot be loaded with the default collation of PyTorch's DataLoader (which simply stacks samples along the batch dimension).
One way to handle such a dataset is to record each sample's length in advance and pad every sample up to the length of the longest one, so that all samples share the same length and can be batched directly. The drawback is that, for example when modeling with an RNN, the padded zero values add unwanted influence on the model (see the articles linked in the references at the end).
PyTorch addresses this with torch.nn.utils.rnn.pack_padded_sequence() and torch.nn.utils.rnn.pad_packed_sequence(). pack_padded_sequence() takes the padded samples together with each original sequence length and packs them; during the forward pass the RNN then knows from the original lengths where each sample ends, so the padded zeros do not affect the computation. Afterwards, pad_packed_sequence() converts the output back to the padded format it had before packing.
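As a minimal sketch of how the two functions wrap an RNN call (the tensor sizes and the LSTM configuration below are made-up illustration values, not from the original post):

```python
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Two padded sequences with true lengths 4 and 2 (feature size 8 is arbitrary).
padded = torch.randn(2, 4, 8)          # (batch, max_len, feature)
lengths = torch.tensor([4, 2])

rnn = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)

packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
packed_out, _ = rnn(packed)            # the LSTM only steps through each sequence's real length
out, out_lens = pad_packed_sequence(packed_out, batch_first=True)
# out: (2, 4, 16); positions past each true length are zero-filled, out_lens equals lengths
```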
collate_fn
Another approach is to define a custom collate_fn and pass it to the DataLoader, which lets you control how samples are aggregated into a batch. Some examples follow.
The article linked in the references below walks through one such approach.
Example 1
Problem background
We want to use the Dataset and DataLoader classes of PyTorch to assemble variable-length sequences into batches (mainly padding sequences of different lengths), handling the variable-length data through a custom collate_fn.
Main idea
Dataset is responsible for reading individual samples and defining how they are indexed.
DataLoader is responsible for aggregating samples into batches.
Test environment: Python 3.6, PyTorch 1.2.0
Data layout:

The data directory contains the individual sample files.
For example, each sample file such as 1.json stores a dict with a 'feature' sequence and a 'label' sequence (the exact format can be seen in the dataset[0] printout below).
Define the Dataset class and index the data
Dataset class definition:
```python
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset


class time_series_dataset(Dataset):
    def __init__(self, data_root):
        """
        :param data_root: path to the dataset directory
        """
        self.data_root = data_root
        file_list = os.listdir(data_root)
        file_prefix = []
        for file in file_list:
            if '.json' in file:
                file_prefix.append(file.split('.')[0])
        file_prefix = list(set(file_prefix))
        self.data = file_prefix

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        prefix = self.data[index]
        with open(os.path.join(self.data_root, prefix + '.json'), 'r', encoding='utf-8') as f:
            data_dic = json.load(f)
        feature = torch.from_numpy(np.array(data_dic['feature']))
        label = torch.from_numpy(np.array(data_dic['label']))
        length = len(data_dic['feature'])
        sample = {'feature': feature, 'label': label, 'id': prefix, 'length': length}
        return sample
```
Here the Dataset wraps each sample's features, label, and sequence length into a dict and returns it.
Instantiating the dataset:
```python
dataset = time_series_dataset("./data/")  # "./data/" is the directory that stores the dataset files
```
A sample drawn from this dataset looks as follows.
For example, dataset[0]:
```
{'feature': tensor([17, 14, 16, 18, 14, 16], dtype=torch.int32),
 'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
         0], dtype=torch.int32),
 'id': '2',
 'length': 6}
```
Define the collate_fn and pass it to the DataLoader
Custom collate_fn:
```python
from torch.nn.utils.rnn import pad_sequence

def collate_func(batch_dic):
    batch_len = len(batch_dic)                                   # batch size
    max_seq_length = max([dic['length'] for dic in batch_dic])   # longest sample in this batch
    mask_batch = torch.zeros((batch_len, max_seq_length))        # mask
    fea_batch = []
    label_batch = []
    id_batch = []
    for i in range(len(batch_dic)):                              # collect feature, label, id, length per sample
        dic = batch_dic[i]
        fea_batch.append(dic['feature'])
        label_batch.append(dic['label'])
        id_batch.append(dic['id'])
        mask_batch[i, :dic['length']] = 1                        # mark the valid (unpadded) positions
    res = {}                                                     # pack everything into the dict res
    res['feature'] = pad_sequence(fea_batch, batch_first=True)
    res['label'] = pad_sequence(label_batch, batch_first=True)
    res['id'] = id_batch
    res['mask'] = mask_batch
    return res
```
The DataLoader hands collate_fn a list, i.e. the input of collate_func is a list of the samples returned by __getitem__.
Note: the mask field records which positions of each variable-length sequence are real data: padded positions are 0 and actual sequence positions are 1. The fields and format of the returned batch can be defined to suit your own needs.
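As one possible way to use this mask downstream (not part of the original post), a per-step loss can be averaged over the valid positions only; the per-step logits shape below is an assumption about the model:

```python
import torch.nn.functional as F

def masked_loss(logits, labels, mask):
    # logits: (batch, max_seq_length, num_classes) per-step predictions (hypothetical model output)
    # labels, mask: (batch, max_seq_length), as produced by collate_func above
    per_step = F.cross_entropy(logits.transpose(1, 2), labels.long(), reduction='none')  # (batch, seq)
    return (per_step * mask).sum() / mask.sum()  # average only over real (unpadded) positions
```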
The extraction of feature/label/id in this loop can arguably be written more concisely with map (the mask assignment still needs its own loop):
```python
for i in range(len(batch_dic)):
    dic = batch_dic[i]
    fea_batch.append(dic['feature'])
    label_batch.append(dic['label'])
    id_batch.append(dic['id'])
    mask_batch[i, :dic['length']] = 1
```
```python
fea_batch = list(map(lambda x: x['feature'], batch_dic))
label_batch = list(map(lambda x: x['label'], batch_dic))
id_batch = list(map(lambda x: x['id'], batch_dic))
```
Instantiating and calling the DataLoader:
```python
train_loader = DataLoader(dataset, batch_size=3, num_workers=1, shuffle=True, collate_fn=collate_func)
```
Complete example
```python
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm


class time_series_dataset(Dataset):
    def __init__(self, data_root):
        """
        :param data_root: path to the dataset directory
        """
        self.data_root = data_root
        file_list = os.listdir(data_root)
        file_prefix = []
        for file in file_list:
            if '.json' in file:
                file_prefix.append(file.split('.')[0])
        file_prefix = list(set(file_prefix))
        self.data = file_prefix

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        prefix = self.data[index]
        with open(os.path.join(self.data_root, prefix + '.json'), 'r', encoding='utf-8') as f:
            data_dic = json.load(f)
        feature = torch.from_numpy(np.array(data_dic['feature']))
        label = torch.from_numpy(np.array(data_dic['label']))
        length = len(data_dic['feature'])
        sample = {'feature': feature, 'label': label, 'id': prefix, 'length': length}
        return sample


def collate_func(batch_dic):
    batch_len = len(batch_dic)
    max_seq_length = max([dic['length'] for dic in batch_dic])
    mask_batch = torch.zeros((batch_len, max_seq_length))
    fea_batch = []
    label_batch = []
    id_batch = []
    for i in range(len(batch_dic)):
        dic = batch_dic[i]
        fea_batch.append(dic['feature'])
        label_batch.append(dic['label'])
        id_batch.append(dic['id'])
        mask_batch[i, :dic['length']] = 1
    res = {}
    res['feature'] = pad_sequence(fea_batch, batch_first=True)
    res['label'] = pad_sequence(label_batch, batch_first=True)
    res['id'] = id_batch
    res['mask'] = mask_batch
    return res


if __name__ == "__main__":
    dataset = time_series_dataset("./data/")
    batch_size = 3
    train_loader = DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=True, collate_fn=collate_func)
    for batch_idx, batch in tqdm(enumerate(train_loader), total=int(len(train_loader.dataset) / batch_size) + 1):
        inputs, labels, masks, ids = batch['feature'], batch['label'], batch['mask'], batch['id']
        break
```
The code above is meant as a reference, not as best practice.
Example 2
```python
from torch.nn.utils.rnn import pack_sequence
from torch.utils.data import DataLoader

def my_collate(batch):
    # batch contains a list of tuples of structure (sequence, target)
    data = [item[0] for item in batch]
    data = pack_sequence(data, enforce_sorted=False)
    targets = [item[1] for item in batch]
    return [data, targets]

# ...
# later in your code, when you define your DataLoader - use the custom collate function
loader = DataLoader(dataset,
                    batch_size,
                    shuffle,
                    collate_fn=my_collate,  # use the custom collate function here
                    pin_memory=True)
```
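The PackedSequence returned by my_collate can be fed directly into an RNN; a rough sketch of the consuming side (the LSTM sizes and the loop body are hypothetical, not from the original answer):

```python
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence

# input_size/hidden_size are placeholder values; input_size must match the feature dim of your sequences.
lstm = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)

for data, targets in loader:
    packed_out, (h_n, c_n) = lstm(data)                               # accepts the PackedSequence as-is
    out, lengths = pad_packed_sequence(packed_out, batch_first=True)  # back to a padded (batch, max_len, hidden) tensor
```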
Example 3
Padding along an arbitrary dimension
I wrote some simple code that maybe someone here can re-use. I wanted something that pads a generic dim, and I don’t use an RNN of any type, so PackedSequence was a bit of overkill for me. It’s simple, but it works for me.
```python
import torch

def pad_tensor(vec, pad, dim):
    """
    args:
        vec - tensor to pad
        pad - the size to pad to
        dim - dimension to pad

    return:
        a new tensor padded to 'pad' in dimension 'dim'
    """
    pad_size = list(vec.shape)
    pad_size[dim] = pad - vec.size(dim)
    return torch.cat([vec, torch.zeros(*pad_size)], dim=dim)


class PadCollate:
    """
    a variant of collate_fn that pads according to the longest sequence in
    a batch of sequences
    """

    def __init__(self, dim=0):
        """
        args:
            dim - the dimension to be padded (dimension of time in sequences)
        """
        self.dim = dim

    def pad_collate(self, batch):
        """
        args:
            batch - list of (tensor, label)

        return:
            xs - a tensor of all examples in 'batch' after padding
            ys - a LongTensor of all labels in batch
        """
        # find longest sequence
        max_len = max(map(lambda x: x[0].shape[self.dim], batch))
        # pad according to max_len
        batch = [(pad_tensor(x, pad=max_len, dim=self.dim), y) for x, y in batch]
        # stack all
        xs = torch.stack([x[0] for x in batch], dim=0)
        ys = torch.LongTensor([x[1] for x in batch])
        return xs, ys

    def __call__(self, batch):
        return self.pad_collate(batch)
```
to be used with the data loader:
```python
train_loader = DataLoader(ds, ..., collate_fn=PadCollate(dim=0))
```
Example 4
If you are going to pack your padded sequences later, you can also immediately sort the batch from longest sequence to shortest. (This is quite practical, since a pack step usually does follow.)
```python
import numpy as np
import torch

def sort_batch(batch, targets, lengths):
    """
    Sort a minibatch by the length of the sequences, longest sequences first, and
    return the sorted batch, targets, and sequence lengths.
    This way the output can be used by pack_padded_sequence(...)
    """
    seq_lengths, perm_idx = lengths.sort(0, descending=True)
    seq_tensor = batch[perm_idx]
    target_tensor = targets[perm_idx]
    return seq_tensor, target_tensor, seq_lengths

def pad_and_sort_batch(DataLoaderBatch):
    """
    DataLoaderBatch should be a list of (sequence, target, length) tuples.
    Returns a padded tensor of sequences sorted from longest to shortest.
    """
    batch_size = len(DataLoaderBatch)
    batch_split = list(zip(*DataLoaderBatch))

    seqs, targs, lengths = batch_split[0], batch_split[1], batch_split[2]
    max_length = max(lengths)

    padded_seqs = np.zeros((batch_size, max_length))
    for i, l in enumerate(lengths):
        padded_seqs[i, 0:l] = seqs[i][0:l]

    return sort_batch(torch.tensor(padded_seqs), torch.tensor(targs).view(-1, 1), torch.tensor(lengths))
```
This assumes your Dataset's __getitem__ has the following form:
```python
def __getitem__(self, idx):
    return self.sequences[idx], torch.tensor(self.targets[idx]), self.sequence_lengths[idx]
```
Then pass the pad_and_sort collator to the DataLoader:
```python
train_gen = Data.DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=pad_and_sort_batch)
```
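Because pad_and_sort_batch already orders each batch from longest to shortest, the result can go straight into pack_padded_sequence with the default enforce_sorted=True. A rough training-loop fragment as a sketch (adding a trailing feature dimension with unsqueeze is an assumption about the data layout, not part of the original answer):

```python
from torch.nn.utils.rnn import pack_padded_sequence

for seqs, targets, lengths in train_gen:
    # seqs: (batch, max_len), already sorted longest-first
    inputs = seqs.float().unsqueeze(-1)                       # assumed: the RNN expects a feature dim
    packed = pack_padded_sequence(inputs, lengths, batch_first=True)
    # ... feed `packed` into an nn.LSTM / nn.GRU as usual
```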
Example 5
```python
def collate_fn_padd(batch):
    '''
    Pads a batch of variable-length sequences.

    note: it converts things to tensors manually here since the ToTensor transform
    assumes it takes in images rather than arbitrary tensors.
    (`device` is assumed to be defined elsewhere; pad_sequence is used with its
    default batch_first=False, so the padded batch has shape (max_len, batch, ...).)
    '''
    ## get sequence lengths
    lengths = torch.tensor([t.shape[0] for t in batch]).to(device)
    ## pad
    batch = [torch.Tensor(t).to(device) for t in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch)
    ## compute mask
    mask = (batch != 0).to(device)
    return batch, lengths, mask
```
References:
https://blog.csdn.net/lrs1353281004/article/details/106129660
https://discuss.pytorch.org/t/dataloader-for-various-length-of-data/6418
