Using torchtext with PyTorch


General workflow for torchtext: https://www.cnblogs.com/cxq1126/p/13466998.html#_label9

1. Using the pretrained word vectors that torchtext supports by default

By default, the corresponding pretrained word-vector file is downloaded automatically into the .vector_cache directory under the current folder; .vector_cache is the default directory for both the word-vector files and their cache files.

from torchtext.vocab import GloVe
from torchtext import data

TEXT = data.Field(sequential=True)

# The two calls below are equivalent ways to attach the GloVe 6B 300d vectors.
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
TEXT.build_vocab(train, vectors="glove.6B.300d")
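Words that appear in the training data but not in GloVe receive all-zero vectors by default. If you prefer to initialize them randomly, the legacy torchtext Vocab accepts an unk_init callable; a minimal sketch, assuming the same TEXT and train as above:

import torch

# Sketch: initialize words missing from GloVe with N(0, 1) instead of zeros.
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300), unk_init=torch.Tensor.normal_)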

2. Using externally pretrained word vectors

Download the Chinese word vectors sgns.wiki.word from https://github.com/Embedding/Chinese-Word-Vectors.

The name parameter specifies the path of the pretrained word-vector file.
By default, both the pretrained word-vector file and the cache files live in the .vector_cache directory under the current directory. Even if name points at a pretrained file stored somewhere else, the cache location is unchanged, so a .vector_cache directory still has to exist in the current directory.

import os
from torchtext.vocab import Vectors

if not os.path.exists('.vector_cache'):
    os.mkdir('.vector_cache')
vectors = Vectors(name='sgns.wiki.word')
TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)
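If you do not want a .vector_cache directory in the working directory at all, Vectors also accepts a cache argument that sets the cache location explicitly (a sketch; the directory name below is just an example):

# Sketch: keep the downloaded/cached vectors in an explicitly chosen directory.
vectors = Vectors(name='sgns.wiki.word', cache='./my_vector_cache')  # './my_vector_cache' is a hypothetical path
TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)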

The embedding is still initialized the same way:

pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)  # torch.Size([1727, 300])
model.src_embed[0].lut.weight.data.copy_(pretrained_embedding)
print('Embedding initialized')
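model.src_embed[0].lut is specific to that particular model; for a plain model you can build the embedding layer straight from the vocabulary's vectors instead (a minimal sketch):

import torch.nn as nn

# Sketch: create an embedding layer initialized with the pretrained vectors
# and keep it trainable (freeze=True would keep it fixed).
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)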

Reference: https://blog.csdn.net/leo_95/article/details/87708267

3. Document-level text classification: each document is split by length into three segments that share a single vocabulary

One text_field can be bound to several attributes: text1, text2, and text3.

import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors

def read_data(data_path, text_field, label_field, split=3, overlap=0):
    # One shared text_field bound to text1..text{split}, plus the label field.
    fields = []
    for i in range(1, split+1):
        fields.append(('text'+str(i), text_field))
    fields.append(('label', label_field))

    examples = []

    with open(data_path) as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for idx, line in enumerate(reader):
            # Concatenate all text columns of the row into one document.
            text = ""
            for tx in line[1:]:
                text += tx
                text += " "

            # Tokenize once, then cut the document into `split` segments
            # that overlap by `overlap` tokens.
            word_tokens = word_tokenize(text)
            len_text = len(word_tokens)
            document_encode = []
            for i in range(split):
                len_true = int((len_text + overlap*(split-1)) / split)   # true length of each segment
                len_rel = len_true - overlap
                doc = word_tokens[i*len_rel : (i+1)*len_rel + overlap]
                document_encode.append(doc)

            label = int(line[0])
            document_encode.append(label)
            examples.append(tdata.Example.fromlist(document_encode, fields))
    return examples, fields

def data_doc_iter(train_path, test_path, text_field, label_field, batch_size, embedding_dim=50):
    train_examples, train_fields = read_data(train_path, text_field, label_field)
    test_examples, test_fields = read_data(test_path, text_field, label_field)

    train_dataset = tdata.Dataset(train_examples, train_fields)
    test_dataset = tdata.Dataset(test_examples, test_fields)

    # Build the shared text vocabulary (and the label vocabulary) from the training set only.
    text_field.build_vocab(train_dataset, vectors=GloVe(name='6B', dim=embedding_dim))
    label_field.build_vocab(train_dataset)

    train_iter = tdata.Iterator(train_dataset, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    test_iter = tdata.Iterator(test_dataset, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    vocabulary = text_field.vocab
    return train_iter, test_iter, vocabulary

It is called as follows:

import torch

text_field = tdata.Field(tokenize=lambda x: word_tokenize(x), lower=True, fix_length=512, batch_first=True)
label_field = tdata.LabelField(dtype=torch.int)
train_iter, test_iter, vocabulary = data_doc_iter("./data/IMDB_new/train_shuffle.csv", "./data/IMDB_new/test_new.csv",
                                                  text_field, label_field, batch_size=8)

for batch in train_iter:
    print(batch.text1.shape)
    print(batch.text2.shape)
    print(batch.text3.shape)
    print(batch.label)
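Because text1, text2, and text3 are bound to the same Field, they share one vocabulary, so indices from any of the three can be mapped back to words with the same vocab object (a quick sanity check, assuming the iterator above):

# Sketch: decode the first example of text1 back into tokens via the shared vocab.
batch = next(iter(train_iter))
tokens = [vocabulary.itos[idx.item()] for idx in batch.text1[0]]
print(' '.join(tokens))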

If you want a different fix_length for each text segment, the text fields can differ:

fields = [('text_en', text_field), ('text_ch', text_field), ('text', text_field2), ('label', label_field)]
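For instance (a sketch; the field names and lengths here are only illustrative), text_en and text_ch share one Field and therefore one vocabulary, while text gets its own Field with its own fix_length:

# Sketch: two Fields with different fixed lengths; each distinct Field needs its own build_vocab call.
text_field  = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=256, batch_first=True)
text_field2 = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=512, batch_first=True)
fields = [('text_en', text_field), ('text_ch', text_field), ('text', text_field2), ('label', label_field)]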

4. Document-level text classification: each document is split by length into three segments, with a separate vocabulary for each segment

That is, the first segment of every document uses the first vocabulary, the second segment uses the second vocabulary, and the third segment uses the third vocabulary.

Each vocabulary is built by randomly taking half of the words in glove.6B.50d.txt (under .vector_cache) as a new set of word vectors, saved as cibiao1.txt, cibiao2.txt, and cibiao3.txt respectively, which constructs random feature subspaces.
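One simple way to produce such files is to read glove.6B.50d.txt and write a random half of its lines into each new file; the snippet below is only a sketch of that idea (paths and file names as described above):

import random

# Sketch: sample half of the GloVe vocabulary into each file,
# giving every document segment its own random word subspace.
with open('.vector_cache/glove.6B.50d.txt', encoding='utf-8') as f:
    lines = f.readlines()

for out_name in ['cibiao1.txt', 'cibiao2.txt', 'cibiao3.txt']:
    sampled = random.sample(lines, len(lines) // 2)
    with open(out_name, 'w', encoding='utf-8') as out:
        out.writelines(sampled)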

import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors

def read_split_data(data_path, text_fields, label_fields, split=3, overlap=0):
    # One (text, label) field pair per segment, so each segment gets its own vocab.
    field1, field2, field3 = [], [], []
    field1.append(('text', text_fields[0]))
    field1.append(('label', label_fields[0]))
    field2.append(('text', text_fields[1]))
    field2.append(('label', label_fields[1]))
    field3.append(('text', text_fields[2]))
    field3.append(('label', label_fields[2]))

    examp1, examp2, examp3 = [], [], []

    with open(data_path) as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for idx, line in enumerate(reader):
            # Concatenate all text columns of the row into one document.
            text = ""
            for tx in line[1:]:
                text += tx
                text += " "

            # Tokenize once, then cut the document into `split` overlapping segments.
            word_tokens = word_tokenize(text)
            len_text = len(word_tokens)
            document_encode = []
            for i in range(split):
                len_true = int((len_text + overlap*(split-1)) / split)   # true length of each segment
                len_rel = len_true - overlap
                doc = word_tokens[i*len_rel : (i+1)*len_rel + overlap]
                document_encode.append(doc)

            label = int(line[0])

            doc1, doc2, doc3 = [], [], []
            doc1.append(document_encode[0])
            doc1.append(label)
            examp1.append(tdata.Example.fromlist(doc1, field1))

            doc2.append(document_encode[1])
            doc2.append(label)
            examp2.append(tdata.Example.fromlist(doc2, field2))

            doc3.append(document_encode[2])
            doc3.append(label)
            examp3.append(tdata.Example.fromlist(doc3, field3))

    return examp1, examp2, examp3, field1, field2, field3


def data_docsplit_iter(train_path, test_path, text_fields, label_fields, batch_size):
    train1_examp, train2_examp, train3_examp, field1, field2, field3 = read_split_data(train_path, text_fields, label_fields)
    test1_examp, test2_examp, test3_examp, tfield1, tfield2, tfield3 = read_split_data(test_path, text_fields, label_fields)

    # Build one vocabulary per segment from the training data.
    train1_data = tdata.Dataset(train1_examp, field1)
    train2_data = tdata.Dataset(train2_examp, field2)
    train3_data = tdata.Dataset(train3_examp, field3)

    vectors1 = Vectors(name='cibiao1.txt')
    vectors2 = Vectors(name='cibiao2.txt')
    vectors3 = Vectors(name='cibiao3.txt')

    text_fields[0].build_vocab(train1_data, vectors=vectors1)
    text_fields[1].build_vocab(train2_data, vectors=vectors2)
    text_fields[2].build_vocab(train3_data, vectors=vectors3)

    label_fields[0].build_vocab(train1_data)
    label_fields[1].build_vocab(train2_data)
    label_fields[2].build_vocab(train3_data)

    test1_data = tdata.Dataset(test1_examp, tfield1)
    test2_data = tdata.Dataset(test2_examp, tfield2)
    test3_data = tdata.Dataset(test3_examp, tfield3)

    train_iter1 = tdata.Iterator(train1_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    train_iter2 = tdata.Iterator(train2_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    train_iter3 = tdata.Iterator(train3_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)

    test_iter1 = tdata.Iterator(test1_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    test_iter2 = tdata.Iterator(test2_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)
    test_iter3 = tdata.Iterator(test3_data, batch_size=batch_size, shuffle=False, sort=False, sort_within_batch=False, repeat=False)

    vocabulary1, vocabulary2, vocabulary3 = text_fields[0].vocab, text_fields[1].vocab, text_fields[2].vocab
    return train_iter1, train_iter2, train_iter3, test_iter1, test_iter2, test_iter3, vocabulary1, vocabulary2, vocabulary3

It is called as follows:

import torch

SENTENCE_LIMIT_SIZE = 512
DATAPATH = './data/IMDB_new/'

text_fields, label_fields = [], []
for i in range(3):
    text_fields.append(tdata.Field(tokenize=lambda x: word_tokenize(x), lower=True, fix_length=SENTENCE_LIMIT_SIZE, batch_first=True))
    label_fields.append(tdata.LabelField(dtype=torch.int))

train_iter1, train_iter2, train_iter3, test_iter1, test_iter2, test_iter3, vocabulary1, vocabulary2, vocabulary3 = data_docsplit_iter(DATAPATH + "train_shuffle.csv", DATAPATH + "test_new.csv",
                                                                                                            text_fields, label_fields, batch_size=4)
print('vocabulary1.vectors.shape = ', vocabulary1.vectors.shape)
print('vocabulary2.vectors.shape = ', vocabulary2.vectors.shape)
print('vocabulary3.vectors.shape = ', vocabulary3.vectors.shape)
for i, batch in enumerate(zip(train_iter1, train_iter2, train_iter3)):
    print(batch[0].text)
    print(batch[0].label)
    print(batch[1].text)
    print(batch[1].label)
    break
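Since every segment has its own vocabulary and vector matrix, the model also needs one embedding layer per segment; a minimal sketch, assuming the three vocabularies returned above:

import torch.nn as nn

# Sketch: one trainable embedding layer per segment, each initialized from its own vocab.
embed1 = nn.Embedding.from_pretrained(vocabulary1.vectors, freeze=False)
embed2 = nn.Embedding.from_pretrained(vocabulary2.vectors, freeze=False)
embed3 = nn.Embedding.from_pretrained(vocabulary3.vectors, freeze=False)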

 

