1. Converting a list into a dictionary
# Inside a data-loading class: read the category names from cat.txt (one per line)
# and map each category to an integer id. Assumes `import os` and that
# self.raw_data points to the raw data directory.
self.cat_list = []
with open(os.path.join(self.raw_data, "cat.txt")) as f:
    for line in f.readlines():
        self.cat_list.append(line.strip())
# zip the category names with 0..N-1 to build the label dictionary
self.label_dict = dict(zip(self.cat_list, range(len(self.cat_list))))
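The same zip pattern works outside a class as well. A minimal standalone sketch (the category names below are made up for illustration, not from the original data):

# Standalone sketch of the list-to-dict pattern above.
# The category names here are hypothetical examples.
cat_list = ["sports", "finance", "technology"]
label_dict = dict(zip(cat_list, range(len(cat_list))))
print(label_dict)  # {'sports': 0, 'finance': 1, 'technology': 2}

# The reverse mapping (id -> category) is often handy at prediction time:
id_to_cat = {idx: cat for cat, idx in label_dict.items()}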
2. Building a vocabulary (dictionary) for NLP
import re
from collections import Counter


def remove_1a(content):
    """Keep only Chinese characters, letters and digits; strip punctuation and whitespace."""
    chinese = '[\u4e00-\u9fa5a-zA-Z0-9]+'
    str1 = re.findall(chinese, content)
    return ''.join(str1)


def read_file(filename):
    """Read the data file: each line is '<label> <content>'."""
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                label, content = line.split(" ")
                if content:
                    content = remove_1a(content)
                    contents.append(list(content))  # character-level tokens
                    labels.append(label)
            except ValueError:
                # skip malformed lines that do not split into exactly two fields
                pass
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the vocabulary from the training set and save it to vocab_dir."""
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        all_data.extend(content)
    counter = Counter(all_data)
    # keep the most common characters, dropping those that occur 2 times or fewer
    count_pairs = [pair for pair in counter.most_common(vocab_size - 1) if pair[1] > 2]
    words, _ = list(zip(*count_pairs))
    # add a <PAD> token so all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(words) + '\n')
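Once the vocabulary file is written, it is usually loaded back and turned into a word-to-id dictionary with the same zip pattern as in section 1. A minimal sketch under that assumption; the read_vocab helper and the file paths are illustrative, not part of the original code:

def read_vocab(vocab_dir):
    """Hypothetical helper: load the saved vocabulary and build a word -> id dictionary."""
    with open(vocab_dir, mode='r', encoding='utf-8', errors='ignore') as f:
        words = [line.strip() for line in f if line.strip()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


if __name__ == '__main__':
    # paths are placeholders; adjust to your own data layout
    build_vocab('data/train.txt', 'data/vocab.txt', vocab_size=5000)
    words, word_to_id = read_vocab('data/vocab.txt')
    # <PAD> was written first, so it maps to id 0
    print(word_to_id['<PAD>'])  # 0

Because <PAD> occupies id 0, padded positions in fixed-length sequences map to index 0, which keeps them easy to mask out later.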