python(十):列表轉換成字典


一、列表轉換成字典

        # Collect the category names, one per line, from cat.txt under
        # self.raw_data (surrounding whitespace and the newline are stripped).
        self.cat_list = []
        with open(os.path.join(self.raw_data, "cat.txt")) as cat_file:
            self.cat_list.extend(row.strip() for row in cat_file.readlines())
        # Map every category name to its zero-based position in cat_list.
        self.label_dict = {name: idx for idx, name in enumerate(self.cat_list)}

二、NLP生成字典

def remove_1a(content):
    """Return *content* with everything except CJK ideographs, ASCII letters
    and ASCII digits removed.

    Characters in U+4E00..U+9FA5, ``a-z``, ``A-Z`` and ``0-9`` are kept in
    their original order; punctuation, whitespace and any other character
    are dropped.
    """
    kept_runs = re.finditer('[\u4e00-\u9fa5a-zA-Z0-9]+', content)
    return ''.join(run.group(0) for run in kept_runs)


def read_file(filename):
    """Read a labelled text file into per-character contents and labels.

    Every line is split on a two-space separator into ``label`` and
    ``content``. The content is cleaned with ``remove_1a`` and stored as a
    list of its individual characters.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A ``(contents, labels)`` tuple of two parallel lists: ``contents``
        holds one list of characters per accepted line, ``labels`` holds the
        matching label strings.

    Lines that do not split into exactly two fields, or whose content field
    is empty, are skipped.
    """
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                label, content = line.split("  ")
            except ValueError:
                # Not exactly one two-space separator on this line: skip it.
                # Only the unpacking error is caught so that genuine bugs
                # (and KeyboardInterrupt) are no longer silently swallowed.
                continue
            if content:
                contents.append(list(remove_1a(content)))
                labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training set and save it.

    The training file is read with ``read_file``, every character is
    counted, and the most frequent characters are written to ``vocab_dir``
    one per line, preceded by a ``<PAD>`` entry.

    Args:
        train_dir: Path of the training data file (passed to ``read_file``).
        vocab_dir: Path of the vocabulary file to write (UTF-8).
        vocab_size: Upper bound on the vocabulary size including ``<PAD>``;
            at most ``vocab_size - 1`` characters are taken from the corpus.

    Only characters that occur more than twice are kept, so the written
    vocabulary may be smaller than ``vocab_size``. If no character
    qualifies, the file contains just ``<PAD>``.
    """
    data_train, _ = read_file(train_dir)

    # Count characters sample by sample; this avoids materialising one huge
    # flat list of the whole corpus while yielding the same counts and the
    # same first-seen ordering for ties.
    counter = Counter()
    for content in data_train:
        counter.update(content)

    # One slot is reserved for <PAD>; rare characters (count <= 2) are dropped.
    # A comprehension (rather than unpacking zip(*pairs)) keeps this safe
    # when nothing passes the frequency filter.
    words = [
        word
        for word, count in counter.most_common(vocab_size - 1)
        if count > 2
    ]
    # <PAD> is added so that all texts can be padded to the same length.
    words = ['<PAD>'] + words
    # The context manager guarantees the file is flushed and closed.
    with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(words) + '\n')

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM