如何读取中文文本信息
修改源码中的 DataProcessor 类:继承该类并实现自定义的 SimProcessor
指定文件路径
读取文本内容
文本信息预处理
将每行文本按制表符(\t)切分,分别得到 id、text、label
返回由 InputExample 组成的数据列表(data)
具体源码如下:
class SimProcessor(DataProcessor):
    """Processor for the Sim task.

    Reads the train/dev splits from tab-separated text files and the test
    split from a CSV file, turning every row into an ``InputExample`` with
    a single text segment (``text_b`` is always ``None``).
    """

    def _load_tsv_examples(self, file_path, set_type):
        """Build examples from a UTF-8, tab-separated text file.

        Each non-blank line is split on tabs; column 1 is used as the text
        and column 2 as the label. Column 0 is not read (presumably an id
        column -- confirm against the data files).

        Args:
            file_path: Path of the file to read.
            set_type: Prefix for the example guid, e.g. ``"train"``.

        Returns:
            A list of ``InputExample`` objects, one per non-blank line.

        Raises:
            IndexError: If a non-blank line has fewer than three columns.
        """
        examples = []
        # The context manager guarantees the handle is closed even when a
        # malformed line raises part-way through the file.
        with open(file_path, 'r', encoding='utf-8') as f:
            for index, raw_line in enumerate(f):
                stripped = raw_line.rstrip('\n')
                if not stripped:
                    # Blank lines carry no example; skip them instead of
                    # failing on the column lookup below.
                    continue
                columns = stripped.split('\t')
                guid = '%s-%d' % (set_type, index)
                text_a = tokenization.convert_to_unicode(str(columns[1]))
                label = str(columns[2])
                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=None,
                                 label=label))
        return examples

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set.

        Reads ``train_sentiment.txt`` inside ``data_dir``; guids have the
        form ``train-<line index>``.
        """
        file_path = os.path.join(data_dir, 'train_sentiment.txt')
        return self._load_tsv_examples(file_path, 'train')

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set.

        Reads ``test_sentiment.txt`` inside ``data_dir``; guids have the
        form ``dev-<line index>``.
        """
        file_path = os.path.join(data_dir, 'test_sentiment.txt')
        return self._load_tsv_examples(file_path, 'dev')

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set.

        Reads ``test.csv`` inside ``data_dir`` with pandas; column 0 of
        each row is used as the text and column 1 as the label. Guids have
        the form ``test-<row index>``.
        """
        file_path = os.path.join(data_dir, 'test.csv')
        test_df = pd.read_csv(file_path, encoding='utf-8')
        test_data = []
        for index, row in enumerate(test_df.values):
            guid = 'test-%d' % index
            text_a = tokenization.convert_to_unicode(str(row[0]))
            label = str(row[1])
            test_data.append(
                InputExample(guid=guid, text_a=text_a, text_b=None,
                             label=label))
        return test_data

    def get_labels(self):
        """Return the label set for this task as strings."""
        return ['0', '1', '2']
项目运行参数配置如下:
