如何讀取中文文本信息
繼承源碼中的 DataProcessor 類，編寫自定義的 Processor
指定文件路徑
讀取文本內容
文本信息預處理
將每行數據拆分為 id、text、label 三個部分
返回data
具體源碼如下:
class SimProcessor(DataProcessor):
    """Processor for the Sim task.

    Builds `InputExample` lists for the train, dev and test splits from
    files located in a caller-supplied data directory, and declares the
    label set used by the task.
    """

    def _read_tab_separated(self, file_path, set_type):
        """Read a tab-separated text file into a list of `InputExample`s.

        Each non-empty line is split on tabs; column 1 is used as the text
        and column 2 as the label (column 0 is not used).

        Args:
            file_path: Path of the UTF-8 encoded file to read.
            set_type: Prefix for the generated guid, e.g. 'train' or 'dev'.

        Returns:
            A list of `InputExample` objects, one per non-empty line.

        Raises:
            IndexError: If a non-empty line has fewer than three columns.
        """
        examples = []
        # The context manager guarantees the file handle is closed.
        with open(file_path, 'r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                line = line.replace('\n', '')
                if not line:
                    # Skip empty lines (typically a trailing newline at EOF)
                    # instead of failing on a missing column.
                    continue
                fields = line.split('\t')
                # guid keeps the 0-based line number so it is unique per file.
                guid = '%s-%d' % (set_type, index)
                text_a = tokenization.convert_to_unicode(str(fields[1]))
                label = str(fields[2])
                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
                )
        return examples

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set.

        Reads `train_sentiment.txt` from `data_dir`; guids have the form
        'train-<line index>'.
        """
        file_path = os.path.join(data_dir, 'train_sentiment.txt')
        return self._read_tab_separated(file_path, 'train')

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set.

        Reads `test_sentiment.txt` from `data_dir`; guids have the form
        'dev-<line index>'.
        """
        file_path = os.path.join(data_dir, 'test_sentiment.txt')
        return self._read_tab_separated(file_path, 'dev')

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for prediction.

        Reads `test.csv` from `data_dir` with pandas; column 0 is used as
        the text and column 1 as the label. Guids have the form
        'test-<row index>'.
        """
        file_path = os.path.join(data_dir, 'test.csv')
        test_df = pd.read_csv(file_path, encoding='utf-8')
        test_data = []
        for index, test in enumerate(test_df.values):
            guid = 'test-%d' % index
            text_a = tokenization.convert_to_unicode(str(test[0]))
            label = str(test[1])
            test_data.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
            )
        return test_data

    def get_labels(self):
        """Gets the list of labels for this data set."""
        return ['0', '1', '2']
項目運行參數配置如下:

