def split_train_test(text_df, size=0.8, labels=(0, 1, 2, 3)):
    """Split a labelled DataFrame into training and test sets, class by class.

    Each label is processed separately so that every class contributes the
    same proportion of its rows to the training set and to the test set.
    The split is positional, not random: within each class the first
    ``size`` fraction of rows (in their existing order) goes to the training
    set and the remaining rows go to the test set.

    Args:
        text_df: DataFrame with a ``'label'`` column.
        size: Fraction of each class placed in the training set
            (default 0.8, i.e. an 80% / 20% split). The per-class row count
            is ``math.floor(n_rows * size)``.
        labels: Iterable of label values to include. Rows whose label is not
            listed are left out of both results. Defaults to ``(0, 1, 2, 3)``.

    Returns:
        A ``(train_text_df, test_text_df)`` tuple. Both frames are ordered
        label by label and have a fresh 0-based index. Because the index is
        reset without ``drop=True`` (once per class and once at the end),
        each frame also carries the resulting index columns ahead of the
        input columns (normally ``level_0`` and ``index``, the latter holding
        the row's index in ``text_df``).
    """
    train_parts = []
    test_parts = []

    for label in labels:
        # Select this label's rows and renumber them from 0 so the class can
        # be cut by position; the previous index is kept as a column.
        text_df_w_label = text_df[text_df['label'] == label].reset_index()

        # Number of rows of this class that belong to the training set.
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)

        train_parts.append(text_df_w_label.iloc[:split_line_no, :])
        test_parts.append(text_df_w_label.iloc[split_line_no:, :])

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every call. pd.concat rejects an
    # empty list, so fall back to an empty frame when no labels were given.
    train_text_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
    test_text_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

    train_text_df = train_text_df.reset_index()
    test_text_df = test_text_df.reset_index()
    return train_text_df, test_text_df