#!/usr/bin/env python3
### Author: huangning ###
## Email: tonyandrewhn@126.com ##
#################################
import warnings
import time
import sys
import datetime
import json
import pandas as pd
import numpy as np
from collections import Counter

from sel_def_logger import MyLog


class bdci():
    """Interactive missing-value filling for a set of CSV datasets.

    The instance stores the paths of the bank / internet training sets and
    the public test set, a mapping from work-year labels to integers, and the
    project logger (``MyLog().logger``) used by every method.

    NOTE(review): the driver script at the bottom of this file calls
    ``DataReader``, ``getCommon_cols``, ``getleft_cols``,
    ``getCommon_colsdata``, ``dateTransformer`` and ``propMapping``. None of
    them is defined in the part of the class visible here -- confirm they
    exist elsewhere before running the script.
    """

    def __init__(self):
        """Set dataset paths, the work-year mapping and the logger."""
        self.train_bank_path = 'D:\\huangning\\DataSet\\個貸違約預測\\train_dataset\\train_public.csv'
        self.train_internet_path = 'D:\\huangning\\DataSet\\個貸違約預測\\train_dataset\\train_internet.csv'
        self.testData_path = 'D:\\huangning\\DataSet\\個貸違約預測\\test_public.csv'
        self.work_year_map = {
            '10+ years': 10,
            '2 years': 2,
            '< 1 year': 0,
            '3 years': 3,
            '1 year': 1,
            '5 years': 5,
            '4 years': 4,
            '6 years': 6,
            '8 years': 8,
            '7 years': 7,
            '9 years': 9,
        }
        self.logging = MyLog().logger

    @staticmethod
    def _coerce_fill_value(series, raw_value):
        """Convert the typed fill value to a number when the column is numeric.

        ``input()`` always returns text; writing text into a numeric column
        would silently turn the whole column into ``object`` dtype. Values
        that cannot be parsed are returned unchanged.
        """
        if not pd.api.types.is_numeric_dtype(series):
            return raw_value
        for cast in (int, float):
            try:
                return cast(raw_value)
            except ValueError:
                continue
        return raw_value

    def dataset_Fillnan(self, train_data, columns):
        """Fill the missing values of one column, asking the user how.

        The user is prompted for a fill method; only fixed-value filling
        (answer ``"0"``) is implemented.

        Args:
            train_data: DataFrame to fill; the column is replaced on this
                object, so the caller's frame is modified.
            columns: name of the single column to fill.

        Returns:
            ``train_data`` after a fixed-value fill, or a message string when
            the chosen fill method is not available yet.
        """
        func_name = sys._getframe().f_code.co_name
        col_label = str(columns)
        self.logging.info("bdci." + func_name + ".service MSG: ------------------------- 開始對數據集缺失值填充...... ------------------------")
        fillType = input("請選擇您需要缺失值填充方式(eg: ['固定值填充':'0','前置值填充':'1']:")

        if fillType == "0":
            series = train_data[columns]
            # value_counts() skips NaN and counts in vectorised code. The
            # previous Counter(...) walked every cell in Python and, because
            # NaN never compares equal to itself, could not group the missing
            # entries -- the likely cause of the slowdown noted for the
            # sparse "f0..." columns.
            suggestions = list(series.value_counts().head(3).items())
            fixed_value = input("請輸入" + col_label + "列固定填充值:(eg: 推薦值:" + str(suggestions) + "):")
            self.logging.info("bdci." + func_name + "service MSG: 已獲取缺失值填充參數:{'缺失值填充方法':'固定值填充','固定值':" + fixed_value + "},准備開始缺失值填充......")

            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selection: the in-place form works
            # on a temporary and is not guaranteed to reach the frame.
            train_data[columns] = series.fillna(self._coerce_fill_value(series, fixed_value))

            # Success means "no nulls left"; `.empty` only tests for a
            # zero-length column and says nothing about nulls.
            if train_data[columns].isnull().any():
                self.logging.error("bdci." + func_name + "service MSG: 數據集列" + col_label + "缺失值以固定值方式填充失敗,請查看原因!")
            else:
                self.logging.info("bdci." + func_name + "service MSG: 數據集列" + col_label + "缺失值以固定值方式填充成功!")
            return train_data

        # input() returns text, so the option must be compared as "1".
        if fillType == "1":
            return "前值填充法 暫未開放......"
        return "其他填充法 暫未開放......"

    def dataset_FillBatch(self, DataSetName):
        """Run ``dataset_Fillnan`` on every column that contains nulls.

        Args:
            DataSetName: DataFrame whose columns are inspected one by one.

        Returns:
            The DataFrame after processing. It is returned even when no
            column needed filling or when a fill was skipped because the
            chosen method is unavailable.
        """
        func_name = sys._getframe().f_code.co_name
        self.logging.info("bdci." + func_name + "service MSG: ---------------- 開始批量化替換空缺值 ---------------")

        train_data = DataSetName
        # Snapshot the column labels: columns are reassigned while looping.
        for cols in list(DataSetName.columns):
            if not train_data[cols].isnull().any():
                self.logging.info("bdci." + func_name + "service MSG: 當前列" + str(cols) + "沒有空值,繼續遍歷......")
                continue

            result = self.dataset_Fillnan(train_data, cols)
            if isinstance(result, pd.DataFrame):
                train_data = result
            else:
                # dataset_Fillnan answered with a "not available" message;
                # keep the frame so the caller still receives a DataFrame.
                self.logging.error("bdci." + func_name + "service MSG: 數據集列" + str(cols) + "未完成填充:" + str(result))

        self.logging.info("bdci." + func_name + "service MSG: 數據集列遍歷完成......")
        return train_data


# Instantiate the class.
# NOTE(review): this rebinds the module-level name `bdci` from the class to
# the instance; kept so that existing references to `bdci.<method>` still work.
bdci = bdci()

# Load the datasets.
train_bank = bdci.DataReader("train_bank")
train_internet = bdci.DataReader("train_internet")
test_public = bdci.DataReader("test_public")

# Keep the label column name consistent across datasets.
train_bank.rename(columns={'isDefault':'is_default'},inplace=True)

# Column names shared by both training sets.
common_cols = bdci.getCommon_cols(train_bank,train_internet)
print(len(common_cols))

# Column names that are not shared.
train_internet_left = bdci.getleft_cols("train_internet")
train_bank_left = bdci.getleft_cols("train_bank")

# Datasets restricted to the shared columns.
train1_data = bdci.getCommon_colsdata("train_internet")
train2_data = bdci.getCommon_colsdata("train_bank")
test_data = bdci.getCommon_colsdata("test_public")

# Convert date columns to a format pandas recognises.
train1_data = bdci.dateTransformer(train1_data)
train2_data = bdci.dateTransformer(train2_data)

bdci.logging.info("開始處理數據填充---------- 數據集train1_data.csv -----------")
train1_data_filled = bdci.dataset_FillBatch(train1_data)
train1_data_mapped = bdci.propMapping(train1_data_filled)
train1_data_mapped.to_csv('data\\train1_data.csv', sep=',', header=True, index=True)

# The message used to repeat "train1_data.csv" here (copy-paste slip).
bdci.logging.info("開始處理數據填充---------- 數據集train2_data.csv -----------")
train2_data_filled = bdci.dataset_FillBatch(train2_data)
train2_data_mapped = bdci.propMapping(train2_data_filled)
train2_data_mapped.to_csv('data\\train2_data.csv', sep=',', header=True, index=True)
主要涉及的方法是 dataset_FillBatch() 和 dataset_Fillnan()。
