python 缺失值填充——固定值填充法


#!/usr/bin/env python3
###    Author: huangning      ###
## Email: tonyandrewhn@126.com ##
#################################
import warnings
import time
import sys
import datetime
import json
import pandas as pd
import numpy as np

from collections import Counter
#sys.path.append("D:\\huangning\\自用腳本\\bdci-dev\\sef_def_logger.py")
from sel_def_logger import MyLog
#import sef_def_logger
# warnings.filterwarnings('ignore')

class bdci():
    """Interactive missing-value filling helpers for pandas DataFrames.

    The instance carries the dataset locations, a label-to-number mapping for
    the "work year" field and the logger used by every method.
    """

    def __init__(self):
        # Hard-coded Windows locations of the train/test CSV files.
        self.train_bank_path = 'D:\\huangning\\DataSet\\個貸違約預測\\train_dataset\\train_public.csv'
        self.train_internet_path = 'D:\\huangning\\DataSet\\個貸違約預測\\train_dataset\\train_internet.csv'
        self.testData_path = 'D:\\huangning\\DataSet\\個貸違約預測\\test_public.csv'
        # Maps the textual work-year labels to integers ('< 1 year' -> 0, '10+ years' -> 10).
        self.work_year_map = {'10+ years': 10, '2 years': 2, '< 1 year': 0, '3 years': 3, '1 year': 1, '5 years': 5, '4 years': 4, '6 years': 6, '8 years': 8, '7 years': 7, '9 years': 9}
        self.logging = MyLog().logger

    @staticmethod
    def _coerce_fill_value(column, value):
        """Turn a typed-in fill value into a number when *column* is numeric.

        ``input()`` always yields a string; filling a numeric column with the
        string ``"0"`` would silently turn it into a mixed-type object column.
        Values that do not parse as a number are returned unchanged.
        """
        if not isinstance(value, str) or not pd.api.types.is_numeric_dtype(column):
            return value
        text = value.strip()
        for caster in (int, float):
            try:
                return caster(text)
            except ValueError:
                continue
        return value

    def dataset_Fillnan(self, train_data, columns, fillType=None, fixed_value=None):
        """Fill the missing values of one column of *train_data*.

        Args:
            train_data: DataFrame to modify; the column is replaced in place.
            columns: label of the single column to fill.
            fillType: fill method, "0" = fixed value, "1" = previous value
                (not implemented yet). Prompted for on stdin when ``None``.
            fixed_value: value used by the fixed-value method. Prompted for
                on stdin when ``None``.

        Returns:
            *train_data* itself after a fixed-value fill, otherwise a status
            string saying that the requested method is not available yet.
        """
        prefix = "bdci." + sys._getframe().f_code.co_name
        self.logging.info(prefix + ".service MSG: ------------------------- 開始對數據集缺失值填充...... ------------------------")
        if fillType is None:
            fillType = input("請選擇您需要缺失值填充方式(eg: ['固定值填充':'0','前置值填充':'1']:")
        # input() returns text, so compare as text (also accepts ints passed by callers).
        fillType = str(fillType).strip()

        if fillType == "0":
            ### fixed value fill
            if fixed_value is None:
                # value_counts() skips NaN and is vectorized; a Counter over the
                # raw column counts every NaN as its own key and loops in Python.
                suggestions = list(train_data[columns].value_counts().head(3).items())
                fixed_value = input("請輸入" + str(columns) + "列固定填充值:(eg: 推薦值:" + str(suggestions) + "):")
            fixed_value = self._coerce_fill_value(train_data[columns], fixed_value)
            self.logging.info(prefix + "service MSG: 已獲取缺失值填充參數:{'缺失值填充方法':'固定值填充','固定值':" + str(fixed_value) + "},准備開始缺失值填充......")
            # Assign the result back: an in-place fillna on the selected column
            # is chained assignment and may leave the frame untouched.
            train_data[columns] = train_data[columns].fillna(fixed_value)
            if train_data[columns].isnull().any():
                # Nulls are still present after the fill.
                self.logging.error(prefix + "service MSG: 數據集列" + str(columns) +"缺失值以固定值方式填充失敗,請查看原因!")
            else:
                self.logging.info(prefix + "service MSG: 數據集列" + str(columns) +"缺失值以固定值方式填充成功!")
            return train_data
        elif fillType == "1":
            ### previous-value fill (not available yet)
            return "前值填充法 暫未開放......"
        else:
            ### any other fill method (not available yet)
            return "其他填充法 暫未開放......"

    def dataset_FillBatch(self, DataSetName):
        """Fill every column of *DataSetName* that contains missing values.

        Each column with at least one null is handed to ``dataset_Fillnan``,
        which prompts for the fill method and value. Columns without nulls
        are only logged.

        Args:
            DataSetName: DataFrame to process; it is modified in place.

        Returns:
            The same DataFrame, also when no column needed filling.
        """
        prefix = "bdci." + sys._getframe().f_code.co_name
        self.logging.info(prefix + "service MSG: ---------------- 開始批量化替換空缺值 ---------------")
        # NOTE (original author, "bug1"): filling reportedly becomes slow
        # starting from column f0.
        for cols in DataSetName.columns:
            if DataSetName[cols].isnull().any():
                outcome = self.dataset_Fillnan(DataSetName, cols)
                if isinstance(outcome, str):
                    # The chosen method is not implemented: the column stays
                    # unfilled, so surface the status message instead of
                    # returning it in place of the DataFrame.
                    self.logging.warning(prefix + "service MSG: " + outcome)
            else:
                self.logging.info(prefix + "service MSG: 當前列" + str(cols) + "沒有空值,繼續遍歷......")
        self.logging.info(prefix + "service MSG: 數據集列遍歷完成......")
        return DataSetName

### Instantiate the class.
### NOTE: this rebinds the module-level name `bdci` from the class to an
### instance; every call below goes through that instance.
bdci = bdci()

### Load the training and test datasets
train_bank = bdci.DataReader("train_bank")
train_internet = bdci.DataReader("train_internet")
test_public = bdci.DataReader("test_public")

### Keep field names consistent between the datasets
train_bank.rename(columns={'isDefault':'is_default'},inplace=True)

### Column names shared by both training datasets
common_cols = bdci.getCommon_cols(train_bank,train_internet)
print(len(common_cols))

### Column names that are not shared
train_internet_left = bdci.getleft_cols("train_internet")
train_bank_left = bdci.getleft_cols("train_bank")

### Datasets restricted to the shared fields
train1_data = bdci.getCommon_colsdata("train_internet")
train2_data = bdci.getCommon_colsdata("train_bank")
test_data = bdci.getCommon_colsdata("test_public")

### Convert dates into a format pandas recognizes
train1_data = bdci.dateTransformer(train1_data)
train2_data = bdci.dateTransformer(train2_data)

### Fill, map and export the first training set
bdci.logging.info("開始處理數據填充---------- 數據集train1_data.csv -----------")
train1_data_filled = bdci.dataset_FillBatch(train1_data)
train1_data_mapped = bdci.propMapping(train1_data_filled)
train1_data_mapped.to_csv('data\\train1_data.csv', sep=',', header=True, index=True)

### Fill, map and export the second training set.
### The log message now names train2_data; it previously repeated train1_data.
bdci.logging.info("開始處理數據填充---------- 數據集train2_data.csv -----------")
train2_data_filled = bdci.dataset_FillBatch(train2_data)
train2_data_mapped = bdci.propMapping(train2_data_filled)
train2_data_mapped.to_csv('data\\train2_data.csv', sep=',', header=True, index=True)

主要是方法:

dataset_FillBatch() 和
dataset_Fillnan()
 
        

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM