Python數據分析(二)pandas缺失值處理


import pandas as pd
import numpy as np

# Walk-through of pandas missing-value handling: detecting, dropping and
# filling NaN values in a Series and in a DataFrame.
#
# NOTE: the frame is filled with random numbers, so the numeric values in the
# "Sample output" comments below are illustrative and differ on every run.
# Only the positions of the NaN rows ('b', 'd', 'g') are fixed.

# Build a 5x3 frame of random normals, then reindex it onto a longer label
# list. The labels 'b', 'd' and 'g' are absent from the original index, so
# reindex() inserts all-NaN rows for them; those are the missing values that
# every step below operates on.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)

# ---------------------------------------------------------------------------
# Detecting missing values
# ---------------------------------------------------------------------------
print('################缺失值判斷######################')
print('--------Series的缺失值判斷---------')
# Boolean mask over one column: True where the value is NaN.
print(df['one'].isnull())
# Sample output:
# a    False
# b     True
# c    False
# d     True
# e    False
# f    False
# g     True
# h    False
# Name: one, dtype: bool

print('---------輸出Series缺失值和索引--------')
# Use the mask to keep only the missing entries together with their labels.
print(df['one'][df['one'].isnull()])
# Sample output:
# b   NaN
# d   NaN
# g   NaN
# Name: one, dtype: float64

print('--------dataframe的缺失值判斷---------')
# Element-wise mask with the same shape as the frame.
print(df.isnull())
# Sample output:
#      one    two  three
# a  False  False  False
# b   True   True   True
# c  False  False  False
# d   True   True   True
# e  False  False  False
# f  False  False  False
# g   True   True   True
# h  False  False  False

print('--------輸出dataframe的缺失值和索引---------')
# Reduce the element-wise mask to one boolean per row (True when the row has
# at least one NaN). Indexing with this 1-D mask returns each affected row
# exactly once; indexing with the 2-D mask array instead would repeat a row
# once per NaN cell and require de-duplicating the index afterwards.
data = df[df.isnull().any(axis=1)]
print(data)
# Sample output:
#    one  two  three
# b  NaN  NaN    NaN
# d  NaN  NaN    NaN
# g  NaN  NaN    NaN

print('--------輸出dataframe的有缺失值的列---------')
# any() reduces down each column: True if that column has at least one NaN.
print(df.isnull().any())
# Sample output:
# one      True
# two      True
# three    True
# dtype: bool

# ---------------------------------------------------------------------------
# Dropping missing values
# ---------------------------------------------------------------------------
print('################缺失值過濾######################')
print('--------Series的缺失值過濾---------')
# Show the mask again so the effect of dropna() below can be compared to it.
print(df['one'].isnull())
# Sample output:
# a    False
# b     True
# c    False
# d     True
# e    False
# f    False
# g     True
# h    False
# Name: one, dtype: bool

print('--------使用dropna方法刪除缺失數據,返回一個刪除后的Series--------')
# dropna() returns a new Series without the NaN entries; df is not modified.
print(df['one'].dropna())
# Sample output:
# a   -0.211055
# c   -0.870090
# e   -0.203259
# f    0.490568
# h    1.437819
# Name: one, dtype: float64

print('--------dataframe的缺失值過濾---------')
# Default how="any": a row is dropped if it contains any NaN.
print(df.dropna())
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# c -0.870090 -0.878423  1.071588
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# h  1.437819 -0.370934 -0.482307

print('-------當行全為NaN的時候,才刪除,參數how默認是any,含有缺失值就刪除--------')
# how="all": a row is dropped only when every value in it is NaN. In this
# frame the NaN rows are entirely NaN, so the result matches how="any".
print(df.dropna(how="all"))
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# c -0.870090 -0.878423  1.071588
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# h  1.437819 -0.370934 -0.482307

# ---------------------------------------------------------------------------
# Filling missing values
# ---------------------------------------------------------------------------
print('################缺失值填充######################')
print('------指定特殊值填充缺失值-------')
# Replace every NaN with a single constant.
print(df.fillna(0))
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# b  0.000000  0.000000  0.000000
# c -0.870090 -0.878423  1.071588
# d  0.000000  0.000000  0.000000
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# g  0.000000  0.000000  0.000000
# h  1.437819 -0.370934 -0.482307

print('------不同的列用不同的值填充------')
# A dict maps column name -> fill value, so each column gets its own constant.
print(df.fillna({'one': 1, 'two': 2, 'three': 3}))
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# b  1.000000  2.000000  3.000000
# c -0.870090 -0.878423  1.071588
# d  1.000000  2.000000  3.000000
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# g  1.000000  2.000000  3.000000
# h  1.437819 -0.370934 -0.482307

print('------前向填充------')
# Forward fill: each NaN takes the last valid value above it. ffill() is used
# instead of fillna(method="ffill"), whose `method` keyword is deprecated.
print(df.ffill())
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# b -0.211055 -2.869212  0.022179
# c -0.870090 -0.878423  1.071588
# d -0.870090 -0.878423  1.071588
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# g  0.490568 -0.968058 -0.999899
# h  1.437819 -0.370934 -0.482307

print('------后向填充------')
# Backward fill: each NaN takes the next valid value below it. bfill() is
# used instead of the deprecated fillna(method="bfill").
print(df.bfill())
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# b -0.870090 -0.878423  1.071588
# c -0.870090 -0.878423  1.071588
# d -0.203259  0.315897  0.495306
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# g  1.437819 -0.370934 -0.482307
# h  1.437819 -0.370934 -0.482307

print('------平均值填充------')
# df.mean() yields one mean per column (NaN values are skipped); passing that
# Series to fillna() fills each column's gaps with the column's own mean.
print(df.fillna(df.mean()))
# Sample output:
#         one       two     three
# a -0.211055 -2.869212  0.022179
# b  0.128797 -0.954146  0.021373
# c -0.870090 -0.878423  1.071588
# d  0.128797 -0.954146  0.021373
# e -0.203259  0.315897  0.495306
# f  0.490568 -0.968058 -0.999899
# g  0.128797 -0.954146  0.021373
# h  1.437819 -0.370934 -0.482307


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM