pandas缺失值填充時遇到的問題

本文轉載自查看原文 2020-03-18 23:28 869 pandas/ 填充/ 數據分析/ ffill/ 報錯

打比賽時，遇到了一個問題。填充空白值的時候，如果使用固定值，均值啥的都沒問題。
但是我想用

.fillna(method='pad',axis=0,inplace=True)

但是每次都是報錯

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-191-1252788aaf79> in <module>
----> 1 sh_car1.fillna(method='pad',axis=0,inplace=True)

C:\Anaconda3\lib\site-packages\pandas\core\frame.py in fillna(self, value, method, axis, inplace, limit, downcast, **kwargs)
   4242             limit=limit,
   4243             downcast=downcast,
-> 4244             **kwargs
   4245         )
   4246 

C:\Anaconda3\lib\site-packages\pandas\core\generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
   6235                 inplace=inplace,
   6236                 coerce=True,
-> 6237                 downcast=downcast,
   6238             )
   6239         else:

C:\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in interpolate(self, **kwargs)
    567 
    568     def interpolate(self, **kwargs):
--> 569         return self.apply("interpolate", **kwargs)
    570 
    571     def shift(self, **kwargs):

C:\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
    436                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
    437 
--> 438             applied = getattr(b, f)(**kwargs)
    439             result_blocks = _extend_blocks(applied, result_blocks)
    440 

C:\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in interpolate(self, method, axis, index, values, inplace, limit, limit_direction, limit_area, fill_value, coerce, downcast, **kwargs)
   1172                 fill_value=fill_value,
   1173                 coerce=coerce,
-> 1174                 downcast=downcast,
   1175             )
   1176         # validate the interp method

C:\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _interpolate_with_fill(self, method, axis, inplace, limit, fill_value, coerce, downcast)
   1226             limit=limit,
   1227             fill_value=fill_value,
-> 1228             dtype=self.dtype,
   1229         )
   1230         values = self._try_coerce_result(values)

C:\Anaconda3\lib\site-packages\pandas\core\missing.py in interpolate_2d(values, method, axis, limit, fill_value, dtype)
    481     method = clean_fill_method(method)
    482     if method == "pad":
--> 483         values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
    484     else:
    485         values = transf(

C:\Anaconda3\lib\site-packages\pandas\core\missing.py in pad_2d(values, limit, mask, dtype)
    546 
    547     if np.all(values.shape):
--> 548         algos.pad_2d_inplace(values, mask, limit=limit)
    549     else:
    550         # for test coverage

pandas\_libs\algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()

TypeError: No matching signature found

經過千辛萬苦終於找到了問題的根源。

原來，我在加載數據的時候使用了一個壓縮內存的函數

# 減少內存使用
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                print('column name :',col)
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

這里面產生了一種新的數據類型 np.float16
而這種類型，在pandas里是沒有的。
pandas里面只有的float類型。

np.float32類型都沒有問題。

所以在填充的時候就會報錯。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python—關於Pandas缺失值問題(國內唯一) 缺失值的常見填充方法 spss缺失值填充步驟 python 特征缺失值填充 Pandas系列教程（6）Pandas缺失值處理 pandas 缺失值、重復值的處理與值的替換 python中使用flask時遇到的markupsafe._compat包缺失的問題與解決 pandas numpy處理缺失值，none與nan比較 XGBoost缺失值引發的問題及其深度分析 [Python] Pandas 對數據進行查找、替換、篩選、排序、重復值和缺失值處理