加州房價預測數據預處理

本文轉載自查看原文 2018-09-07 09:45 1808 pandas/ python/ visualization/ 機器學習/ machine learning

本文是該系列讀書筆記的第二章數據預處理部分
獲取數據
數據的初步分析，數據探索
地理分布
數據特征的相關性
創建新的特征
數據清洗，創建處理流水線

本文是該系列讀書筆記的第二章數據預處理部分

導入常用的數據分析庫

import pandas as pd
import numpy as np

import os 
import tarfile
from six.moves import urllib

獲取數據

# Base URL of the repository that hosts the dataset archive.
download_root = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Relative directory of the housing data (appended to the base URL below).
house_path = "datasets/housing"
# Full URL of the compressed housing archive.
housing_url = "".join([download_root, house_path, "/housing.tgz"])

def fecthing_housing_data(housing_url=housing_url,house_path=house_path):
    """Download the housing archive and unpack it into ``house_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        house_path: Local directory that receives the archive and its
            extracted contents; it is created when it does not exist yet.
    """
    if not os.path.exists(house_path):
        os.makedirs(house_path)
    tgz_path = os.path.join(house_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=house_path)

def load_housing_data(house_path=house_path):
    """Read ``housing.csv`` from ``house_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(house_path, "housing.csv"))

數據的初步分析，數據探索

# fecthing_housing_data()  # uncomment to download the data and extract the csv file
# Load the csv into a DataFrame and preview its first rows.
housing=load_housing_data()
housing.head()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value	ocean_proximity
0	-122.23	37.88	41.0	880.0	129.0	322.0	126.0	8.3252	452600.0	NEAR BAY
1	-122.22	37.86	21.0	7099.0	1106.0	2401.0	1138.0	8.3014	358500.0	NEAR BAY
2	-122.24	37.85	52.0	1467.0	190.0	496.0	177.0	7.2574	352100.0	NEAR BAY
3	-122.25	37.85	52.0	1274.0	235.0	558.0	219.0	5.6431	341300.0	NEAR BAY
4	-122.25	37.85	52.0	1627.0	280.0	565.0	259.0	3.8462	342200.0	NEAR BAY

housing.info()
# total_bedrooms has missing values.
# The first nine columns are floats: longitude, latitude, median housing age, total rooms,
# total bedrooms, population, households, median income and median house value.
# The last column, ocean_proximity (proximity to the ocean), has dtype object.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

# Check which categories ocean_proximity contains and how often each occurs.
housing['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Summary statistics for the numeric features.
housing.describe()

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
count	20640.000000	20640.000000	20640.000000	20640.000000	20433.000000	20640.000000	20640.000000	20640.000000	20640.000000
mean	-119.569704	35.631861	28.639486	2635.763081	537.870553	1425.476744	499.539680	3.870671	206855.816909
std	2.003532	2.135952	12.585558	2181.615252	421.385070	1132.462122	382.329753	1.899822	115395.615874
min	-124.350000	32.540000	1.000000	2.000000	1.000000	3.000000	1.000000	0.499900	14999.000000
25%	-121.800000	33.930000	18.000000	1447.750000	296.000000	787.000000	280.000000	2.563400	119600.000000
50%	-118.490000	34.260000	29.000000	2127.000000	435.000000	1166.000000	409.000000	3.534800	179700.000000
75%	-118.010000	37.710000	37.000000	3148.000000	647.000000	1725.000000	605.000000	4.743250	264725.000000
max	-114.310000	41.950000	52.000000	39320.000000	6445.000000	35682.000000	6082.000000	15.000100	500001.000000

# IPython magic: draw matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
# Plot a 50-bin histogram of every numeric feature to inspect its distribution.
housing.hist(bins=50,figsize=(20,15))
# plt.show()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000000179D4A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019A2A128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019A557B8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019A7AE48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019AAB518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019AAB550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000019B03278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019B29908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000019B53F98>]],
      dtype=object)

地理分布

# Scatter plot of the samples by geographic coordinates (longitude vs. latitude).
housing.plot(kind="scatter", x="longitude", y="latitude")

<matplotlib.axes._subplots.AxesSubplot at 0x19bbfcc0>

housing.plot(kind="scatter", x="longitude", y="latitude",alpha=0.4)
# alpha: optional scalar, default None; blending value between 0 (transparent) and 1 (opaque).
# With transparency the scatter plot reveals high-density areas: the darker a region,
# the more densely the data points are packed there.

<matplotlib.axes._subplots.AxesSubplot at 0x1a705b70>

# Marker size encodes population (divided by 50); marker colour encodes
# median_house_value through the "jet" colormap, with a colorbar shown.
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.4,
            s=housing['population']/50,label='population',
            c='median_house_value',cmap=plt.get_cmap("jet"),colorbar=True,
            figsize=(9,6))
# import matplotlib
# plt.figure(figsize=(15,9)) 
# sc=plt.scatter(housing['longitude'],housing['latitude'],alpha=0.4,
#             s=housing['population']/100,label='population',
#             c=housing['median_house_value'],cmap=plt.get_cmap("jet"))
# plt.legend()
# matplotlib.rcParams["font.sans-serif"]=["SimHei"]
# matplotlib.rcParams['axes.unicode_minus'] = False
# matplotlib.rcParams['font.size'] =15
# plt.xlabel('經度')
# plt.ylabel('緯度')
# color_bar=plt.colorbar(sc)
# color_bar.set_label('meidan_house_value')
# plt.show()
# The commented-out code above is the complete pyplot version: it labels the axes,
# adds the colorbar and sets Chinese axis titles.

<matplotlib.axes._subplots.AxesSubplot at 0x19ffb390>

# House prices are closely tied to location and population density. To describe the
# relationship between variables numerically, use the standard correlation
# coefficient; the most commonly used one is Pearson's correlation coefficient.
# Restrict the computation to numeric columns explicitly: ocean_proximity holds
# strings, and recent pandas versions raise instead of silently skipping such columns.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix

	longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
longitude	1.000000	-0.924664	-0.108197	0.044568	0.069608	0.099773	0.055310	-0.015176	-0.045967
latitude	-0.924664	1.000000	0.011173	-0.036100	-0.066983	-0.108785	-0.071035	-0.079809	-0.144160
housing_median_age	-0.108197	0.011173	1.000000	-0.361262	-0.320451	-0.296244	-0.302916	-0.119034	0.105623
total_rooms	0.044568	-0.036100	-0.361262	1.000000	0.930380	0.857126	0.918484	0.198050	0.134153
total_bedrooms	0.069608	-0.066983	-0.320451	0.930380	1.000000	0.877747	0.979728	-0.007723	0.049686
population	0.099773	-0.108785	-0.296244	0.857126	0.877747	1.000000	0.907222	0.004834	-0.024650
households	0.055310	-0.071035	-0.302916	0.918484	0.979728	0.907222	1.000000	0.013033	0.065843
median_income	-0.015176	-0.079809	-0.119034	0.198050	-0.007723	0.004834	0.013033	1.000000	0.688075
median_house_value	-0.045967	-0.144160	0.105623	0.134153	0.049686	-0.024650	0.065843	0.688075	1.000000

數據特征的相關性

import seaborn as sns
# Draw the correlation matrix as an annotated heatmap.
# plt.figure (lower case) creates the figure and makes it current, so the heatmap is
# drawn on it; plt.Figure would only instantiate a detached Figure object and the
# requested size would never apply to the plot.
plt.figure(figsize=(25,20))
hm=sns.heatmap(corr_matrix,cbar=True,annot=True,square=True,fmt='.2f',annot_kws={'size':9}, cmap="YlGnBu")
plt.show()

# The correlation coefficient ranges from -1 to 1.
# Close to 1 means a strong positive correlation; for example, the median house
# value rises as the median income rises.
# Close to -1 means a strong negative correlation; latitude and the median house
# value show a slight negative correlation (the further north, the more likely
# prices are to drop).
# Close to 0 means there is no linear correlation.
# The expression is the last statement of the cell so that the notebook displays
# the sorted correlations.
corr_matrix['median_house_value'].sort_values(ascending=False)

# pandas' scatter_matrix offers another way to inspect how several variables relate to each other.
from pandas.plotting import  scatter_matrix
attributes=['median_house_value',"median_income","total_bedrooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,9))
# sns.pairplot(housing[['median_house_value',"median_income",]],height=5)
# seaborn's pairplot (commented out above) can produce the same kind of result.
# Scatter plot of median income against median house value.
housing.plot(kind="scatter",x='median_income',y='median_house_value',alpha=0.2)

<matplotlib.axes._subplots.AxesSubplot at 0x1e3df9e8>

創建新的特征

重點關注收入的中位數與房屋價值的中位數之間的關系，從上圖以及相關系數都可以得到兩者之間存在很明顯的正相關
可以清晰地看到向上的趨勢，並且數據點不是非常分散，
我們之前統計得到的最高房價位於500000美元的水平線
從頻率分布直方圖hist可以看到housing_median_age ,median_house_value 具有長尾分布，可以嘗試對其進行log或者開根號等轉化
當然，不同項目的處理方法各不相同，但大體思路是相似的。

# Derive ratio features from the raw totals:
# rooms per household, bedrooms per room and population per household.
housing['rooms_per_household']=housing['total_rooms']/housing['households']
housing['bedrooms_per_room']= housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household']=housing['population']/housing['households']

# Recompute the correlations with the derived features included. Only numeric
# columns are used because ocean_proximity holds strings, which recent pandas
# versions refuse to correlate.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
# Among the new features, bedrooms_per_room has a clearly stronger negative
# correlation with the median house value: the lower the ratio, the higher the price.
# rooms_per_household is also more informative than the total number of rooms of a
# district: larger houses tend to be more expensive.

median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.151948
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
bedrooms_per_room          -0.255880
Name: median_house_value, dtype: float64

數據清洗，創建處理流水線

缺失值處理
處理object文本數據類型
特征放縮
構建模型pipeline
以上幾個步驟我們在之前的博客中基本上都已經用過，這里作為讀書筆記不會再過多的詳細解釋

# Handling the missing values of the total_bedrooms feature. Options:
# - drop the samples that contain missing values: dropna()
# - drop the features that contain missing values: dropna(axis=1)
# - fill the gaps (median, mean, 0, interpolation); with pandas this is simply
#   fillna(housing['total_bedrooms'].median())
try:
    # scikit-learn >= 0.20 ships SimpleImputer; the old Imputer class was
    # deprecated in 0.20 and removed in 0.22. Keep the name Imputer so that
    # code relying on it continues to work.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
imputer=Imputer(strategy='mean')
# The imputer only works on numeric data, so drop the text column first.
housing_num=housing.drop('ocean_proximity',axis=1)
imputer.fit(housing_num)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

# Wrap the imputed values in a DataFrame that reuses the original column names.
housing_num_trans=pd.DataFrame(imputer.transform(housing_num),columns=housing_num.columns)
housing_num_trans.info()
# The missing values are now filled in. For plain missing-value handling, pandas' fillna
# applied directly to the original data would save some time and remove the concern for later steps.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 12 columns):
longitude                   20640 non-null float64
latitude                    20640 non-null float64
housing_median_age          20640 non-null float64
total_rooms                 20640 non-null float64
total_bedrooms              20640 non-null float64
population                  20640 non-null float64
households                  20640 non-null float64
median_income               20640 non-null float64
median_house_value          20640 non-null float64
rooms_per_household         20640 non-null float64
bedrooms_per_room           20640 non-null float64
population_per_household    20640 non-null float64
dtypes: float64(12)
memory usage: 1.9 MB

# Handle the text (object dtype) column: encode each category as an integer label.
from sklearn.preprocessing import  LabelEncoder
encoder= LabelEncoder()
house_cat=housing['ocean_proximity']
house_cat_encode=encoder.fit_transform(house_cat)
house_cat_encode

array([3, 3, 3, ..., 1, 1, 1], dtype=int64)

# Categories learned by the encoder; the position of a category is its integer label.
encoder.classes_

array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)

在之前博客中也提到類似的操作，該操作可能會讓算法認為兩個臨近的值
比兩個疏遠的值更為相似，因此一般情況下，對於類標才會使用LabelEncoder，對於特征不會使用該方式進行轉換
更為常用的操作是獨熱編碼，給每個分類創建一個二元屬性，比如當分類是INLAND，有則是1，沒有則是0
sklearn中提供了編碼器OneHotEncoder，類似於pandas中pd.get_dummies()

from sklearn.preprocessing import OneHotEncoder
# In the scikit-learn version used here, OneHotEncoder only handles numeric data and only
# accepts 2-D arrays, hence the label-encoded values are reshaped into a single column.
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(house_cat_encode.reshape((-1,1)))
housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

# Convert the sparse one-hot matrix into a dense array for inspection.
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

# LabelBinarizer achieves the same one-hot encoding directly from the text categories.
from sklearn.preprocessing import  LabelBinarizer
encoder=LabelBinarizer()
housing_cat_1hot=encoder.fit_transform(house_cat)
housing_cat_1hot

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])

# Calling pandas.get_dummies() directly on the original data is the simplest approach.
pd.get_dummies(housing[['ocean_proximity']]).head()

	ocean_proximity_<1H OCEAN	ocean_proximity_INLAND	ocean_proximity_ISLAND	ocean_proximity_NEAR BAY	ocean_proximity_NEAR OCEAN
0	0	0	0	1	0
1	0	0	0	1	0
2	0	0	0	1	0
3	0	0	0	1	0
4	0	0	0	1	0

# 特征放縮：我們常用到的有MinMaxScaler和StandardScaler兩種
# 一般會對不同范圍內的特征進行放縮，有助於優化算法收斂的速度（尤其是針對梯度下降的優化算法）
# 歸一化： 減去最小值，然后除以最大最小值的差
# 標准化： 減去平均值，然后除以標准差，得到均值為0，方差為1的分布，受異常值影響比較小；決策樹和隨機森林不需要特征放縮
# 特征放縮一般針對訓練數據集進行fit_transform，對測試集數據只進行transform

# From splitting the dataset to the pipeline
from sklearn.model_selection import  train_test_split
housing=load_housing_data()
# train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)  # random sampling
from sklearn.model_selection import StratifiedShuffleSplit  # stratified sampling

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Bucket the median income into categories: ceil(income / 1.5), with every
# category of 5 or above merged into category 5.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the capped values back to the column. An in-place where() on the
# selected column is a chained update that does not reach the DataFrame when
# pandas Copy-on-Write is active.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

for train_index, test_index in split.split(housing, housing["income_cat"]):  # stratify on the income category
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
housing = strat_train_set.copy()  # work on a copy so the training set itself is not damaged

# Inspect the columns and non-null counts of the stratified training set.
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 11 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16354 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
median_house_value    16512 non-null float64
ocean_proximity       16512 non-null object
income_cat            16512 non-null float64
dtypes: float64(10), object(1)
memory usage: 1.5+ MB

# Transformation pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric pipeline: fill missing values with the median, then standardize.
num_pipeline=Pipeline([('imputer',Imputer(strategy='median')),('std_scaler',StandardScaler())])
# Separate the predictors from the label (median_house_value).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num=housing.drop('ocean_proximity',axis=1)
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_cat=housing['ocean_proximity']
housing_cat_tr= LabelBinarizer().fit_transform(housing_cat)
# Concatenate the transformed numeric and categorical parts column-wise.
housing_train=np.c_[housing_num_tr,housing_cat_tr]
housing_train.shape
# Numeric and categorical features cannot be transformed together; they have to be combined, e.g. with FeatureUnion.
# You give it a list of transformers (any transformers); when its transform() method is called,
# each transformer's transform() is run in parallel, the outputs are awaited, then concatenated and returned.
# Transforming in separate batches and merging the results with numpy is essentially the same;
# the test set still has to go through transform and then be merged into the prepared test set.

(16512, 14)

import os
import sys
# Make modules in the current working directory importable.
sys.path.append(os.getcwd())
# NOTE(review): future_encoders is presumably a local module that backports these
# classes from a newer scikit-learn release - confirm.
from future_encoders import ColumnTransformer
# This rebinds the name OneHotEncoder, replacing the class imported from sklearn.preprocessing.
from future_encoders import OneHotEncoder

# Column names handled by the numeric pipeline and by the categorical encoder.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Apply num_pipeline to the numeric columns and one-hot encoding to the text column.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         1.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         1.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

# Check that the ColumnTransformer output matches the manually assembled training matrix.
np.allclose(housing_prepared, housing_train)

True

后續內容已經放在github上，篇幅過大就只能把數據預處理的部分整理在這里，然后把后續的算法的實現部分整理在github中

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 cap2 加州房價預測模型使用sklearn進行數據挖掘-房價預測(4)—數據預處理短期負荷預測(三)數據集預處理 TensorFlow從1到2（六）結構化數據預處理和心臟病預測使用Tensorflow搭建回歸預測模型之二：數據准備與預處理數據預處理數據探索和預處理數據預處理數據預處理技術 weka數據預處理

加州房價預測數據預處理

本文是該系列讀書筆記的第二章數據預處理部分

獲取數據

數據的初步分析，數據探索

地理分布

數據特征的相關性

創建新的特征

數據清洗， 創建處理流水線

后續內容已經放在github上，篇幅過大就只能把數據預處理的部分整理在這里，然后把后續的算法的實現部分整理在github中

免責聲明！

數據清洗，創建處理流水線