賽題數據鏈接 https://tianchi.aliyun.com/competition/entrance/231784/information
字典
Field | Description |
---|---|
SaleID | 交易ID,唯一編碼 |
name | 汽車交易名稱,已脫敏 |
regDate | 汽車注冊日期,例如20160101,2016年01月01日 |
model | 車型編碼,已脫敏 |
brand | 汽車品牌,已脫敏 |
bodyType | 車身類型:豪華轎車:0,微型車:1,廂型車:2,大巴車:3,敞篷車:4,雙門汽車:5,商務車:6,攪拌車:7 |
fuelType | 燃油類型:汽油:0,柴油:1,液化石油氣:2,天然氣:3,混合動力:4,其他:5,電動:6 |
gearbox | 變速箱:手動:0,自動:1 |
power | 發動機功率:范圍 [ 0, 600 ] |
kilometer | 汽車已行駛公里,單位萬km |
notRepairedDamage | 汽車有尚未修復的損壞:是:0,否:1 |
regionCode | 地區編碼,已脫敏 |
seller | 銷售方:個體:0,非個體:1 |
offerType | 報價類型:提供:0,請求:1 |
creatDate | 汽車上線時間,即開始售賣時間 |
price | 二手車交易價格(預測目標) |
v系列特征 | 匿名特征,包含 v_0 至 v_14 共 15 個匿名特征 |
step1:導入模塊
## 基礎工具 import numpy as np import pandas as pd import warnings import matplotlib import matplotlib.pyplot as plt import seaborn as sns from scipy.special import jn from IPython.display import display, clear_output import time warnings.filterwarnings('ignore') %matplotlib inline ## 模型預測的 from sklearn import linear_model from sklearn import preprocessing from sklearn.svm import SVR from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor ## 數據降維處理的 from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA import lightgbm as lgb import xgboost as xgb ## 參數搜索和評價的 from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error
step2:數據讀取
# Load the training set and the test set; both files are space-delimited.
train_csv = 'F:\\python\\天池_二手車交易價格預測\\used_car_train_20200313.csv'
test_csv = 'F:\\python\\天池_二手車交易價格預測\\used_car_testB_20200421.csv'
train_data = pd.read_csv(train_csv, sep=' ')
test_data = pd.read_csv(test_csv, sep=' ')
1.數據量大小
# Report the (rows, columns) shape of each loaded frame.
for label, frame in (('Train data shape:', train_data),
                     ('TestA data shape:', test_data)):
    print(label, frame.shape)
Train data shape: (150000, 31)
TestA data shape: (50000, 30)
2.數據瀏覽
# Preview the first three rows followed by the last three rows.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# slices in the same order and works on old and new pandas alike.
pd.concat([train_data.head(3), train_data.tail(3)])
3.數據信息查看info()
# info() shows each column's dtype and non-null count, which reveals
# where values are missing.
train_data.info()
4.查看列名
# The .columns attribute lists the column labels of the frame.
train_data.columns
5.數據統計瀏覽
# Summary statistics of the training frame via describe().
train_data.describe()
剩下的不復制過來了
step3:缺失值
# Number of missing values in each column.
train_data.isnull().sum()

# Fraction of rows that are missing in each column.
train_data.isnull().sum() / len(train_data)

# Bar chart of the missing counts: keep only columns that actually have
# missing values and order them from fewest to most.
missing = train_data.isnull().sum()
columns_with_gaps = missing[missing > 0]
columns_with_gaps.sort_values().plot.bar()
查看其他類型的空值,如 '-'
# Print the value distribution of every column, to spot placeholder
# values (such as '-') that stand in for missing data.
for _, column in train_data.items():
    print(column.value_counts())
發現 notRepairedDamage 中用 '-' 表示空值:
# Treat the '-' placeholder in notRepairedDamage as a real missing value.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection operates on an intermediate object and is not
# guaranteed to update train_data (it is a no-op under copy-on-write).
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].replace('-', np.nan)
seller 和 offerType 的取值嚴重傾斜,對因變量(price)沒有意義,可以刪除
# Remove the two heavily skewed columns from the training frame in place.
train_data.drop(columns=["seller", "offerType"], inplace=True)
step4:y值的分布
# Plot the distribution of the target (price) in two separate figures.
price = train_data['price']

# Figure 1: plain histogram.
plt.figure(1)
price.plot(kind='hist')

# Figure 2: seaborn distribution plot.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version before upgrading.
plt.figure(2)
sns.distplot(price)
價格不符合正態分布
step5:特征分析
1.區分類別特征和數字特征
# Approach 1: split the columns by dtype.
# Numeric columns.
numeric_features = train_data.select_dtypes(include=[np.number])
numeric_features.columns
# Object-dtype (categorical) columns. The builtin `object` is used because
# the `np.object` alias was removed in NumPy 1.24.
categorical_features = train_data.select_dtypes(include=[object])
categorical_features.columns

# Approach 2 (the one used from here on): split according to the data dictionary.
numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
                    'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
                    'v_13', 'v_14']

# Keep only the categorical columns that still exist in train_data:
# 'seller' and 'offerType' are deleted earlier, and indexing train_data
# with a missing column name raises KeyError in the loops that follow.
categorical_features = [
    name
    for name in ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox',
                 'notRepairedDamage', 'regionCode', 'seller', 'offerType']
    if name in train_data.columns
]
查看每個類別特征有多少個nunique分布
# For every categorical feature, print how many distinct values it has
# and how often each value occurs.
for feature in categorical_features:
    series = train_data[feature]
    print(feature + '特征分布如下:')
    print('{}特征有{}個不同的值'.format(feature, series.nunique()))
    print(series.value_counts())
數字特征
# Correlation analysis between the numeric features and the target.
# 'price' is added to the feature list so it takes part in the correlation
# matrix. The membership check keeps the cell safe to re-run: a second
# unconditional append would duplicate the 'price' column and break
# corr['price'].sort_values() below.
if 'price' not in numeric_features:
    numeric_features.append('price')
corr = train_data[numeric_features].corr()
# Correlation of every numeric feature with price, strongest first,
# followed by a blank line.
print(corr['price'].sort_values(ascending=False), '\n')
畫熱力圖
# Heat map of the correlation matrix.
sns.heatmap(data=corr)
查看數字特征的偏度和峰度
# Print the skewness and kurtosis of every numeric feature.
# The print call must be indented under the for statement; left at column 0
# it is an IndentationError.
for i in numeric_features:
    column = train_data[i]
    print('{}'.format(i),
          '偏度:{:05.2f}'.format(column.skew()),
          ' ',
          '峰度:{:05.2f}'.format(column.kurt()))
數字特征可視化
# Method 1: melt the numeric features into long form and draw one
# distribution plot per feature on a two-column facet grid.
f = pd.melt(train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, 'value')

# Method 2: one subplot per feature on a 9x2 grid; the resulting figure
# is rather crowded.
for position, col in enumerate(numeric_features, start=1):
    plt.subplot(9, 2, position)
    sns.distplot(train_data[col])
# Pairwise relationships between price and a selection of numeric features.
columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
# `height` is the current name of the per-facet size argument; the old
# `size` keyword was deprecated in seaborn 0.9 and later removed.
sns.pairplot(train_data[columns], height=2)
變量和y的回歸關系可視化
# Regression plots of price against ten selected features on a 5x2 grid.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(24, 20))

# Features in plotting order.
regression_features = ['v_12', 'v_8', 'v_0', 'power', 'v_5',
                       'v_2', 'v_6', 'v_1', 'v_14', 'v_13']

# axes.flat walks the grid left to right, top to bottom, so each feature
# lands on the next free panel. A single loop replaces ten copy-pasted
# regplot stanzas.
for feature, ax in zip(regression_features, axes.flat):
    sns.regplot(x=feature, y='price', data=train_data[[feature, 'price']], ax=ax)
# Number of distinct values of each categorical feature.
for feature in categorical_features:
    distinct_count = train_data[feature].nunique()
    print('{}: 有 {} 個不重復的值'.format(feature, distinct_count))
類別特征可視化
# Box plots of price for each categorical feature.
# The nunique() output shows that name and regionCode have too many distinct
# values to plot, so those two are left out.
cols = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
for i in cols:
    # Convert the column to the pandas category dtype.
    train_data[i] = train_data[i].astype('category')
    if train_data[i].isnull().any():
        # 'MISSING' has to be registered as a category before it can be
        # used as the fill value.
        train_data[i] = train_data[i].cat.add_categories(['MISSING'])
        train_data[i] = train_data[i].fillna('MISSING')


def boxplot(x, y, **kwargs):
    """Draw a box plot of y grouped by x; extra keyword arguments are ignored."""
    sns.boxplot(x=x, y=y)


f = pd.melt(train_data, id_vars=['price'], value_vars=cols)
# `height` replaces the `size` keyword, which was deprecated in seaborn 0.9
# and later removed.
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False, height=5)
g.map(boxplot, 'value', 'price')
# Violin plot of price for each categorical feature.
for feature in cols:
    sns.violinplot(x=feature, y='price', data=train_data)
    # Author's observation: without this call only a single figure is
    # produced; with it, each loop iteration yields its own plot.
    plt.show()
categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox',
                        'notRepairedDamage']


# Bar chart of price against each category of every categorical feature.
def bar_plot(x, y, **kwargs):
    """Draw a bar plot of y per x with vertical tick labels; extra kwargs are ignored."""
    sns.barplot(x=x, y=y)
    # Rotate the tick labels so long category lists stay readable. The return
    # value is not needed, so it is no longer bound to (and shadowing) x.
    plt.xticks(rotation=90)


f = pd.melt(train_data, id_vars=['price'], value_vars=categorical_features)
# `height` replaces the `size` keyword, which was deprecated in seaborn 0.9
# and later removed.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")
# Frequency of each category for every categorical feature (count plots).
def count_plot(x, **kwargs):
    """Draw a count plot of x with vertical tick labels; extra kwargs are ignored."""
    sns.countplot(x=x)
    # Rotate the tick labels so long category lists stay readable. The return
    # value is not needed, so it is no longer bound to (and shadowing) x.
    plt.xticks(rotation=90)


f = pd.melt(train_data, value_vars=categorical_features)
# `height` replaces the `size` keyword, which was deprecated in seaborn 0.9
# and later removed.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")