天池_二手車交易價格預測數據分析

本文轉載自查看原文 2020-07-10 16:12 2802 建模例子

賽題數據鏈接 https://tianchi.aliyun.com/competition/entrance/231784/information

字典

Field	Description
SaleID	交易ID，唯一編碼
name	汽車交易名稱，已脫敏
regDate	汽車注冊日期，例如20160101，2016年01月01日
model	車型編碼，已脫敏
brand	汽車品牌，已脫敏
bodyType	車身類型：豪華轎車：0，微型車：1，廂型車：2，大巴車：3，敞篷車：4，雙門汽車：5，商務車：6，攪拌車：7
fuelType	燃油類型：汽油：0，柴油：1，液化石油氣：2，天然氣：3，混合動力：4，其他：5，電動：6
gearbox	變速箱：手動：0，自動：1
power	發動機功率：范圍 [ 0, 600 ]
kilometer	汽車已行駛公里，單位萬km
notRepairedDamage	汽車有尚未修復的損壞：是：0，否：1
regionCode	地區編碼，已脫敏
seller	銷售方：個體：0，非個體：1
offerType	報價類型：提供：0，請求：1
creatDate	汽車上線時間，即開始售賣時間
price	二手車交易價格（預測目標）
v系列特征	匿名特征，包含v0-14在內15個匿名特征

step1：導入模塊

## 基礎工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型預測的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 數據降維處理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 參數搜索和評價的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

step2：數據讀取

# Load the training set and the test set; both files use a single space as delimiter.
train_data = pd.read_csv(
    'F:\\python\\天池_二手車交易價格預測\\used_car_train_20200313.csv',
    sep=' ',
)
test_data = pd.read_csv(
    'F:\\python\\天池_二手車交易價格預測\\used_car_testB_20200421.csv',
    sep=' ',
)

1.數據量大小

## Print the (rows, columns) shape of each data set.
# NOTE: the second label reads "TestA" although test_data is loaded from the testB file.
for label, frame in (('Train data shape:', train_data), ('TestA data shape:', test_data)):
    print(label, frame.shape)

Train data shape: (150000, 31)
TestA data shape: (50000, 30)

2.數據瀏覽

# Preview the first three and the last three rows together.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
pd.concat([train_data.head(3), train_data.tail(3)])

3.數據信息查看info()

# info() summarises the frame: each column's dtype and non-null count,
# which makes columns with missing values easy to spot.
train_data.info()

4.查看列名

# List the column names via the .columns attribute.
train_data.columns

5.數據統計瀏覽

# Summary statistics of the frame via .describe().
train_data.describe()

剩下的不復制過來了

step3：缺失值

# Number of missing values in each column.
train_data.isna().sum()

# Share of missing values in each column.
train_data.isna().sum() / len(train_data)

# Bar chart of the columns that have at least one missing value, sorted ascending.
missing = train_data.isna().sum()
missing.loc[missing > 0].sort_values().plot(kind='bar')

查看其他類型的空值，如 '-'

# Print the value distribution of every column, one column at a time.
for column in train_data.columns:
    counts = train_data[column].value_counts()
    print(counts)

發現notRepairedDamage中存在以'-'表示的空值：

# Treat the '-' placeholder as a missing value.
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place call on a column selection may operate on a
# temporary object and leave train_data unchanged (pandas Copy-on-Write).
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].replace('-', np.nan)

seller和offerType是嚴重傾斜的數據，對因變量沒有意義，可以刪除

# Remove the two features from train_data in place.
train_data.drop(columns=['seller', 'offerType'], inplace=True)

step4：y值的分布

# Plot the target variable: a plain histogram in figure 1 and
# a seaborn distribution plot in figure 2.
plt.figure(1)
train_data['price'].plot(kind='hist')
plt.figure(2)
sns.distplot(train_data['price'])

價格不符合正態分布

step5：特征分析

1.區分類別特征和數字特征

# 1. Split the features directly by column dtype.
# Numeric features
numeric_features = train_data.select_dtypes(include=[np.number])
numeric_features.columns
# Categorical features.
# The np.object alias was removed from NumPy; the builtin `object` selects
# the same object-dtype columns.
categorical_features = train_data.select_dtypes(include=[object])
categorical_features.columns


# 2. Split the features according to the data dictionary; this is the approach used here.
numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
                    'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
# 'seller' and 'offerType' are intentionally left out: those columns were
# deleted from train_data earlier, so indexing train_data with them would
# raise KeyError in the loops that iterate over this list.
categorical_features = ['name', 'model', 'brand', 'bodyType', 'fuelType',
                        'gearbox', 'notRepairedDamage', 'regionCode']

查看每個類別特征有多少個nunique分布

# For every categorical feature: print a header, the number of distinct
# values, and the full value distribution.
for feature in categorical_features:
    series = train_data[feature]
    print(feature + '特征分布如下:')
    print('{}特征有{}個不同的值'.format(feature, series.nunique()))
    print(series.value_counts())

數字特征

# Correlation analysis between the numeric features and the target.
# Add 'price' only once: appending it again when this cell is re-run would
# select a duplicated 'price' column, and corr['price'] would then be a
# DataFrame instead of a Series, breaking sort_values() below.
if 'price' not in numeric_features:
    numeric_features.append('price')
corr = train_data[numeric_features].corr()
# The trailing '\n' just adds a blank line after the printed series.
print(corr['price'].sort_values(ascending=False), '\n')

畫熱力圖

# Render the `corr` correlation matrix as a heatmap.
sns.heatmap(corr)

查看數字特征的偏度和峰度

# Print the skewness and kurtosis of every numeric feature.
# (The loop body must be indented; without it the cell fails with IndentationError.)
for i in numeric_features:
    print('{}'.format(i),'偏度:{:05.2f}'.format(train_data[i].skew()),' ','峰度:{:05.2f}'.format(train_data[i].kurt()))

數字特征可視化

# Approach 1: reshape to long format (variable / value) and draw one
# distribution plot per numeric feature in a two-column facet grid.
f = train_data.melt(value_vars=numeric_features)
g = sns.FacetGrid(
    f,
    col='variable',
    col_wrap=2,
    sharex=False,
    sharey=False,
)
g = g.map(sns.distplot, 'value')

# Approach 2: draw every feature into a 9 x 2 subplot grid of one figure.
# The resulting figure is rather crowded.
for position, col in enumerate(numeric_features, start=1):
    plt.subplot(9, 2, position)
    sns.distplot(train_data[col])

# Pairwise relationships between the target and a selection of numeric features.
columns = ['price', 'v_12', 'v_8' , 'v_0', 'power', 'v_5',  'v_2', 'v_6', 'v_1', 'v_14']
# seaborn renamed the `size` argument to `height`; the old name is no longer accepted.
sns.pairplot(train_data[columns], height=2)

變量和y的回歸關系可視化

# Regression plot of price against each selected feature, laid out on a
# 5 x 2 grid. The features are listed in plotting order; the flattened axes
# array is row-major, so the first feature lands top-left, the second
# top-right, and so on.
regression_features = ['v_12', 'v_8', 'v_0', 'power', 'v_5',
                       'v_2', 'v_6', 'v_1', 'v_14', 'v_13']

fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(24, 20))

for ax, feature in zip(axes.ravel(), regression_features):
    # Pass only the two columns needed for this panel.
    sns.regplot(x=feature, y='price', data=train_data[[feature, 'price']], ax=ax)

# Number of distinct values of each categorical feature.
for feature in categorical_features:
    distinct = train_data[feature].nunique()
    print('{}: 有 {} 個不重復的值'.format(feature, distinct))

類別特征可視化

# Prepare the categorical features for box plots.
# Per the nunique() output, name and regionCode have too many distinct values
# to plot, so those two are left out.
cols = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']

for col in cols:
    # Convert to the pandas category dtype.
    as_category = train_data[col].astype('category')
    if as_category.isnull().any():
        # Missing values get their own explicit 'MISSING' category.
        as_category = as_category.cat.add_categories(['MISSING']).fillna('MISSING')
    train_data[col] = as_category
    
    
def boxplot(x, y, **kwargs):
    """Draw a box plot of ``y`` grouped by ``x``.

    Extra keyword arguments are accepted so the function can be used as a
    FacetGrid.map callback; they are ignored here.
    """
    sns.boxplot(x=x, y=y)


# One box plot of price per categorical feature, two panels per row.
f = pd.melt(train_data, id_vars=['price'], value_vars=cols)
# seaborn renamed the `size` argument to `height`; the old name is no longer accepted.
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False, height=5)
g.map(boxplot, 'value', 'price')

# Violin plot of price for each categorical feature.
for col in cols:
    sns.violinplot(data=train_data, x=col, y='price')
    # Per the original author's note: without this call only a single image
    # is produced; with it, each loop iteration shows its own plot.
    plt.show()

# Redefine the categorical feature list for the plots that follow;
# name and regionCode are not included.
categorical_features = ['model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'notRepairedDamage']


# Bar chart of price for each category of every categorical feature.
def bar_plot(x, y, **kwargs):
    """Draw a bar plot of ``y`` by ``x`` and rotate the x tick labels 90 degrees.

    Extra keyword arguments are accepted so the function can be used as a
    FacetGrid.map callback; they are ignored here.
    """
    sns.barplot(x=x, y=y)
    plt.xticks(rotation=90)


f = pd.melt(train_data, id_vars=['price'], value_vars=categorical_features)
# seaborn renamed the `size` argument to `height`; the old name is no longer accepted.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")

# Frequency of each category for every categorical feature (count plot).
def count_plot(x, **kwargs):
    """Draw a count plot of ``x`` and rotate the x tick labels 90 degrees.

    Extra keyword arguments are accepted so the function can be used as a
    FacetGrid.map callback; they are ignored here.
    """
    sns.countplot(x=x)
    plt.xticks(rotation=90)


f = pd.melt(train_data, value_vars=categorical_features)
# seaborn renamed the `size` argument to `height`; the old name is no longer accepted.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 ML .NET 二手車價格預測（一）二手車項目（一） JavaWeb實現網上二手車交易平台小項目 ML .NET 二手車價格預測之再次訓練與參數調整（二）記瓜子二手車的一次面試爬取瓜子二手車代碼 Python scrapy框架爬取瓜子二手車信息數據 58同城二手車數據爬蟲——數字加密解碼（Python原創）解決信息不對稱——看區塊鏈技術如何普惠二手車交易消費者 Python數據分析——上海市二手房價格分析