# -*- coding: utf-8 -*-
# Author: bioamin
# NLP analysis of e-commerce clothing reviews.
import numpy as np
import pandas as pd

# Plotting packages.
import matplotlib.pyplot as plt
import seaborn as sns

# Use the SimHei font for sans-serif text (the column labels below are
# Chinese) and turn off the Unicode minus glyph for axis tick labels.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Silence warnings. This is installed before missingno is imported, so the
# statement order here is intentional.
import warnings

warnings.filterwarnings("ignore")

import missingno

# Mapping from the original English CSV headers to the Chinese column names
# used throughout the rest of the script.
new_cloumname = {
    "Clothing ID": "服裝id",
    "Age": "年齡",
    "Title": "評論者標題",
    "Review Text": "評論內容",
    "Rating": "分數",
    "Recommended IND": "是否推薦",
    "Positive Feedback Count": "贊同該評論的人數",
    "Division Name": "產品高級分類",
    "Department Name": "產品大類",
    "Class Name": "產品的二級分類",
}

# Read the data and replace the column names by passing the dict to rename().
df = pd.read_csv("Comment.csv").rename(columns=new_cloumname)
df.head()

# Plot the missing-value matrix of the freshly loaded frame.
missingno.matrix(df, fontsize=25)

# Column whose missing values disqualify a row.
col = "評論內容"

# Drop the rows that have no review text, then re-plot the missing-value
# matrix for the filtered frame.
df = df.dropna(subset=[col])
missingno.matrix(df, fontsize=25)

# Add a new feature: the character count of each review text.
review_text = df["評論內容"].astype(str)
df["評論字數"] = review_text.map(len)
df.head()

# Visualise rating vs. review length.
# First lay out one facet per rating value with sns.FacetGrid,
# then fill every facet with a histogram via map().
g = sns.FacetGrid(data=df, col="分數")
g.map(plt.hist, "評論字數", bins=10)
plt.show()
# Author's observation: the higher the rating, the longer the reviews.

# Second visualisation of the rating / review-length relationship.
sns.pointplot(data=df, x="分數", y="評論字數")
plt.show()
# Author's observation: reviewers who gave a rating of 3 write the most.

# Find the variables most strongly correlated with the clothing rating.
k = 10  # how many of the top-correlated variables to keep

# DataFrame.corr() can use the 'pearson', 'kendall' or 'spearman' method;
# 'pearson' is the default.
# The frame still contains text columns (review title, review text, product
# categories). Since pandas 2.0, corr() no longer drops them silently and
# raises instead, so restrict the frame to numeric/bool columns explicitly.
# select_dtypes() works on old and new pandas alike.
# The result is rounded to 2 decimal places.
corr = df.select_dtypes(include=["number", "bool"]).corr().round(2)
corr

# pandas.DataFrame.nlargest(n, columns, keep="first") returns the rows of a
# DataFrame that hold the n largest values in `columns`.
cols = corr.nlargest(n=k, columns="分數")
cols

# Labels of the k variables with the largest correlation to the rating.
cols = corr.nlargest(k, "分數")["分數"].index

# Correlation matrix restricted to those variables, rounded to 2 decimals.
cm = df[cols].corr().round(2)

# np.zeros_like() returns an array of zeros with the same shape as the given
# array. Use the builtin `bool` as dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(cm, dtype=bool)
# Mark the upper triangle (diagonal included) so the heatmap hides it.
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(8, 8))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    cm,
    mask=mask,
    cmap=cmap,
    center=0,
    annot=True,
    cbar_kws={"shrink": .5},
)

# Stacked bar chart: number of reviews per 10-year age group, split by rating.
# The bin edges must reach 100. np.arange(0, 100, 10) stops at 90, so pd.cut
# would map every age in (90, 100) to NaN and groupby would silently drop
# those rows.
age_groups = pd.cut(df["年齡"], np.arange(0, 101, 10))
rating_by_age = df.groupby(["分數", age_groups]).size().unstack(0)
rating_by_age.plot.bar(stacked=True)
plt.show()

# Product department vs. age.
# Stacked bar chart: number of reviews per 10-year age group, split by
# product department.
# The bin edges must reach 100. np.arange(0, 100, 10) stops at 90, so pd.cut
# would map every age in (90, 100) to NaN and groupby would silently drop
# those rows.
age_groups = pd.cut(df["年齡"], np.arange(0, 101, 10))
department_by_age = df.groupby(["產品大類", age_groups]).size().unstack(0)
department_by_age.plot.bar(stacked=True)
plt.show()
# Author's observations:
# - The 30-40 and 40-50 age groups are the main shoppers.
# - Across the product departments, "top" sells well in every age group.

