用戶購買行為分析(聚類模型K-means)

本文轉載自查看原文 2019-11-27 20:23 360 Data Analysis

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Load the four CSV tables used by the rest of the analysis.
# The files are read in the order listed and bound to the matching names.
prior, products, orders, aisles = (
    pd.read_csv(csv_name)
    for csv_name in (
        "order_products__prior.csv",
        "products.csv",
        "orders.csv",
        "aisles.csv",
    )
)

# Join the four tables into one wide frame, one key column per join.
# Each key is passed once as a plain string; a list that repeats the same
# column name adds nothing to the join condition.
_mg = pd.merge(prior, products, on="product_id")  # order lines + product info
_mg = pd.merge(_mg, orders, on="order_id")        # + order info (incl. user_id)
mt = pd.merge(_mg, aisles, on="aisle_id")         # + aisle name

# Preview the first rows of the merged table.
mt.head()

	order_id	product_id	add_to_cart_order	reordered	product_name	aisle_id	department_id	user_id	eval_set	order_number	order_dow	order_hour_of_day	days_since_prior_order	aisle
0	2	33120	1	1	Organic Egg Whites	86	16	202279	prior	3	5	9	8.0	eggs
1	26	33120	5	0	Organic Egg Whites	86	16	153404	prior	2	0	16	7.0	eggs
2	120	33120	13	0	Organic Egg Whites	86	16	23750	prior	11	6	8	10.0	eggs
3	327	33120	5	1	Organic Egg Whites	86	16	58707	prior	21	6	9	8.0	eggs
4	390	33120	28	1	Organic Egg Whites	86	16	166654	prior	48	0	12	9.0	eggs

# Cross-tabulation (a specialised grouping tool): one row per user_id,
# one column per aisle, each cell counting how often that pair occurs in mt.
cross = pd.crosstab(index=mt["user_id"], columns=mt["aisle"])

# Preview the first ten users.
cross.head(n=10)

aisle	air fresheners candles	asian foods	baby accessories	baby bath body care	baby food formula	bakery desserts	baking ingredients	baking supplies decor	beauty	beers coolers	...	spreads	tea	tofu meat alternatives	tortillas flat bread	trail mix snack mix	trash bags liners	vitamins supplements	water seltzer sparkling water	white wines	yogurt
user_id
1	0	0	0	0	0	0	0	0	0	0	...	1	0	0	0	0	0	0	0	0	1
2	0	3	0	0	0	0	2	0	0	0	...	3	1	1	0	0	0	0	2	0	42
3	0	0	0	0	0	0	0	0	0	0	...	4	1	0	0	0	0	0	2	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	1	0	0	0	1	0	0
5	0	2	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	3
6	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	2	0	0	0	...	0	0	0	0	0	0	0	0	0	5
8	0	1	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
9	0	0	0	0	6	0	2	0	0	0	...	0	0	0	0	0	0	0	2	0	19
10	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	2

10 rows × 134 columns

# Principal component analysis.
# n_components is given as the fraction 0.9 rather than an integer count.
pca = PCA(n_components=0.9)

# Fit on the user x aisle table and project it in a single step.
data = pca.fit_transform(X=cross)

# In the article's run the 134 aisle columns were reduced to 27 components.
data.shape

(206209, 27)

# To keep the computation light, work on the first 500 rows only
# (all reduced columns are kept).
x = data[:500, :]
x.shape

(500, 27)

# Assume the users fall into four groups.
# random_state pins the centroid initialisation so the cluster labels -- and
# the colours and silhouette score derived from them -- are identical on
# every run instead of changing with each execution.
# NOTE(review): n_init=10 is set explicitly to keep ten restarts regardless of
# the installed scikit-learn's default; confirm this matches the intended run.
km = KMeans(n_clusters=4, n_init=10, random_state=0)

# Fit the model and obtain the cluster index of every row of x in one call.
predict = km.fit_predict(x)
predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 2, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 2, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1,
       1, 1, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

# Show the clustering result: open a square 10 x 10 inch figure.
# NOTE(review): the captured output reports this figure as having 0 Axes; if
# this statement runs as its own notebook cell, the later scatter call may be
# drawn on a different figure -- confirm and keep both in the same cell if so.
_FIGURE_SIDE_INCHES = 10
plt.figure(figsize=(_FIGURE_SIDE_INCHES, _FIGURE_SIDE_INCHES))

<Figure size 720x720 with 0 Axes>




<Figure size 720x720 with 0 Axes>

# One colour per cluster index (four clusters -> four colours).
colored = ['orange', 'blue', 'purple', 'green']

# Translate every predicted cluster index into its colour name.
colr = [colored[cluster_idx] for cluster_idx in predict]

# Plot component 3 on the X axis against component 18 on the Y axis;
# the choice of these two columns is arbitrary.
plt.scatter(x[:, 3], x[:, 18], color=colr)
plt.xlabel('3')
plt.ylabel('18')
plt.show()

# Judge the clustering quality with the silhouette coefficient, computed
# from the sampled points and the cluster label assigned to each of them.
silhouette_score(X=x, labels=predict)

0.6115021999326935

K-means通常被稱為勞埃德算法，這在數據聚類中是最經典的，也是相對容易理解的模型。算法執行的過程分為4個階段。

1.首先，隨機設K個特征空間內的點作為初始的聚類中心。
2.然后，根據每個數據的特征向量，從K個聚類中心中尋找距離最近的一個，並且把該數據標記為從屬於這個聚類中心。
3.接着，在所有的數據都被標記過聚類中心之后，根據這些數據新分配的類簇，取分配給每個先前質心的所有樣本的平均值來創建新的質心，即重新對K個聚類中心做計算。
4.最后，計算舊和新質心之間的差異,如果所有的數據點從屬的聚類中心與上一次的分配的類簇沒有變化，那么迭代就可以停止，否則回到步驟2繼續循環。

K均值等價於協方差矩陣為小的、全部相等的對角矩陣時的期望最大化（EM）算法。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 【機器學習】k-means——航空用戶聚類分析案例使用K-means和高斯混合模型對圖像進行聚類基於K－Means的文本聚類聚類算法——K-means（上） K-means聚類算法 K-Means 聚類分析學習筆記 Spark ML聚類分析之k-means|| k均值聚類（k-means clustering） matlab練習程序（k-means聚類） java實現K-means聚類算法