# IMPORT
>>> import numpy
>>> from numpy import allclose
>>> from pyspark.ml.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
>>> from pyspark.ml.classification import RandomForestClassifier
# PREPARE DATA
>>> df = spark.createDataFrame([
...     (1.0, Vectors.dense(1.0)),
...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
>>> si_model = stringIndexer.fit(df)
>>> td = si_model.transform(df)
# BUILD THE MODEL
>>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
>>> model = rf.fit(td)
# FEATURE IMPORTANCES
>>> model.featureImportances
SparseVector(1, {0: 1.0})
重要性:
model.featureImportances
pyspark 模型簡單實例:
https://blog.csdn.net/Katherine_hsr/article/details/80988994
概率:
predictions.select("probability", "label").show(1000)
probability--->即為輸出概率
pandas 打亂樣本:
import pandas as pd
df = pd.read_excel("window regulator01 _0914新增樣本.xlsx")
df = df.sample(frac = 1) #打亂樣本
pyspark train、test 隨機劃分
train, test = labeled_v.randomSplit([0.75, 0.25])