package com.home.spark.ml
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
 * VectorAssembler is a transformer that combines a given list of columns into a single vector column.
 * It is useful for combining raw features and features produced by different feature transformers into
 * one feature vector, in order to train ML models such as logistic regression and decision trees.
 *
 * VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type.
 * In each row, the values of the input columns are concatenated into a vector in the specified order.
 **/
object Ex_VectorAssembler {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf(true).setMaster("local[2]").setAppName("spark ml")
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // A single-row sample: an id, a numeric column, a flag-like numeric column,
    // a vector column and a label column.
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    // Concatenate "hour", "mobile" and the "userFeatures" vector into a single vector column "features".
    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("*").show(false)

    spark.stop()
  }
}
Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'
+---+----+------+--------------+-------+-----------------------+
|id |hour|mobile|userFeatures |clicked|features |
+---+----+------+--------------+-------+-----------------------+
|0 |18 |1.0 |[0.0,10.0,0.5]|1.0 |[18.0,1.0,0.0,10.0,0.5]|
+---+----+------+--------------+-------+-----------------------+
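Side note: if an input column can contain nulls (or NaN values), VectorAssembler will fail by default. Since Spark 2.4 it exposes setHandleInvalid with the options "error" (default), "skip" and "keep". Below is a minimal sketch, reusing the spark session, imports and column layout from the example above; the extra null row and the val names (assemblerKeep, withNulls) are made up for illustration.

    // Sketch only (assumes Spark 2.4+). With "keep", rows with nulls are kept and
    // the corresponding slots in the assembled vector become NaN; "skip" drops such rows.
    val assemblerKeep = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")
      .setHandleInvalid("keep") // alternatives: "error" (default) or "skip"

    val withNulls = spark.createDataFrame(
      Seq(
        (0, Some(18), 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0),
        (1, None,     0.0, Vectors.dense(1.0, 2.0, 3.0),  0.0) // "hour" is null in this row
      )
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    assemblerKeep.transform(withNulls).show(false)

Which option to choose depends on the downstream model: "skip" silently shrinks the training set, while "keep" requires the model (or a later imputation step) to tolerate NaN entries in the feature vector.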