梯度提升樹(GBT)是決策樹的集合。 GBT迭代地訓練決策樹以便使損失函數最小化。 spark.ml實現支持GBT用於二進制分類和回歸,可以使用連續和分類特征。
GBDT的優點
GBDT和隨機森林一樣,都具備決策樹的一些優點:
(1)可以處理類別特征和連續特征;
(2)不需要對數據進行標准化預處理;
(3)可以分析特征之間的相互影響
值得注意的是,Spark中的GBDT目前還不能處理多分類問題,僅可以用於二分類和回歸問題。(Spark隨機森林可以處理多分類問題)
導入包
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrameReader
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.DataFrameStatFunctions
import org.apache.spark.sql.functions._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer }
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.regression.{ GBTRegressionModel, GBTRegressor }
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
導入數據源
val spark = SparkSession.builder().appName("Spark Gradient-boosted tree regression").config("spark.some.config.option", "some-value").getOrCreate()
// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._
val dataList: List[(Double, String, Double, Double, String, Double, Double, Double, Double)] = List(
(0, "male", 37, 10, "no", 3, 18, 7, 4),
(0, "female", 27, 4, "no", 4, 14, 6, 4),
(0, "female", 32, 15, "yes", 1, 12, 1, 4),
(0, "male", 57, 15, "yes", 5, 18, 6, 5),
(0, "male", 22, 0.75, "no", 2, 17, 6, 3),
(0, "female", 32, 1.5, "no", 2, 17, 5, 5),
(0, "female", 22, 0.75, "no", 2, 12, 1, 3),
(0, "male", 57, 15, "yes", 2, 14, 4, 4),
(0, "female", 32, 15, "yes", 4, 16, 1, 2),
(0, "male", 22, 1.5, "no", 4, 14, 4, 5),
(0, "male", 37, 15, "yes", 2, 20, 7, 2),
(0, "male", 27, 4, "yes", 4, 18, 6, 4),
(0, "male", 47, 15, "yes", 5, 17, 6, 4),
(0, "female", 22, 1.5, "no", 2, 17, 5, 4),
(0, "female", 27, 4, "no", 4, 14, 5, 4),
(0, "female", 37, 15, "yes", 1, 17, 5, 5),
(0, "female", 37, 15, "yes", 2, 18, 4, 3),
(0, "female", 22, 0.75, "no", 3, 16, 5, 4),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 10, "yes", 2, 14, 1, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 10, "yes", 4, 16, 5, 4),
(0, "female", 32, 10, "yes", 3, 14, 1, 5),
(0, "male", 37, 4, "yes", 2, 20, 6, 4),
(0, "female", 22, 1.5, "no", 2, 18, 5, 5),
(0, "female", 27, 7, "no", 4, 16, 1, 5),
(0, "male", 42, 15, "yes", 5, 20, 6, 4),
(0, "male", 27, 4, "yes", 3, 16, 5, 5),
(0, "female", 27, 4, "yes", 3, 17, 5, 4),
(0, "male", 42, 15, "yes", 4, 20, 6, 3),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 27, 0.417, "no", 4, 17, 6, 4),
(0, "female", 42, 15, "yes", 5, 14, 5, 4),
(0, "male", 32, 4, "yes", 1, 18, 6, 4),
(0, "female", 22, 1.5, "no", 4, 16, 5, 3),
(0, "female", 42, 15, "yes", 3, 12, 1, 4),
(0, "female", 22, 4, "no", 4, 17, 5, 5),
(0, "male", 22, 1.5, "yes", 1, 14, 3, 5),
(0, "female", 22, 0.75, "no", 3, 16, 1, 5),
(0, "male", 32, 10, "yes", 5, 20, 6, 5),
(0, "male", 52, 15, "yes", 5, 18, 6, 3),
(0, "female", 22, 0.417, "no", 5, 14, 1, 4),
(0, "female", 27, 4, "yes", 2, 18, 6, 1),
(0, "female", 32, 7, "yes", 5, 17, 5, 3),
(0, "male", 22, 4, "no", 3, 16, 5, 5),
(0, "female", 27, 7, "yes", 4, 18, 6, 5),
(0, "female", 42, 15, "yes", 2, 18, 5, 4),
(0, "male", 27, 1.5, "yes", 4, 16, 3, 5),
(0, "male", 42, 15, "yes", 2, 20, 6, 4),
(0, "female", 22, 0.75, "no", 5, 14, 3, 5),
(0, "male", 32, 7, "yes", 2, 20, 6, 4),
(0, "male", 27, 4, "yes", 5, 20, 6, 5),
(0, "male", 27, 10, "yes", 4, 20, 6, 4),
(0, "male", 22, 4, "no", 1, 18, 5, 5),
(0, "female", 37, 15, "yes", 4, 14, 3, 1),
(0, "male", 22, 1.5, "yes", 5, 16, 4, 4),
(0, "female", 37, 15, "yes", 4, 17, 1, 5),
(0, "female", 27, 0.75, "no", 4, 17, 5, 4),
(0, "male", 32, 10, "yes", 4, 20, 6, 4),
(0, "female", 47, 15, "yes", 5, 14, 7, 2),
(0, "male", 37, 10, "yes", 3, 20, 6, 4),
(0, "female", 22, 0.75, "no", 2, 16, 5, 5),
(0, "male", 27, 4, "no", 2, 18, 4, 5),
(0, "male", 32, 7, "no", 4, 20, 6, 4),
(0, "male", 42, 15, "yes", 2, 17, 3, 5),
(0, "male", 37, 10, "yes", 4, 20, 6, 4),
(0, "female", 47, 15, "yes", 3, 17, 6, 5),
(0, "female", 22, 1.5, "no", 5, 16, 5, 5),
(0, "female", 27, 1.5, "no", 2, 16, 6, 4),
(0, "female", 27, 4, "no", 3, 17, 5, 5),
(0, "female", 32, 10, "yes", 5, 14, 4, 5),
(0, "female", 22, 0.125, "no", 2, 12, 5, 5),
(0, "male", 47, 15, "yes", 4, 14, 4, 3),
(0, "male", 32, 15, "yes", 1, 14, 5, 5),
(0, "male", 27, 7, "yes", 4, 16, 5, 5),
(0, "female", 22, 1.5, "yes", 3, 16, 5, 5),
(0, "male", 27, 4, "yes", 3, 17, 6, 5),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 57, 15, "yes", 2, 14, 7, 2),
(0, "male", 17.5, 1.5, "yes", 3, 18, 6, 5),
(0, "male", 57, 15, "yes", 4, 20, 6, 5),
(0, "female", 22, 0.75, "no", 2, 16, 3, 4),
(0, "male", 42, 4, "no", 4, 17, 3, 3),
(0, "female", 22, 1.5, "yes", 4, 12, 1, 5),
(0, "female", 22, 0.417, "no", 1, 17, 6, 4),
(0, "female", 32, 15, "yes", 4, 17, 5, 5),
(0, "female", 27, 1.5, "no", 3, 18, 5, 2),
(0, "female", 22, 1.5, "yes", 3, 14, 1, 5),
(0, "female", 37, 15, "yes", 3, 14, 1, 4),
(0, "female", 32, 15, "yes", 4, 14, 3, 4),
(0, "male", 37, 10, "yes", 2, 14, 5, 3),
(0, "male", 37, 10, "yes", 4, 16, 5, 4),
(0, "male", 57, 15, "yes", 5, 20, 5, 3),
(0, "male", 27, 0.417, "no", 1, 16, 3, 4),
(0, "female", 42, 15, "yes", 5, 14, 1, 5),
(0, "male", 57, 15, "yes", 3, 16, 6, 1),
(0, "male", 37, 10, "yes", 1, 16, 6, 4),
(0, "male", 37, 15, "yes", 3, 17, 5, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 5),
(0, "female", 27, 10, "yes", 5, 14, 1, 5),
(0, "male", 37, 10, "yes", 2, 18, 6, 4),
(0, "female", 22, 0.125, "no", 4, 12, 4, 5),
(0, "male", 57, 15, "yes", 5, 20, 6, 5),
(0, "female", 37, 15, "yes", 4, 18, 6, 4),
(0, "male", 22, 4, "yes", 4, 14, 6, 4),
(0, "male", 27, 7, "yes", 4, 18, 5, 4),
(0, "male", 57, 15, "yes", 4, 20, 5, 4),
(0, "male", 32, 15, "yes", 3, 14, 6, 3),
(0, "female", 22, 1.5, "no", 2, 14, 5, 4),
(0, "female", 32, 7, "yes", 4, 17, 1, 5),
(0, "female", 37, 15, "yes", 4, 17, 6, 5),
(0, "female", 32, 1.5, "no", 5, 18, 5, 5),
(0, "male", 42, 10, "yes", 5, 20, 7, 4),
(0, "female", 27, 7, "no", 3, 16, 5, 4),
(0, "male", 37, 15, "no", 4, 20, 6, 5),
(0, "male", 37, 15, "yes", 4, 14, 3, 2),
(0, "male", 32, 10, "no", 5, 18, 6, 4),
(0, "female", 22, 0.75, "no", 4, 16, 1, 5),
(0, "female", 27, 7, "yes", 4, 12, 2, 4),
(0, "female", 27, 7, "yes", 2, 16, 2, 5),
(0, "female", 42, 15, "yes", 5, 18, 5, 4),
(0, "male", 42, 15, "yes", 4, 17, 5, 3),
(0, "female", 27, 7, "yes", 2, 16, 1, 2),
(0, "female", 22, 1.5, "no", 3, 16, 5, 5),
(0, "male", 37, 15, "yes", 5, 20, 6, 5),
(0, "female", 22, 0.125, "no", 2, 14, 4, 5),
(0, "male", 27, 1.5, "no", 4, 16, 5, 5),
(0, "male", 32, 1.5, "no", 2, 18, 6, 5),
(0, "male", 27, 1.5, "no", 2, 17, 6, 5),
(0, "female", 27, 10, "yes", 4, 16, 1, 3),
(0, "male", 42, 15, "yes", 4, 18, 6, 5),
(0, "female", 27, 1.5, "no", 2, 16, 6, 5),
(0, "male", 27, 4, "no", 2, 18, 6, 3),
(0, "female", 32, 10, "yes", 3, 14, 5, 3),
(0, "female", 32, 15, "yes", 3, 18, 5, 4),
(0, "female", 22, 0.75, "no", 2, 18, 6, 5),
(0, "female", 37, 15, "yes", 2, 16, 1, 4),
(0, "male", 27, 4, "yes", 4, 20, 5, 5),
(0, "male", 27, 4, "no", 1, 20, 5, 4),
(0, "female", 27, 10, "yes", 2, 12, 1, 4),
(0, "female", 32, 15, "yes", 5, 18, 6, 4),
(0, "male", 27, 7, "yes", 5, 12, 5, 3),
(0, "male", 52, 15, "yes", 2, 18, 5, 4),
(0, "male", 27, 4, "no", 3, 20, 6, 3),
(0, "male", 37, 4, "yes", 1, 18, 5, 4),
(0, "male", 27, 4, "yes", 4, 14, 5, 4),
(0, "female", 52, 15, "yes", 5, 12, 1, 3),
(0, "female", 57, 15, "yes", 4, 16, 6, 4),
(0, "male", 27, 7, "yes", 1, 16, 5, 4),
(0, "male", 37, 7, "yes", 4, 20, 6, 3),
(0, "male", 22, 0.75, "no", 2, 14, 4, 3),
(0, "male", 32, 4, "yes", 2, 18, 5, 3),
(0, "male", 37, 15, "yes", 4, 20, 6, 3),
(0, "male", 22, 0.75, "yes", 2, 14, 4, 3),
(0, "male", 42, 15, "yes", 4, 20, 6, 3),
(0, "female", 52, 15, "yes", 5, 17, 1, 1),
(0, "female", 37, 15, "yes", 4, 14, 1, 2),
(0, "male", 27, 7, "yes", 4, 14, 5, 3),
(0, "male", 32, 4, "yes", 2, 16, 5, 5),
(0, "female", 27, 4, "yes", 2, 18, 6, 5),
(0, "female", 27, 4, "yes", 2, 18, 5, 5),
(0, "male", 37, 15, "yes", 5, 18, 6, 5),
(0, "female", 47, 15, "yes", 5, 12, 5, 4),
(0, "female", 32, 10, "yes", 3, 17, 1, 4),
(0, "female", 27, 1.5, "yes", 4, 17, 1, 2),
(0, "female", 57, 15, "yes", 2, 18, 5, 2),
(0, "female", 22, 1.5, "no", 4, 14, 5, 4),
(0, "male", 42, 15, "yes", 3, 14, 3, 4),
(0, "male", 57, 15, "yes", 4, 9, 2, 2),
(0, "male", 57, 15, "yes", 4, 20, 6, 5),
(0, "female", 22, 0.125, "no", 4, 14, 4, 5),
(0, "female", 32, 10, "yes", 4, 14, 1, 5),
(0, "female", 42, 15, "yes", 3, 18, 5, 4),
(0, "female", 27, 1.5, "no", 2, 18, 6, 5),
(0, "male", 32, 0.125, "yes", 2, 18, 5, 2),
(0, "female", 27, 4, "no", 3, 16, 5, 4),
(0, "female", 27, 10, "yes", 2, 16, 1, 4),
(0, "female", 32, 7, "yes", 4, 16, 1, 3),
(0, "female", 37, 15, "yes", 4, 14, 5, 4),
(0, "female", 42, 15, "yes", 5, 17, 6, 2),
(0, "male", 32, 1.5, "yes", 4, 14, 6, 5),
(0, "female", 32, 4, "yes", 3, 17, 5, 3),
(0, "female", 37, 7, "no", 4, 18, 5, 5),
(0, "female", 22, 0.417, "yes", 3, 14, 3, 5),
(0, "female", 27, 7, "yes", 4, 14, 1, 5),
(0, "male", 27, 0.75, "no", 3, 16, 5, 5),
(0, "male", 27, 4, "yes", 2, 20, 5, 5),
(0, "male", 32, 10, "yes", 4, 16, 4, 5),
(0, "male", 32, 15, "yes", 1, 14, 5, 5),
(0, "male", 22, 0.75, "no", 3, 17, 4, 5),
(0, "female", 27, 7, "yes", 4, 17, 1, 4),
(0, "male", 27, 0.417, "yes", 4, 20, 5, 4),
(0, "male", 37, 15, "yes", 4, 20, 5, 4),
(0, "female", 37, 15, "yes", 2, 14, 1, 3),
(0, "male", 22, 4, "yes", 1, 18, 5, 4),
(0, "male", 37, 15, "yes", 4, 17, 5, 3),
(0, "female", 22, 1.5, "no", 2, 14, 4, 5),
(0, "male", 52, 15, "yes", 4, 14, 6, 2),
(0, "female", 22, 1.5, "no", 4, 17, 5, 5),
(0, "male", 32, 4, "yes", 5, 14, 3, 5),
(0, "male", 32, 4, "yes", 2, 14, 3, 5),
(0, "female", 22, 1.5, "no", 3, 16, 6, 5),
(0, "male", 27, 0.75, "no", 2, 18, 3, 3),
(0, "female", 22, 7, "yes", 2, 14, 5, 2),
(0, "female", 27, 0.75, "no", 2, 17, 5, 3),
(0, "female", 37, 15, "yes", 4, 12, 1, 2),
(0, "female", 22, 1.5, "no", 1, 14, 1, 5),
(0, "female", 37, 10, "no", 2, 12, 4, 4),
(0, "female", 37, 15, "yes", 4, 18, 5, 3),
(0, "female", 42, 15, "yes", 3, 12, 3, 3),
(0, "male", 22, 4, "no", 2, 18, 5, 5),
(0, "male", 52, 7, "yes", 2, 20, 6, 2),
(0, "male", 27, 0.75, "no", 2, 17, 5, 5),
(0, "female", 27, 4, "no", 2, 17, 4, 5),
(0, "male", 42, 1.5, "no", 5, 20, 6, 5),
(0, "male", 22, 1.5, "no", 4, 17, 6, 5),
(0, "male", 22, 4, "no", 4, 17, 5, 3),
(0, "female", 22, 4, "yes", 1, 14, 5, 4),
(0, "male", 37, 15, "yes", 5, 20, 4, 5),
(0, "female", 37, 10, "yes", 3, 16, 6, 3),
(0, "male", 42, 15, "yes", 4, 17, 6, 5),
(0, "female", 47, 15, "yes", 4, 17, 5, 5),
(0, "male", 22, 1.5, "no", 4, 16, 5, 4),
(0, "female", 32, 10, "yes", 3, 12, 1, 4),
(0, "female", 22, 7, "yes", 1, 14, 3, 5),
(0, "female", 32, 10, "yes", 4, 17, 5, 4),
(0, "male", 27, 1.5, "yes", 2, 16, 2, 4),
(0, "male", 37, 15, "yes", 4, 14, 5, 5),
(0, "male", 42, 4, "yes", 3, 14, 4, 5),
(0, "female", 37, 15, "yes", 5, 14, 5, 4),
(0, "female", 32, 7, "yes", 4, 17, 5, 5),
(0, "female", 42, 15, "yes", 4, 18, 6, 5),
(0, "male", 27, 4, "no", 4, 18, 6, 4),
(0, "male", 22, 0.75, "no", 4, 18, 6, 5),
(0, "male", 27, 4, "yes", 4, 14, 5, 3),
(0, "female", 22, 0.75, "no", 5, 18, 1, 5),
(0, "female", 52, 15, "yes", 5, 9, 5, 5),
(0, "male", 32, 10, "yes", 3, 14, 5, 5),
(0, "female", 37, 15, "yes", 4, 16, 4, 4),
(0, "male", 32, 7, "yes", 2, 20, 5, 4),
(0, "female", 42, 15, "yes", 3, 18, 1, 4),
(0, "male", 32, 15, "yes", 1, 16, 5, 5),
(0, "male", 27, 4, "yes", 3, 18, 5, 5),
(0, "female", 32, 15, "yes", 4, 12, 3, 4),
(0, "male", 22, 0.75, "yes", 3, 14, 2, 4),
(0, "female", 22, 1.5, "no", 3, 16, 5, 3),
(0, "female", 42, 15, "yes", 4, 14, 3, 5),
(0, "female", 52, 15, "yes", 3, 16, 5, 4),
(0, "male", 37, 15, "yes", 5, 20, 6, 4),
(0, "female", 47, 15, "yes", 4, 12, 2, 3),
(0, "male", 57, 15, "yes", 2, 20, 6, 4),
(0, "male", 32, 7, "yes", 4, 17, 5, 5),
(0, "female", 27, 7, "yes", 4, 17, 1, 4),
(0, "male", 22, 1.5, "no", 1, 18, 6, 5),
(0, "female", 22, 4, "yes", 3, 9, 1, 4),
(0, "female", 22, 1.5, "no", 2, 14, 1, 5),
(0, "male", 42, 15, "yes", 2, 20, 6, 4),
(0, "male", 57, 15, "yes", 4, 9, 2, 4),
(0, "female", 27, 7, "yes", 2, 18, 1, 5),
(0, "female", 22, 4, "yes", 3, 14, 1, 5),
(0, "male", 37, 15, "yes", 4, 14, 5, 3),
(0, "male", 32, 7, "yes", 1, 18, 6, 4),
(0, "female", 22, 1.5, "no", 2, 14, 5, 5),
(0, "female", 22, 1.5, "yes", 3, 12, 1, 3),
(0, "male", 52, 15, "yes", 2, 14, 5, 5),
(0, "female", 37, 15, "yes", 2, 14, 1, 1),
(0, "female", 32, 10, "yes", 2, 14, 5, 5),
(0, "male", 42, 15, "yes", 4, 20, 4, 5),
(0, "female", 27, 4, "yes", 3, 18, 4, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 5),
(0, "male", 27, 1.5, "no", 3, 18, 5, 5),
(0, "female", 22, 0.125, "no", 2, 16, 6, 3),
(0, "male", 32, 10, "yes", 2, 20, 6, 3),
(0, "female", 27, 4, "no", 4, 18, 5, 4),
(0, "female", 27, 7, "yes", 2, 12, 5, 1),
(0, "male", 32, 4, "yes", 5, 18, 6, 3),
(0, "female", 37, 15, "yes", 2, 17, 5, 5),
(0, "male", 47, 15, "no", 4, 20, 6, 4),
(0, "male", 27, 1.5, "no", 1, 18, 5, 5),
(0, "male", 37, 15, "yes", 4, 20, 6, 4),
(0, "female", 32, 15, "yes", 4, 18, 1, 4),
(0, "female", 32, 7, "yes", 4, 17, 5, 4),
(0, "female", 42, 15, "yes", 3, 14, 1, 3),
(0, "female", 27, 7, "yes", 3, 16, 1, 4),
(0, "male", 27, 1.5, "no", 3, 16, 4, 2),
(0, "male", 22, 1.5, "no", 3, 16, 3, 5),
(0, "male", 27, 4, "yes", 3, 16, 4, 2),
(0, "female", 27, 7, "yes", 3, 12, 1, 2),
(0, "female", 37, 15, "yes", 2, 18, 5, 4),
(0, "female", 37, 7, "yes", 3, 14, 4, 4),
(0, "male", 22, 1.5, "no", 2, 16, 5, 5),
(0, "male", 37, 15, "yes", 5, 20, 5, 4),
(0, "female", 22, 1.5, "no", 4, 16, 5, 3),
(0, "female", 32, 10, "yes", 4, 16, 1, 5),
(0, "male", 27, 4, "no", 2, 17, 5, 3),
(0, "female", 22, 0.417, "no", 4, 14, 5, 5),
(0, "female", 27, 4, "no", 2, 18, 5, 5),
(0, "male", 37, 15, "yes", 4, 18, 5, 3),
(0, "male", 37, 10, "yes", 5, 20, 7, 4),
(0, "female", 27, 7, "yes", 2, 14, 4, 2),
(0, "male", 32, 4, "yes", 2, 16, 5, 5),
(0, "male", 32, 4, "yes", 2, 16, 6, 4),
(0, "male", 22, 1.5, "no", 3, 18, 4, 5),
(0, "female", 22, 4, "yes", 4, 14, 3, 4),
(0, "female", 17.5, 0.75, "no", 2, 18, 5, 4),
(0, "male", 32, 10, "yes", 4, 20, 4, 5),
(0, "female", 32, 0.75, "no", 5, 14, 3, 3),
(0, "male", 37, 15, "yes", 4, 17, 5, 3),
(0, "male", 32, 4, "no", 3, 14, 4, 5),
(0, "female", 27, 1.5, "no", 2, 17, 3, 2),
(0, "female", 22, 7, "yes", 4, 14, 1, 5),
(0, "male", 47, 15, "yes", 5, 14, 6, 5),
(0, "male", 27, 4, "yes", 1, 16, 4, 4),
(0, "female", 37, 15, "yes", 5, 14, 1, 3),
(0, "male", 42, 4, "yes", 4, 18, 5, 5),
(0, "female", 32, 4, "yes", 2, 14, 1, 5),
(0, "male", 52, 15, "yes", 2, 14, 7, 4),
(0, "female", 22, 1.5, "no", 2, 16, 1, 4),
(0, "male", 52, 15, "yes", 4, 12, 2, 4),
(0, "female", 22, 0.417, "no", 3, 17, 1, 5),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "male", 27, 4, "yes", 4, 20, 6, 4),
(0, "female", 32, 15, "yes", 4, 14, 1, 5),
(0, "female", 27, 1.5, "no", 2, 16, 3, 5),
(0, "male", 32, 4, "no", 1, 20, 6, 5),
(0, "male", 37, 15, "yes", 3, 20, 6, 4),
(0, "female", 32, 10, "no", 2, 16, 6, 5),
(0, "female", 32, 10, "yes", 5, 14, 5, 5),
(0, "male", 37, 1.5, "yes", 4, 18, 5, 3),
(0, "male", 32, 1.5, "no", 2, 18, 4, 4),
(0, "female", 32, 10, "yes", 4, 14, 1, 4),
(0, "female", 47, 15, "yes", 4, 18, 5, 4),
(0, "female", 27, 10, "yes", 5, 12, 1, 5),
(0, "male", 27, 4, "yes", 3, 16, 4, 5),
(0, "female", 37, 15, "yes", 4, 12, 4, 2),
(0, "female", 27, 0.75, "no", 4, 16, 5, 5),
(0, "female", 37, 15, "yes", 4, 16, 1, 5),
(0, "female", 32, 15, "yes", 3, 16, 1, 5),
(0, "female", 27, 10, "yes", 2, 16, 1, 5),
(0, "male", 27, 7, "no", 2, 20, 6, 5),
(0, "female", 37, 15, "yes", 2, 14, 1, 3),
(0, "male", 27, 1.5, "yes", 2, 17, 4, 4),
(0, "female", 22, 0.75, "yes", 2, 14, 1, 5),
(0, "male", 22, 4, "yes", 4, 14, 2, 4),
(0, "male", 42, 0.125, "no", 4, 17, 6, 4),
(0, "male", 27, 1.5, "yes", 4, 18, 6, 5),
(0, "male", 27, 7, "yes", 3, 16, 6, 3),
(0, "female", 52, 15, "yes", 4, 14, 1, 3),
(0, "male", 27, 1.5, "no", 5, 20, 5, 2),
(0, "female", 27, 1.5, "no", 2, 16, 5, 5),
(0, "female", 27, 1.5, "no", 3, 17, 5, 5),
(0, "male", 22, 0.125, "no", 5, 16, 4, 4),
(0, "female", 27, 4, "yes", 4, 16, 1, 5),
(0, "female", 27, 4, "yes", 4, 12, 1, 5),
(0, "female", 47, 15, "yes", 2, 14, 5, 5),
(0, "female", 32, 15, "yes", 3, 14, 5, 3),
(0, "male", 42, 7, "yes", 2, 16, 5, 5),
(0, "male", 22, 0.75, "no", 4, 16, 6, 4),
(0, "male", 27, 0.125, "no", 3, 20, 6, 5),
(0, "male", 32, 10, "yes", 3, 20, 6, 5),
(0, "female", 22, 0.417, "no", 5, 14, 4, 5),
(0, "female", 47, 15, "yes", 5, 14, 1, 4),
(0, "female", 32, 10, "yes", 3, 14, 1, 5),
(0, "male", 57, 15, "yes", 4, 17, 5, 5),
(0, "male", 27, 4, "yes", 3, 20, 6, 5),
(0, "female", 32, 7, "yes", 4, 17, 1, 5),
(0, "female", 37, 10, "yes", 4, 16, 1, 5),
(0, "female", 32, 10, "yes", 1, 18, 1, 4),
(0, "female", 22, 4, "no", 3, 14, 1, 4),
(0, "female", 27, 7, "yes", 4, 14, 3, 2),
(0, "male", 57, 15, "yes", 5, 18, 5, 2),
(0, "male", 32, 7, "yes", 2, 18, 5, 5),
(0, "female", 27, 1.5, "no", 4, 17, 1, 3),
(0, "male", 22, 1.5, "no", 4, 14, 5, 5),
(0, "female", 22, 1.5, "yes", 4, 14, 5, 4),
(0, "female", 32, 7, "yes", 3, 16, 1, 5),
(0, "female", 47, 15, "yes", 3, 16, 5, 4),
(0, "female", 22, 0.75, "no", 3, 16, 1, 5),
(0, "female", 22, 1.5, "yes", 2, 14, 5, 5),
(0, "female", 27, 4, "yes", 1, 16, 5, 5),
(0, "male", 52, 15, "yes", 4, 16, 5, 5),
(0, "male", 32, 10, "yes", 4, 20, 6, 5),
(0, "male", 47, 15, "yes", 4, 16, 6, 4),
(0, "female", 27, 7, "yes", 2, 14, 1, 2),
(0, "female", 22, 1.5, "no", 4, 14, 4, 5),
(0, "female", 32, 10, "yes", 2, 16, 5, 4),
(0, "female", 22, 0.75, "no", 2, 16, 5, 4),
(0, "female", 22, 1.5, "no", 2, 16, 5, 5),
(0, "female", 42, 15, "yes", 3, 18, 6, 4),
(0, "female", 27, 7, "yes", 5, 14, 4, 5),
(0, "male", 42, 15, "yes", 4, 16, 4, 4),
(0, "female", 57, 15, "yes", 3, 18, 5, 2),
(0, "male", 42, 15, "yes", 3, 18, 6, 2),
(0, "female", 32, 7, "yes", 2, 14, 1, 2),
(0, "male", 22, 4, "no", 5, 12, 4, 5),
(0, "female", 22, 1.5, "no", 1, 16, 6, 5),
(0, "female", 22, 0.75, "no", 1, 14, 4, 5),
(0, "female", 32, 15, "yes", 4, 12, 1, 5),
(0, "male", 22, 1.5, "no", 2, 18, 5, 3),
(0, "male", 27, 4, "yes", 5, 17, 2, 5),
(0, "female", 27, 4, "yes", 4, 12, 1, 5),
(0, "male", 42, 15, "yes", 5, 18, 5, 4),
(0, "male", 32, 1.5, "no", 2, 20, 7, 3),
(0, "male", 57, 15, "no", 4, 9, 3, 1),
(0, "male", 37, 7, "no", 4, 18, 5, 5),
(0, "male", 52, 15, "yes", 2, 17, 5, 4),
(0, "male", 47, 15, "yes", 4, 17, 6, 5),
(0, "female", 27, 7, "no", 2, 17, 5, 4),
(0, "female", 27, 7, "yes", 4, 14, 5, 5),
(0, "female", 22, 4, "no", 2, 14, 3, 3),
(0, "male", 37, 7, "yes", 2, 20, 6, 5),
(0, "male", 27, 7, "no", 4, 12, 4, 3),
(0, "male", 42, 10, "yes", 4, 18, 6, 4),
(0, "female", 22, 1.5, "no", 3, 14, 1, 5),
(0, "female", 22, 4, "yes", 2, 14, 1, 3),
(0, "female", 57, 15, "no", 4, 20, 6, 5),
(0, "male", 37, 15, "yes", 4, 14, 4, 3),
(0, "female", 27, 7, "yes", 3, 18, 5, 5),
(0, "female", 17.5, 10, "no", 4, 14, 4, 5),
(0, "male", 22, 4, "yes", 4, 16, 5, 5),
(0, "female", 27, 4, "yes", 2, 16, 1, 4),
(0, "female", 37, 15, "yes", 2, 14, 5, 1),
(0, "female", 22, 1.5, "no", 5, 14, 1, 4),
(0, "male", 27, 7, "yes", 2, 20, 5, 4),
(0, "male", 27, 4, "yes", 4, 14, 5, 5),
(0, "male", 22, 0.125, "no", 1, 16, 3, 5),
(0, "female", 27, 7, "yes", 4, 14, 1, 4),
(0, "female", 32, 15, "yes", 5, 16, 5, 3),
(0, "male", 32, 10, "yes", 4, 18, 5, 4),
(0, "female", 32, 15, "yes", 2, 14, 3, 4),
(0, "female", 22, 1.5, "no", 3, 17, 5, 5),
(0, "male", 27, 4, "yes", 4, 17, 4, 4),
(0, "female", 52, 15, "yes", 5, 14, 1, 5),
(0, "female", 27, 7, "yes", 2, 12, 1, 2),
(0, "female", 27, 7, "yes", 3, 12, 1, 4),
(0, "female", 42, 15, "yes", 2, 14, 1, 4),
(0, "female", 42, 15, "yes", 4, 14, 5, 4),
(0, "male", 27, 7, "yes", 4, 14, 3, 3),
(0, "male", 27, 7, "yes", 2, 20, 6, 2),
(0, "female", 42, 15, "yes", 3, 12, 3, 3),
(0, "male", 27, 4, "yes", 3, 16, 3, 5),
(0, "female", 27, 7, "yes", 3, 14, 1, 4),
(0, "female", 22, 1.5, "no", 2, 14, 4, 5),
(0, "female", 27, 4, "yes", 4, 14, 1, 4),
(0, "female", 22, 4, "no", 4, 14, 5, 5),
(0, "female", 22, 1.5, "no", 2, 16, 4, 5),
(0, "male", 47, 15, "no", 4, 14, 5, 4),
(0, "male", 37, 10, "yes", 2, 18, 6, 2),
(0, "male", 37, 15, "yes", 3, 17, 5, 4),
(0, "female", 27, 4, "yes", 2, 16, 1, 4),
(3, "male", 27, 1.5, "no", 3, 18, 4, 4),
(3, "female", 27, 4, "yes", 3, 17, 1, 5),
(7, "male", 37, 15, "yes", 5, 18, 6, 2),
(12, "female", 32, 10, "yes", 3, 17, 5, 2),
(1, "male", 22, 0.125, "no", 4, 16, 5, 5),
(1, "female", 22, 1.5, "yes", 2, 14, 1, 5),
(12, "male", 37, 15, "yes", 4, 14, 5, 2),
(7, "female", 22, 1.5, "no", 2, 14, 3, 4),
(2, "male", 37, 15, "yes", 2, 18, 6, 4),
(3, "female", 32, 15, "yes", 4, 12, 3, 2),
(1, "female", 37, 15, "yes", 4, 14, 4, 2),
(7, "female", 42, 15, "yes", 3, 17, 1, 4),
(12, "female", 42, 15, "yes", 5, 9, 4, 1),
(12, "male", 37, 10, "yes", 2, 20, 6, 2),
(12, "female", 32, 15, "yes", 3, 14, 1, 2),
(3, "male", 27, 4, "no", 1, 18, 6, 5),
(7, "male", 37, 10, "yes", 2, 18, 7, 3),
(7, "female", 27, 4, "no", 3, 17, 5, 5),
(1, "male", 42, 15, "yes", 4, 16, 5, 5),
(1, "female", 47, 15, "yes", 5, 14, 4, 5),
(7, "female", 27, 4, "yes", 3, 18, 5, 4),
(1, "female", 27, 7, "yes", 5, 14, 1, 4),
(12, "male", 27, 1.5, "yes", 3, 17, 5, 4),
(12, "female", 27, 7, "yes", 4, 14, 6, 2),
(3, "female", 42, 15, "yes", 4, 16, 5, 4),
(7, "female", 27, 10, "yes", 4, 12, 7, 3),
(1, "male", 27, 1.5, "no", 2, 18, 5, 2),
(1, "male", 32, 4, "no", 4, 20, 6, 4),
(1, "female", 27, 7, "yes", 3, 14, 1, 3),
(3, "female", 32, 10, "yes", 4, 14, 1, 4),
(3, "male", 27, 4, "yes", 2, 18, 7, 2),
(1, "female", 17.5, 0.75, "no", 5, 14, 4, 5),
(1, "female", 32, 10, "yes", 4, 18, 1, 5),
(7, "female", 32, 7, "yes", 2, 17, 6, 4),
(7, "male", 37, 15, "yes", 2, 20, 6, 4),
(7, "female", 37, 10, "no", 1, 20, 5, 3),
(12, "female", 32, 10, "yes", 2, 16, 5, 5),
(7, "male", 52, 15, "yes", 2, 20, 6, 4),
(7, "female", 42, 15, "yes", 1, 12, 1, 3),
(1, "male", 52, 15, "yes", 2, 20, 6, 3),
(2, "male", 37, 15, "yes", 3, 18, 6, 5),
(12, "female", 22, 4, "no", 3, 12, 3, 4),
(12, "male", 27, 7, "yes", 1, 18, 6, 2),
(1, "male", 27, 4, "yes", 3, 18, 5, 5),
(12, "male", 47, 15, "yes", 4, 17, 6, 5),
(12, "female", 42, 15, "yes", 4, 12, 1, 1),
(7, "male", 27, 4, "no", 3, 14, 3, 4),
(7, "female", 32, 7, "yes", 4, 18, 4, 5),
(1, "male", 32, 0.417, "yes", 3, 12, 3, 4),
(3, "male", 47, 15, "yes", 5, 16, 5, 4),
(12, "male", 37, 15, "yes", 2, 20, 5, 4),
(7, "male", 22, 4, "yes", 2, 17, 6, 4),
(1, "male", 27, 4, "no", 2, 14, 4, 5),
(7, "female", 52, 15, "yes", 5, 16, 1, 3),
(1, "male", 27, 4, "no", 3, 14, 3, 3),
(1, "female", 27, 10, "yes", 4, 16, 1, 4),
(1, "male", 32, 7, "yes", 3, 14, 7, 4),
(7, "male", 32, 7, "yes", 2, 18, 4, 1),
(3, "male", 22, 1.5, "no", 1, 14, 3, 2),
(7, "male", 22, 4, "yes", 3, 18, 6, 4),
(7, "male", 42, 15, "yes", 4, 20, 6, 4),
(2, "female", 57, 15, "yes", 1, 18, 5, 4),
(7, "female", 32, 4, "yes", 3, 18, 5, 2),
(1, "male", 27, 4, "yes", 1, 16, 4, 4),
(7, "male", 32, 7, "yes", 4, 16, 1, 4),
(2, "male", 57, 15, "yes", 1, 17, 4, 4),
(7, "female", 42, 15, "yes", 4, 14, 5, 2),
(7, "male", 37, 10, "yes", 1, 18, 5, 3),
(3, "male", 42, 15, "yes", 3, 17, 6, 1),
(1, "female", 52, 15, "yes", 3, 14, 4, 4),
(2, "female", 27, 7, "yes", 3, 17, 5, 3),
(12, "male", 32, 7, "yes", 2, 12, 4, 2),
(1, "male", 22, 4, "no", 4, 14, 2, 5),
(3, "male", 27, 7, "yes", 3, 18, 6, 4),
(12, "female", 37, 15, "yes", 1, 18, 5, 5),
(7, "female", 32, 15, "yes", 3, 17, 1, 3),
(7, "female", 27, 7, "no", 2, 17, 5, 5),
(1, "female", 32, 7, "yes", 3, 17, 5, 3),
(1, "male", 32, 1.5, "yes", 2, 14, 2, 4),
(12, "female", 42, 15, "yes", 4, 14, 1, 2),
(7, "male", 32, 10, "yes", 3, 14, 5, 4),
(7, "male", 37, 4, "yes", 1, 20, 6, 3),
(1, "female", 27, 4, "yes", 2, 16, 5, 3),
(12, "female", 42, 15, "yes", 3, 14, 4, 3),
(1, "male", 27, 10, "yes", 5, 20, 6, 5),
(12, "male", 37, 10, "yes", 2, 20, 6, 2),
(12, "female", 27, 7, "yes", 1, 14, 3, 3),
(3, "female", 27, 7, "yes", 4, 12, 1, 2),
(3, "male", 32, 10, "yes", 2, 14, 4, 4),
(12, "female", 17.5, 0.75, "yes", 2, 12, 1, 3),
(12, "female", 32, 15, "yes", 3, 18, 5, 4),
(2, "female", 22, 7, "no", 4, 14, 4, 3),
(1, "male", 32, 7, "yes", 4, 20, 6, 5),
(7, "male", 27, 4, "yes", 2, 18, 6, 2),
(1, "female", 22, 1.5, "yes", 5, 14, 5, 3),
(12, "female", 32, 15, "no", 3, 17, 5, 1),
(12, "female", 42, 15, "yes", 2, 12, 1, 2),
(7, "male", 42, 15, "yes", 3, 20, 5, 4),
(12, "male", 32, 10, "no", 2, 18, 4, 2),
(12, "female", 32, 15, "yes", 3, 9, 1, 1),
(7, "male", 57, 15, "yes", 5, 20, 4, 5),
(12, "male", 47, 15, "yes", 4, 20, 6, 4),
(2, "female", 42, 15, "yes", 2, 17, 6, 3),
(12, "male", 37, 15, "yes", 3, 17, 6, 3),
(12, "male", 37, 15, "yes", 5, 17, 5, 2),
(7, "male", 27, 10, "yes", 2, 20, 6, 4),
(2, "male", 37, 15, "yes", 2, 16, 5, 4),
(12, "female", 32, 15, "yes", 1, 14, 5, 2),
(7, "male", 32, 10, "yes", 3, 17, 6, 3),
(2, "male", 37, 15, "yes", 4, 18, 5, 1),
(7, "female", 27, 1.5, "no", 2, 17, 5, 5),
(3, "female", 47, 15, "yes", 2, 17, 5, 2),
(12, "male", 37, 15, "yes", 2, 17, 5, 4),
(12, "female", 27, 4, "no", 2, 14, 5, 5),
(2, "female", 27, 10, "yes", 4, 14, 1, 5),
(1, "female", 22, 4, "yes", 3, 16, 1, 3),
(12, "male", 52, 7, "no", 4, 16, 5, 5),
(2, "female", 27, 4, "yes", 1, 16, 3, 5),
(7, "female", 37, 15, "yes", 2, 17, 6, 4),
(2, "female", 27, 4, "no", 1, 17, 3, 1),
(12, "female", 17.5, 0.75, "yes", 2, 12, 3, 5),
(7, "female", 32, 15, "yes", 5, 18, 5, 4),
(7, "female", 22, 4, "no", 1, 16, 3, 5),
(2, "male", 32, 4, "yes", 4, 18, 6, 4),
(1, "female", 22, 1.5, "yes", 3, 18, 5, 2),
(3, "female", 42, 15, "yes", 2, 17, 5, 4),
(1, "male", 32, 7, "yes", 4, 16, 4, 4),
(12, "male", 37, 15, "no", 3, 14, 6, 2),
(1, "male", 42, 15, "yes", 3, 16, 6, 3),
(1, "male", 27, 4, "yes", 1, 18, 5, 4),
(2, "male", 37, 15, "yes", 4, 20, 7, 3),
(7, "male", 37, 15, "yes", 3, 20, 6, 4),
(3, "male", 22, 1.5, "no", 2, 12, 3, 3),
(3, "male", 32, 4, "yes", 3, 20, 6, 2),
(2, "male", 32, 15, "yes", 5, 20, 6, 5),
(12, "female", 52, 15, "yes", 1, 18, 5, 5),
(12, "male", 47, 15, "no", 1, 18, 6, 5),
(3, "female", 32, 15, "yes", 4, 16, 4, 4),
(7, "female", 32, 15, "yes", 3, 14, 3, 2),
(7, "female", 27, 7, "yes", 4, 16, 1, 2),
(12, "male", 42, 15, "yes", 3, 18, 6, 2),
(7, "female", 42, 15, "yes", 2, 14, 3, 2),
(12, "male", 27, 7, "yes", 2, 17, 5, 4),
(3, "male", 32, 10, "yes", 4, 14, 4, 3),
(7, "male", 47, 15, "yes", 3, 16, 4, 2),
(1, "male", 22, 1.5, "yes", 1, 12, 2, 5),
(7, "female", 32, 10, "yes", 2, 18, 5, 4),
(2, "male", 32, 10, "yes", 2, 17, 6, 5),
(2, "male", 22, 7, "yes", 3, 18, 6, 2),
(1, "female", 32, 15, "yes", 3, 14, 1, 5))
val data = dataList.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
GBT建模
data.createOrReplaceTempView("data")
// 字符類型轉換成數值
val labelWhere = "affairs as label"
val genderWhere = "case when gender='female' then 0 else cast(1 as double) end as gender"
val childrenWhere = "case when children='no' then 0 else cast(1 as double) end as children"
val dataLabelDF = spark.sql(s"select $labelWhere, $genderWhere,age,yearsmarried,$childrenWhere,religiousness,education,occupation,rating from data")
val featuresArray = Array("gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
// 字段轉換成特征向量
val assembler = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
val vecDF: DataFrame = assembler.transform(dataLabelDF)
vecDF.show(10, truncate = false)
// 將數據分為訓練和測試集(30%進行測試)
val Array(trainingDF, testDF) = vecDF.randomSplit(Array(0.7, 0.3))
// 自動識別分類的特征,並對它們進行索引
// 具有大於5個不同的值的特征被視為連續。
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(5)
// 訓練GBT模型
val gbt = new GBTRegressor().setLabelCol("label").setFeaturesCol("indexedFeatures").setImpurity("variance").setLossType("squared").setMaxIter(100).setMinInstancesPerNode(100)
// Chain indexer and GBT in a Pipeline.
val pipeline = new Pipeline().setStages(Array(featureIndexer, gbt))
// Train model. This also runs the indexer.
val model = pipeline.fit(trainingDF)
// 做出預測
val predictions = model.transform(testDF)
// 預測樣本展示
predictions.select("prediction", "label", "features").show(20,false)
// 選擇(預測標簽,實際標簽),並計算測試誤差。
val evaluator = new RegressionEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("rmse")
val rmse = evaluator.evaluate(predictions)
println("Root Mean Squared Error (RMSE) on test data = " + rmse)
val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
println("Learned regression GBT model:\n" + gbtModel.toDebugString)
