Practical Naive Bayes Modeling
The modeling process mainly consists of converting the text into vectors and then running the analysis on those vectors.
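To make the text-to-vector step concrete, here is a minimal sketch that turns two short, pre-segmented strings into term-frequency vectors with HashingTF. The sample strings, app name, and local master setting are illustrative assumptions; the full training code appears further down.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

object VectorizeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("vectorizeDemo").master("local[2]").getOrCreate()
    import spark.implicits._

    // Two already-segmented "articles" with labels, words separated by spaces
    val df = Seq((0, "善良 美麗"), (1, "丑陋 陰險 卑鄙")).toDF("category", "text")

    // Split on whitespace, then hash each word into a fixed-size term-frequency vector
    val words = new Tokenizer().setInputCol("text").setOutputCol("words").transform(df)
    val tf = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    tf.transform(words).select("category", "rawFeatures").show(false)

    spark.stop()
  }
}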
Data format (one record per line):

0,善良 美麗
1,丑陋 陰險 卑鄙
0,溫和
...
Note: the number at the front of each line is the label assigned to the article, and what follows is the article's segmented words. For word segmentation you can look up articles on that topic; I will also write about segmentation later.
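Assuming the articles have already been segmented into word lists and labeled, a minimal sketch for producing a file in the format above could look like this; the sample articles and the output file name training_data.txt are hypothetical.

import java.io.PrintWriter

object WriteTrainingFile {
  def main(args: Array[String]): Unit = {
    // Hypothetical, already-segmented articles with their labels
    val samples = Seq(
      (0, Seq("善良", "美麗")),
      (1, Seq("丑陋", "陰險", "卑鄙"))
    )

    // One record per line: "<label>,<word> <word> ..."
    val out = new PrintWriter("training_data.txt")
    samples.foreach { case (label, words) => out.println(s"$label,${words.mkString(" ")}") }
    out.close()
  }
}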
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

class CreatModel {
}

object CreatModel {
  case class RawDataRecord(category: String, text: String)

  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("createModel").setMaster("local[4]")
    val sc = new SparkContext(config)
    val spark = SparkSession.builder()
      .config(config)
      .config("spark.sql.warehouse.dir", "warehouse/dir")
      .getOrCreate()
    import spark.implicits._

    // Split the data: read "label,segmented text" lines and randomly split them
    // 70/30 into a training set and a test set
    val Array(srcDF, testDF) = sc.textFile("D:\\decstop\\testFiles\\sougou").map { x =>
      val data = x.split(",")
      RawDataRecord(data(0), data(1))
    }.toDF().randomSplit(Array(0.7, 0.3))

    // Tokenize: split the already-segmented text on whitespace into a word array
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val wordsData = tokenizer.transform(srcDF)
    wordsData.show(false)
    val testtokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val testwordsData = testtokenizer.transform(testDF)

    // Term frequency: hash each word into a 100-dimensional count vector
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    val featurizedData = hashingTF.transform(wordsData)
    val testhashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    val testfeaturizedData = testhashingTF.transform(testwordsData)

    // Inverse document frequency: rescale the raw term frequencies into TF-IDF weights
    // (note: the IDF model is fit separately on the test split here; reusing the
    // training idfModel to transform the test split is the more common practice)
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    val testidf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val testidfModel = testidf.fit(testfeaturizedData)
    val testrescaledData = testidfModel.transform(testfeaturizedData)
    rescaledData.show(false)

    // Convert to the input format expected by Naive Bayes: (label, features)
    val trainDataRdd = rescaledData.select($"category", $"features").map {
      case Row(label: String, features: Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }
    val testtrainDataRdd = testrescaledData.select($"category", $"features").map {
      case Row(label: String, features: Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    // Train the model, predict on the test set, and persist the model to disk
    val model = new NaiveBayes().fit(trainDataRdd)
    val predictions = model.transform(testtrainDataRdd)
    println("prediction output:")
    predictions.show()
    model.write.overwrite().save("resoult")

    // Model evaluation: multiclass accuracy on the test predictions
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("accuracy output:")
    println("Accuracy: " + accuracy)
  }
}
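Once the model has been saved, it can be loaded back and applied to new data. Below is a minimal sketch assuming the save path "resoult" from the code above; the hand-built sparse vector only stands in for features that would normally come from the same Tokenizer, HashingTF, and IDF steps used at training time.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.Vectors

object LoadModelDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("loadModel").master("local[2]").getOrCreate()
    import spark.implicits._

    // Load the model persisted by model.write.overwrite().save("resoult") above
    val model = NaiveBayesModel.load("resoult")

    // In real use, the "features" column must be produced by the same
    // Tokenizer -> HashingTF -> IDF steps as at training time; here a
    // hand-built 100-dimensional sparse vector stands in for that output.
    val newData = Seq(Tuple1(Vectors.sparse(100, Seq((3, 1.2), (17, 0.8))))).toDF("features")

    model.transform(newData).select("features", "prediction").show(false)

    spark.stop()
  }
}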