(1) Write a standalone application that implements data deduplication
package my.scala

import org.apache.spark.{SparkConf, SparkContext}

object case2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("reduce")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    // Read the input data
    val two = sc.textFile("file:///usr/local/spark/text_4/sec")
    two.filter(_.trim().length > 0)    // keep non-blank lines; trim() strips leading/trailing whitespace
      .map(line => (line.trim, ""))    // use the whole line as the key, paired with an empty value
      .groupByKey()                    // identical keys are merged, so duplicate lines collapse into one
      .sortByKey()                     // sort by key in natural order
      .keys.collect().foreach(println) // collect() returns the results to the driver as an array
  }
}
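For comparison, the same deduplication can be expressed more directly with Spark's built-in distinct() transformation. A minimal sketch, assuming the same input path as above (the object name distinctDedup is hypothetical):

package my.scala

import org.apache.spark.{SparkConf, SparkContext}

object distinctDedup {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("distinctDedup")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    sc.textFile("file:///usr/local/spark/text_4/sec") // same input path assumed
      .filter(_.trim.nonEmpty) // drop blank lines
      .distinct()              // one shuffle removes all duplicate lines
      .sortBy(x => x)          // natural ordering, matching sortByKey() above
      .collect()
      .foreach(println)
  }
}

Because distinct() pre-aggregates duplicates on each partition before shuffling, it typically moves less data than the map/groupByKey/keys chain above.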
(2) Write a standalone application that computes per-key averages
package my.scala

import org.apache.spark.{SparkConf, SparkContext}

object pingjunzhi {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("reduce")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    val fourth = sc.textFile("file:///usr/local/spark/text_4/thi")
    fourth.filter(_.trim().length > 0)
      .map(line => (line.split("\t")(0).trim(), line.split("\t")(1).trim().toInt)) // tab-separated (name, score) pairs
      .groupByKey()                        // gather every score for each name
      .map(x => {
        var num = 0.0
        var sum = 0
        for (i <- x._2) {                  // accumulate the total and the count
          sum = sum + i
          num = num + 1
        }
        val avg = sum / num                // mean of the scores
        val format = f"$avg%1.2f".toDouble // round to two decimal places
        (x._1, format)
      })
      .collect()
      .foreach(x => println(x._1 + "\t" + x._2))
  }
}
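Note that groupByKey() ships every individual score across the network before the average is computed. A sketch of an alternative, under the same input assumptions (the object name avgByReduce is hypothetical), keeps running (sum, count) pairs with reduceByKey() so that only per-partition partial aggregates are shuffled:

package my.scala

import org.apache.spark.{SparkConf, SparkContext}

object avgByReduce {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("avgByReduce")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    sc.textFile("file:///usr/local/spark/text_4/thi") // same input path assumed
      .filter(_.trim.nonEmpty)
      .map { line =>
        val fields = line.split("\t")
        (fields(0).trim, (fields(1).trim.toInt, 1))   // (name, (score, count))
      }
      .reduceByKey { case ((s1, c1), (s2, c2)) => (s1 + s2, c1 + c2) } // partial sums combine locally before the shuffle
      .mapValues { case (sum, count) => f"${sum.toDouble / count}%1.2f" } // format the average to two decimals
      .collect()
      .foreach { case (name, avg) => println(name + "\t" + avg) }
  }
}

The output format matches the groupByKey version; only the aggregation strategy differs.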