1. take
Get the first n records.
scala> val t = sc.parallelize(1 to 10)
t: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:27

scala> t.take(3)
res0: Array[Int] = Array(1, 2, 3)
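Note that take also exists on plain Scala collections, not just RDDs; on an RDD it is an action that pulls the results back to the driver as a local Array. A minimal local sketch:

scala> Seq(1, 2, 3, 6, 4).take(3)
res1: Seq[Int] = List(1, 2, 3)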
2. reverse
Reverse a list.
scala> val a = Seq(1,2,3,6,4)
a: Seq[Int] = List(1, 2, 3, 6, 4)

scala> a.reverse
res6: Seq[Int] = List(4, 6, 3, 2, 1)
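reverse is immutable: it returns a new list and leaves a unchanged. Strings support it too, via StringOps (a quick sketch):

scala> "spark".reverse
res8: String = kraps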
3. tail
Get the list with its first element removed.
scala> a.tail
res7: Seq[Int] = List(2, 3, 6, 4)
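tail throws on an empty list; when the input may be empty, drop(1) is a safe equivalent (a minimal sketch):

scala> Seq.empty[Int].tail
java.lang.UnsupportedOperationException: tail of empty list
  ...

scala> Seq.empty[Int].drop(1)
res8: Seq[Int] = List()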
4. filter
Keep only the elements that satisfy a predicate. In the example below, flatMap first maps each element f to Some(f/(f-1)), turning the division by zero at f = 1 into None, and filter then keeps only quotients greater than 1.
scala> val a = Seq(1,2,3,6,4)
a: Seq[Int] = List(1, 2, 3, 6, 4)

scala> val d = a.flatMap(f=>{
     |   try{
     |     Some(f/(f-1))
     |   }catch{
     |     case e:Exception=>None
     |   }
     | }).filter(_>1)
d: Seq[Int] = List(2)
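The same division-by-zero guard can be written more compactly with scala.util.Try, whose toOption converts a failure into None; this sketch is equivalent to the flatMap above:

scala> import scala.util.Try
import scala.util.Try

scala> val d = a.flatMap(f => Try(f / (f - 1)).toOption).filter(_ > 1)
d: Seq[Int] = List(2)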
5. init
Remove the last element.
scala> val t = sc.parallelize(1 to 10)
t: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[380] at parallelize at <console>:40

scala> val c = t.take(5)
c: Array[Int] = Array(1, 2, 3, 4, 5)

scala> val d = c.init
d: Array[Int] = Array(1, 2, 3, 4)
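Note that init here runs on the local Array returned by take, not on the RDD itself. Like tail, init throws on an empty collection; dropRight(1) is a safe alternative (a minimal sketch):

scala> c.dropRight(1)
res56: Array[Int] = Array(1, 2, 3, 4)

scala> Array.empty[Int].dropRight(1)
res57: Array[Int] = Array()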
6. last
Get the last element.
scala> val t = sc.parallelize(1 to 10).take(3)
t: Array[Int] = Array(1, 2, 3)

scala> t.last
res55: Int = 3
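If the array could be empty, lastOption returns an Option instead of throwing a NoSuchElementException (a quick sketch):

scala> t.lastOption
res56: Option[Int] = Some(3)

scala> Array.empty[Int].lastOption
res57: Option[Int] = None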
7. randomSplit
Randomly split a dataset by weights, e.g. into training, cross-validation, and test sets.
scala> c.take(10)
res64: Array[Array[Double]] = Array(Array(9.0, 21.0, 3.0, 4.0, 5.0, 1.0), Array(21.0, 3.0, 4.0, 21.0, 5.0, 2.0), Array(6.0, 12.0, 45.0, 32.0, 32.0, 3.0))

scala> val Array(trainData, cvData, testData) = c.randomSplit(Array(0.8, 0.1, 0.1))
trainData: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[391] at randomSplit at <console>:50
cvData: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[392] at randomSplit at <console>:50
testData: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[393] at randomSplit at <console>:50
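randomSplit also takes an optional seed, which makes the split reproducible across runs; the weights are normalized if they do not sum to 1. A sketch reusing the same RDD c:

scala> val Array(trainData, cvData, testData) = c.randomSplit(Array(0.8, 0.1, 0.1), seed = 42L)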
8. yield
Generate a collection with a for-comprehension.
scala> for(i <- 1 to 10) yield i%3
res7: scala.collection.immutable.IndexedSeq[Int] = Vector(1, 2, 0, 1, 2, 0, 1, 2, 0, 1)
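A for-comprehension can also carry a guard, which filters before yielding (a minimal sketch):

scala> for(i <- 1 to 10 if i % 3 == 0) yield i
res8: scala.collection.immutable.IndexedSeq[Int] = Vector(3, 6, 9)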