scala> import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession scala> val spark=SparkSession.builder().getOrCreate() spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2bdab835 // enable implicit conversions from RDDs to DataFrames and subsequent SQL operations scala> import spark.implicits._ import spark.implicits._ scala> val df = spark.read.json("file:///usr/local/spark/examples/src/main/resources/people.json") df: org.apache.spark.sql.DataFrame = [age: bigint, name: string] scala> df.show() +----+-------+ | age| name| +----+-------+ |null|Michael| | 30| Andy| | 19| Justin| +----+-------+ // print schema information scala> df.printSchema() root |-- age: long (nullable = true) |-- name: string (nullable = true) // select multiple columns scala> df.select(df("name"),df("age")+1).show() +-------+---------+ | name|(age + 1)| +-------+---------+ |Michael| null| | Andy| 31| | Justin| 20| +-------+---------+ // filter rows by condition scala> df.filter(df("age") > 20 ).show() +---+----+ |age|name| +---+----+ | 30|Andy| +---+----+ // group-by aggregation scala> df.groupBy("age").count().show() +----+-----+ | age|count| +----+-----+ | 19| 1| |null| 1| | 30| 1| +----+-----+ // sort (descending) scala> df.sort(df("age").desc).show() +----+-------+ | age| name| +----+-------+ | 30| Andy| | 19| Justin| |null|Michael| +----+-------+ // sort by multiple columns scala> df.sort(df("age").desc, df("name").asc).show() +----+-------+ | age| name| +----+-------+ | 30| Andy| | 19| Justin| |null|Michael| +----+-------+ // rename a column scala> df.select(df("name").as("username"),df("age")).show() +--------+----+ |username| age| +--------+----+ | Michael|null| | Andy| 30| | Justin| 19| +--------+----+ // run a Spark SQL statement against a registered temporary view scala>df.createTempView("table1") scala> spark.sql("select * from table1 limit 10") // NOTE(review): spark.sql(...) only returns a DataFrame; append .show() to actually display the query results
以上是我們常用的dataframe的基礎操作
具體見以下博客
https://blog.csdn.net/dabokele/article/details/52802150
SparkSQL官網
https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Dataset.html （上面示例使用的 SparkSession 僅在 Spark 2.0+ 提供，1.6.2 版文檔與本文不對應；2.x 起 DataFrame 即 Dataset[Row]）