SparkSQL 内置函数:groupBy() 与 agg() 使用示例


[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
         
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
         
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]


scala> val df = Seq(
     |     ("01", "Jack", "08012345566", "28","SALES", "1000", 1),
     |     ("02", "Tom",  "08056586761", "19","MANAGEMENT", "2500", 1),
     |     ("03", "Mike", "08009097878", "25","MARKET", "2000", 1),
     |     ("04", "Tina", "07099661234", "30","LOGISTICS", "3000", 0),
     |     ("05", "Alex", "08019208960", "18","MARKET", "3500", 1),
     |     ("06", "Bob", "08011223344", "22","CLERK", "1500", 1),
     |     ("07", "Dvaid", "08022557788", "25","CLERK", "2500", 1),
     |     ("08", "Ben", "08080201682", "35","MARKET", "500", 1),
     |     ("09", "Allen", "08099206680", "20","MARKET", "2500", 1),
     |     ("10", "Caesar", "09011020806", "32","SALES", "1000", 1)).toDF("id", "name", "cellphone", "age", "department", "expense", "gender")
df: org.apache.spark.sql.DataFrame = [id: string, name: string ... 5 more fields]

scala> df.show
+---+------+-----------+---+----------+-------+------+
| id|  name|  cellphone|age|department|expense|gender|
+---+------+-----------+---+----------+-------+------+
| 01|  Jack|08012345566| 28|     SALES|   1000|     1|
| 02|   Tom|08056586761| 19|MANAGEMENT|   2500|     1|
| 03|  Mike|08009097878| 25|    MARKET|   2000|     1|
| 04|  Tina|07099661234| 30| LOGISTICS|   3000|     0|
| 05|  Alex|08019208960| 18|    MARKET|   3500|     1|
| 06|   Bob|08011223344| 22|     CLERK|   1500|     1|
| 07| Dvaid|08022557788| 25|     CLERK|   2500|     1|
| 08|   Ben|08080201682| 35|    MARKET|    500|     1|
| 09| Allen|08099206680| 20|    MARKET|   2500|     1|
| 10|Caesar|09011020806| 32|     SALES|   1000|     1|
+---+------+-----------+---+----------+-------+------+

scala> df.groupBy("department").agg(max("expense"), sum("expense")).show
+----------+------------+------------+
|department|max(expense)|sum(expense)|
+----------+------------+------------+
|     CLERK|        2500|      4000.0|
|     SALES|        1000|      2000.0|
|    MARKET|         500|      8500.0|
| LOGISTICS|        3000|      3000.0|
|MANAGEMENT|        2500|      2500.0|
+----------+------------+------------+

注意:expense 列是字符串类型,max("expense") 按字典序比较字符串("500" > "3500"),因此 MARKET 的 max 结果是 500;而 sum() 会把字符串隐式转换为 double 再求和,所以结果带小数(如 8500.0)。若需按数值比较,应先将 expense 转换为数值类型。
scala> df.filter($"cellphone".contains("080")).show
+---+------+-----------+---+----------+-------+------+
| id|  name|  cellphone|age|department|expense|gender|
+---+------+-----------+---+----------+-------+------+
| 01|  Jack|08012345566| 28|     SALES|   1000|     1|
| 02|   Tom|08056586761| 19|MANAGEMENT|   2500|     1|
| 03|  Mike|08009097878| 25|    MARKET|   2000|     1|
| 05|  Alex|08019208960| 18|    MARKET|   3500|     1|
| 06|   Bob|08011223344| 22|     CLERK|   1500|     1|
| 07| Dvaid|08022557788| 25|     CLERK|   2500|     1|
| 08|   Ben|08080201682| 35|    MARKET|    500|     1|
| 09| Allen|08099206680| 20|    MARKET|   2500|     1|
| 10|Caesar|09011020806| 32|     SALES|   1000|     1|
+---+------+-----------+---+----------+-------+------+

scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(false)
+----------+------------+
|department|sum(expense)|
+----------+------------+
|CLERK     |4000.0      |
|MANAGEMENT|2500.0      |
|MARKET    |8500.0      |
|SALES     |2000.0      |
+----------+------------+

scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(1,false)
+----------+------------+
|department|sum(expense)|
+----------+------------+
|CLERK     |4000.0      |
+----------+------------+
only showing top 1 row

scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(1,true)
+----------+------------+
|department|sum(expense)|
+----------+------------+
|     CLERK|      4000.0|
+----------+------------+
only showing top 1 row

说明:show(numRows, truncate) 的第一个参数控制显示的行数;truncate 为 true(默认)时列值右对齐且超过 20 个字符会被截断,为 false 时完整显示并左对齐(对比上面 show(false) 与 show(1,true) 的输出即可看出差异)。



免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM