[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/ [root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode [root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode [root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/ [root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore & [root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/ [root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh [root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh [root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2] scala> val df = Seq( | ("01", "Jack", "08012345566", "28","SALES", "1000", 1), | ("02", "Tom", "08056586761", "19","MANAGEMENT", "2500", 1), | ("03", "Mike", "08009097878", "25","MARKET", "2000", 1), | ("04", "Tina", "07099661234", "30","LOGISTICS", "3000", 0), | ("05", "Alex", "08019208960", "18","MARKET", "3500", 1), | ("06", "Bob", "08011223344", "22","CLERK", "1500", 1), | ("07", "David", "08022557788", "25","CLERK", "2500", 1), | ("08", "Ben", "08080201682", "35","MARKET", "500", 1), | ("09", "Allen", "08099206680", "20","MARKET", "2500", 1), | ("10", "Caesar", "09011020806", "32","SALES", "1000", 1)).toDF("id", "name", "cellphone", "age", "department", "expense", "gender") df: org.apache.spark.sql.DataFrame = [id: string, name: string ... 
5 more fields] scala> df.show +---+------+-----------+---+----------+-------+------+ | id| name| cellphone|age|department|expense|gender| +---+------+-----------+---+----------+-------+------+ | 01| Jack|08012345566| 28| SALES| 1000| 1| | 02| Tom|08056586761| 19|MANAGEMENT| 2500| 1| | 03| Mike|08009097878| 25| MARKET| 2000| 1| | 04| Tina|07099661234| 30| LOGISTICS| 3000| 0| | 05| Alex|08019208960| 18| MARKET| 3500| 1| | 06| Bob|08011223344| 22| CLERK| 1500| 1| | 07| David|08022557788| 25| CLERK| 2500| 1| | 08| Ben|08080201682| 35| MARKET| 500| 1| | 09| Allen|08099206680| 20| MARKET| 2500| 1| | 10|Caesar|09011020806| 32| SALES| 1000| 1| +---+------+-----------+---+----------+-------+------+ scala> df.groupBy("department").agg(max("expense"), sum("expense")).show +----------+------------+------------+ |department|max(expense)|sum(expense)| +----------+------------+------------+ | CLERK| 2500| 4000.0| | SALES| 1000| 2000.0| | MARKET| 500| 8500.0| | LOGISTICS| 3000| 3000.0| |MANAGEMENT| 2500| 2500.0| +----------+------------+------------+ scala> df.filter($"cellphone".contains("080")).show +---+------+-----------+---+----------+-------+------+ | id| name| cellphone|age|department|expense|gender| +---+------+-----------+---+----------+-------+------+ | 01| Jack|08012345566| 28| SALES| 1000| 1| | 02| Tom|08056586761| 19|MANAGEMENT| 2500| 1| | 03| Mike|08009097878| 25| MARKET| 2000| 1| | 05| Alex|08019208960| 18| MARKET| 3500| 1| | 06| Bob|08011223344| 22| CLERK| 1500| 1| | 07| David|08022557788| 25| CLERK| 2500| 1| | 08| Ben|08080201682| 35| MARKET| 500| 1| | 09| Allen|08099206680| 20| MARKET| 2500| 1| | 10|Caesar|09011020806| 32| SALES| 1000| 1| +---+------+-----------+---+----------+-------+------+ scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(false) +----------+------------+ |department|sum(expense)| +----------+------------+ |CLERK |4000.0 | |MANAGEMENT|2500.0 | |MARKET |8500.0 | |SALES |2000.0 
| +----------+------------+ scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(1,false) +----------+------------+ |department|sum(expense)| +----------+------------+ |CLERK |4000.0 | +----------+------------+ only showing top 1 row scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(1,true) +----------+------------+ |department|sum(expense)| +----------+------------+ | CLERK| 4000.0| +----------+------------+ only showing top 1 row