[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]
scala> val df = Seq(
| ("01", "Jack", "08012345566", "28","SALES", "1000", 1),
| ("02", "Tom", "08056586761", "19","MANAGEMENT", "2500", 1),
| ("03", "Mike", "08009097878", "25","MARKET", "2000", 1),
| ("04", "Tina", "07099661234", "30","LOGISTICS", "3000", 0),
| ("05", "Alex", "08019208960", "18","MARKET", "3500", 1),
| ("06", "Bob", "08011223344", "22","CLERK", "1500", 1),
| ("07", "David", "08022557788", "25","CLERK", "2500", 1),
| ("08", "Ben", "08080201682", "35","MARKET", "500", 1),
| ("09", "Allen", "08099206680", "20","MARKET", "2500", 1),
| ("10", "Caesar", "09011020806", "32","SALES", "1000", 1)).toDF("id", "name", "cellphone", "age", "department", "expense", "gender")
df: org.apache.spark.sql.DataFrame = [id: string, name: string ... 5 more fields]
scala> df.show
+---+------+-----------+---+----------+-------+------+
| id| name| cellphone|age|department|expense|gender|
+---+------+-----------+---+----------+-------+------+
| 01| Jack|08012345566| 28| SALES| 1000| 1|
| 02| Tom|08056586761| 19|MANAGEMENT| 2500| 1|
| 03| Mike|08009097878| 25| MARKET| 2000| 1|
| 04| Tina|07099661234| 30| LOGISTICS| 3000| 0|
| 05| Alex|08019208960| 18| MARKET| 3500| 1|
| 06| Bob|08011223344| 22| CLERK| 1500| 1|
| 07| David|08022557788| 25| CLERK| 2500| 1|
| 08| Ben|08080201682| 35| MARKET| 500| 1|
| 09| Allen|08099206680| 20| MARKET| 2500| 1|
| 10|Caesar|09011020806| 32| SALES| 1000| 1|
+---+------+-----------+---+----------+-------+------+
scala> df.groupBy("department").agg(max("expense"), sum("expense")).show
+----------+------------+------------+
|department|max(expense)|sum(expense)|
+----------+------------+------------+
| CLERK| 2500| 4000.0|
| SALES| 1000| 2000.0|
| MARKET| 500| 8500.0|
| LOGISTICS| 3000| 3000.0|
|MANAGEMENT| 2500| 2500.0|
+----------+------------+------------+
scala> df.filter($"cellphone".contains("080")).show
+---+------+-----------+---+----------+-------+------+
| id| name| cellphone|age|department|expense|gender|
+---+------+-----------+---+----------+-------+------+
| 01| Jack|08012345566| 28| SALES| 1000| 1|
| 02| Tom|08056586761| 19|MANAGEMENT| 2500| 1|
| 03| Mike|08009097878| 25| MARKET| 2000| 1|
| 05| Alex|08019208960| 18| MARKET| 3500| 1|
| 06| Bob|08011223344| 22| CLERK| 1500| 1|
| 07| David|08022557788| 25| CLERK| 2500| 1|
| 08| Ben|08080201682| 35| MARKET| 500| 1|
| 09| Allen|08099206680| 20| MARKET| 2500| 1|
| 10|Caesar|09011020806| 32| SALES| 1000| 1|
+---+------+-----------+---+----------+-------+------+
scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(false)
+----------+------------+
|department|sum(expense)|
+----------+------------+
|CLERK |4000.0 |
|MANAGEMENT|2500.0 |
|MARKET |8500.0 |
|SALES |2000.0 |
+----------+------------+
scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(1,false)
+----------+------------+
|department|sum(expense)|
+----------+------------+
|CLERK |4000.0 |
+----------+------------+
only showing top 1 row
scala> df.filter($"cellphone".contains("080")).groupBy($"department").agg(sum($"expense")).orderBy($"department").show(1,true)
+----------+------------+
|department|sum(expense)|
+----------+------------+
| CLERK| 4000.0|
+----------+------------+
only showing top 1 row