[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode

[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &

[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]

scala> val df = Seq(
     | ("01", "Jack", "2020-06-05"),
     | ("02", "Tom", "2020-01-01"),
     | ("03", "Mike", "2020-09-01"),
     | ("04", "Tina", "2020-09-01"),
     | ("05", "Alex", "2020-06-10"),
     | ("06", "Bob", "2020-01-01"),
     | ("07", "David", "2020-09-01"),
     | ("08", "Ben", "2020-09-01"),
     | ("09", "Allen", "2020-06-05"),
     | ("10", "Caesar","2020-01-01")
     | ).toDF("id", "name", "entrytime")
df: org.apache.spark.sql.DataFrame = [id: string, name: string ... 1 more field]

// 获取最大入职时间
scala> df.select(max($"entrytime")).show
+--------------+
|max(entrytime)|
+--------------+
|    2020-09-01|
+--------------+

// 获取最小入职时间
scala> df.select(min($"entrytime")).show
+--------------+
|min(entrytime)|
+--------------+
|    2020-01-01|
+--------------+

// 统计字段姓名的记录数
scala> df.select("name").count
res2: Long = 10

// 统计字段姓名中含有"A"的记录数
scala> df.select("name").filter($"name".contains("A")).count
res3: Long = 2

// 过滤出姓名中含有"A"的记录
scala> df.select("id", "name", "entrytime").filter($"name".contains("A")).show()
+---+-----+----------+
| id| name| entrytime|
+---+-----+----------+
| 05| Alex|2020-06-10|
| 09|Allen|2020-06-05|
+---+-----+----------+

// 按入职时间正序排序
scala> df.select(col("*")).orderBy("entrytime").show
+---+------+----------+
| id|  name| entrytime|
+---+------+----------+
| 06|   Bob|2020-01-01|
| 10|Caesar|2020-01-01|
| 02|   Tom|2020-01-01|
| 01|  Jack|2020-06-05|
| 09| Allen|2020-06-05|
| 05|  Alex|2020-06-10|
| 07| David|2020-09-01|
| 08|   Ben|2020-09-01|
| 03|  Mike|2020-09-01|
| 04|  Tina|2020-09-01|
+---+------+----------+

// 按入职时间倒序排序
scala> df.select(col("*")).orderBy($"entrytime".desc).show
+---+------+----------+
| id|  name| entrytime|
+---+------+----------+
| 04|  Tina|2020-09-01|
| 03|  Mike|2020-09-01|
| 07| David|2020-09-01|
| 08|   Ben|2020-09-01|
| 05|  Alex|2020-06-10|
| 01|  Jack|2020-06-05|
| 09| Allen|2020-06-05|
| 02|   Tom|2020-01-01|
| 06|   Bob|2020-01-01|
| 10|Caesar|2020-01-01|
+---+------+----------+