[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
[root@centos00 hive-1.1.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]
scala> val df = Seq(
| ("01", "Jack", "2020-06-05"),
| ("02", "Tom", "2020-01-01"),
| ("03", "Mike", "2020-09-01"),
| ("04", "Tina", "2020-09-01"),
| ("05", "Alex", "2020-06-10"),
| ("06", "Bob", "2020-01-01"),
| ("07", "David", "2020-09-01"),
| ("08", "Ben", "2020-09-01"),
| ("09", "Allen", "2020-06-05"),
| ("10", "Caesar","2020-01-01")
| ).toDF("id", "name", "entrytime")
df: org.apache.spark.sql.DataFrame = [id: string, name: string ... 1 more field]
// 獲取最大入職時間
scala> df.select(max($"entrytime")).show
+--------------+
|max(entrytime)|
+--------------+
|    2020-09-01|
+--------------+
// 獲取最小入職時間
scala> df.select(min($"entrytime")).show
+--------------+
|min(entrytime)|
+--------------+
|    2020-01-01|
+--------------+
// 統計字段姓名的記錄數
scala> df.select("name").count
res2: Long = 10
// 統計姓名字段中含有大寫"A"的記錄數（contains 區分大小寫，因此 Jack、David、Caesar 不計入）
scala> df.select("name").filter($"name".contains("A")).count
res3: Long = 2
// 過濾出姓名中含有"A"的記錄
scala> df.select("id", "name", "entrytime").filter($"name".contains("A")).show()
+---+-----+----------+
| id| name| entrytime|
+---+-----+----------+
| 05| Alex|2020-06-10|
| 09|Allen|2020-06-05|
+---+-----+----------+
// 按入職時間正序排序
scala> df.select(col("*")).orderBy("entrytime").show
+---+------+----------+
| id|  name| entrytime|
+---+------+----------+
| 06|   Bob|2020-01-01|
| 10|Caesar|2020-01-01|
| 02|   Tom|2020-01-01|
| 01|  Jack|2020-06-05|
| 09| Allen|2020-06-05|
| 05|  Alex|2020-06-10|
| 07| David|2020-09-01|
| 08|   Ben|2020-09-01|
| 03|  Mike|2020-09-01|
| 04|  Tina|2020-09-01|
+---+------+----------+
// 按入職時間倒序排序
scala> df.select(col("*")).orderBy($"entrytime".desc).show
+---+------+----------+
| id|  name| entrytime|
+---+------+----------+
| 04|  Tina|2020-09-01|
| 03|  Mike|2020-09-01|
| 07| David|2020-09-01|
| 08|   Ben|2020-09-01|
| 05|  Alex|2020-06-10|
| 01|  Jack|2020-06-05|
| 09| Allen|2020-06-05|
| 02|   Tom|2020-01-01|
| 06|   Bob|2020-01-01|
| 10|Caesar|2020-01-01|
+---+------+----------+