[root@centos00 ~]$ cd hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
[root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala> val arr = Array(("Jack", "20", "M"), ("Jack", "30", "M"), ("Judy", "20", "F"), ("Jack", "20", "M"))
arr: Array[(String, String, String)] = Array((Jack,20,M), (Jack,30,M), (Judy,20,F), (Jack,20,M))
scala> val df = sc.parallelize(arr).toDF("name", "age", "gender")
df: org.apache.spark.sql.DataFrame = [name: string, age: string ... 1 more field]
scala> df.show(false)
+----+---+------+
|name|age|gender|
+----+---+------+
|Jack|20 |M     |
|Jack|30 |M     |
|Judy|20 |F     |
|Jack|20 |M     |
+----+---+------+
scala> df.groupBy('name).agg(countDistinct('age, 'gender) as 'distinctAgeAndGender).show(false)
+----+--------------------+
|name|distinctAgeAndGender|
+----+--------------------+
|Jack|2                   |
|Judy|1                   |
+----+--------------------+
scala> df.groupBy("name").agg(countDistinct("age", "gender") as "distinctAgeAndGender").show(false)
+----+--------------------+
|name|distinctAgeAndGender|
+----+--------------------+
|Jack|2                   |
|Judy|1                   |
+----+--------------------+
scala> df.groupBy("name").agg(countDistinct("age") as "distinctAge", current_timestamp as "option_timestamp").show(false)
+----+-----------+-----------------------+
|name|distinctAge|option_timestamp       |
+----+-----------+-----------------------+
|Jack|2          |2020-09-14 12:30:51.566|
|Judy|1          |2020-09-14 12:30:51.566|
+----+-----------+-----------------------+