[root@centos00 ~]$ cd hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager

[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &

[root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala> val arr = Array(("Jack", "20", "M"), ("Jack", "30", "M"), ("Judy", "20", "F"), ("Jack", "20", "M"))
arr: Array[(String, String, String)] = Array((Jack,20,M), (Jack,30,M), (Judy,20,F), (Jack,20,M))

scala> val df = sc.parallelize(arr).toDF("name", "age", "gender")
df: org.apache.spark.sql.DataFrame = [name: string, age: string ... 1 more field]

scala> df.show(false)
+----+---+------+
|name|age|gender|
+----+---+------+
|Jack|20 |M     |
|Jack|30 |M     |
|Judy|20 |F     |
|Jack|20 |M     |
+----+---+------+

scala> df.groupBy('name).agg(countDistinct('age, 'gender) as 'distinctAgeAndGender).show(false)
+----+--------------------+
|name|distinctAgeAndGender|
+----+--------------------+
|Jack|2                   |
|Judy|1                   |
+----+--------------------+

scala> df.groupBy("name").agg(countDistinct("age", "gender") as "distinctAgeAndGender").show(false)
+----+--------------------+
|name|distinctAgeAndGender|
+----+--------------------+
|Jack|2                   |
|Judy|1                   |
+----+--------------------+

scala> df.groupBy("name").agg(countDistinct("age") as "distinctAge", current_timestamp as "option_timestamp").show(false)
+----+-----------+-----------------------+
|name|distinctAge|option_timestamp       |
+----+-----------+-----------------------+
|Jack|2          |2020-09-14 12:30:51.566|
|Judy|1          |2020-09-14 12:30:51.566|
+----+-----------+-----------------------+