spark-sql分组去重总数统计uv


SparkConf sparkConf = new SparkConf();
        sparkConf
                .setAppName("Internal_Func")
                .setMaster("local");

        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        SQLContext sqlContext = new SQLContext(javaSparkContext);

        List<String> list = new ArrayList<String>();
        list.add("1,1");
        list.add("2,11");
        list.add("2,111");
        list.add("2,111");
        list.add("3,1111");
        list.add("3,11111");

        JavaRDD<String> rdd_str = javaSparkContext.parallelize(list, 5);

        JavaRDD<Row> rdd_row = rdd_str.map(new Function<String, Row>() {
            @Override
            public Row call(String v1) throws Exception {
                String ary[] = v1.split(",");
                return RowFactory.create(ary[0], Long.parseLong(ary[1]));
            }
        });

        List<StructField> fieldList = new ArrayList<StructField>();
        fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        fieldList.add(DataTypes.createStructField("sc", DataTypes.LongType, true));
        StructType tmp = DataTypes.createStructType(fieldList);

        DataFrame df = sqlContext.createDataFrame(rdd_row, tmp);
        df.registerTempTable("tmp_sc");

        DataFrame df_agg = sqlContext.sql("select name,count(distinct(sc)) from tmp_sc group by name");//去重后分组求和统计

        df_agg.show();

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM