Hive-SQL練習
通過Hive統計一篇文章中的詞頻(WordCount)
1. hive創建內部表wc
-- Internal (managed) table that stores the article text in a single string
-- column `lie`; each line of the article becomes one record.
-- NOTE(review): '\n' as the field delimiter presumably keeps a whole line in
-- one column instead of splitting it on some in-line character -- confirm.
create table wc (
    lie string
)
row format delimited
    fields terminated by '\n';
2. 向wc表中導入hdfs文章的內容
-- Load the article at /1.txt (an HDFS path, since `local` is not specified)
-- into wc; `overwrite` replaces whatever the table currently holds.
load data inpath '/1.txt'
overwrite into table wc;
3. 統計詞頻
-- Word count: the 10 most frequent words in the article stored in wc.
select
    word,
    count(1) as word_count
from (
    -- Split every cleaned line (one table record per article line) on spaces
    -- and emit one row per resulting element.
    select explode(split(lie2, " ")) as word
    from (
        -- Strip special characters: keep only a-z, A-Z and 0-9 and replace
        -- everything else with a space.
        select regexp_replace(lie, '[^a-zA-Z0-9]', ' ') as lie2
        from wc
    ) t1
) t
-- Drop the empty elements that splitting on consecutive spaces produces.
where word regexp '[a-zA-Z0-9]'
group by word
-- Most frequent first (desc = descending; the default is asc, ascending).
order by word_count desc
limit 10;
4. 知識點
regexp_replace(source, pattern, replacement) //將字段source中所有符合正則表達式pattern的部分替換為replacement指定的內容
比如:regexp_replace('123','^[0-9]','123') //返回12323(開頭的數字1被替換為123)