1.
原始表
-- Raw ratings table: plain-text storage, one row per line, tab-separated fields.
-- unixtime is kept as STRING here.
CREATE TABLE ml_100k (
    userid   INT,
    movieid  INT,
    rating   INT,
    unixtime STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
數據清洗後的表
-- Cleaned ratings table: same text layout as ml_100k (tab-separated, one row per
-- line), with an integer weekday column in place of the unixtime string.
CREATE TABLE ml_100k2 (
    userid  INT,
    movieid INT,
    rating  INT,
    weekday INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
3.數據導入
-- Load the ratings file into ml_100k.
-- LOCAL means the path is on the local filesystem rather than HDFS.
-- There is no OVERWRITE keyword, so the file is added to whatever the table holds.
LOAD DATA LOCAL
    INPATH '/home/centos/ml-100k/u.data'
    INTO TABLE ml_100k;


4.腳本編寫和腳本加載
clean_ml_100k.py
import sys
import datetime


def clean_line(line):
    """Convert one raw rating row into a cleaned row.

    Args:
        line: One tab-separated input row of the form
            ``userid<TAB>movieid<TAB>rating<TAB>unixtime``. Surrounding
            whitespace, including the trailing newline, is stripped first.

    Returns:
        The tab-separated string ``userid<TAB>movieid<TAB>rating<TAB>weekday``,
        where weekday is the ISO weekday number (Monday=1 ... Sunday=7) of the
        unixtime value.

    Raises:
        ValueError: If the row does not have exactly four tab-separated fields
            or unixtime is not a number.
    """
    userid, movieid, rating, unixtime = line.strip().split('\t')
    # Convert the unix timestamp to a datetime and take its weekday number.
    # fromtimestamp() uses the local time zone of the machine running the script.
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([userid, movieid, rating, str(weekday)])


def main():
    """Read raw rows from stdin and write one cleaned row per line to stdout."""
    for line in sys.stdin:
        print(clean_line(line))


if __name__ == '__main__':
    main()
然後用 hdfs dfs -put 上傳腳本
-- Register the cleaning script as a session resource so that a TRANSFORM query
-- can invoke it by its bare file name.
ADD FILE /home/centos/clean_ml_100k.py;
5.數據清洗+轉儲
-- Clean and copy: stream every ml_100k row through clean_ml_100k.py and
-- overwrite ml_100k2 with the script output.
--   TRANSFORM (...) lists the input columns sent to the script (base table).
--   USING names the command that does the cleaning.
--   AS (...) names the output columns read back from the script (target table).
-- Each clause is on its own line with no trailing comments, because a
-- double-dash comment runs to the end of its line and would swallow any clause
-- written after it on that line.
INSERT OVERWRITE TABLE ml_100k2
SELECT
    TRANSFORM (userid, movieid, rating, unixtime)
    USING 'python clean_ml_100k.py'
    AS (userid, movieid, rating, weekday)
FROM ml_100k;
然後我就失敗了

