I. Test Requirements:
1. Data collection (crawl at least 3,000 records spanning more than one week): (10 points)
Write a Python program to crawl the review data of JD.com mobile phones and generate data files in JSON format.
- Python code (a single run only crawls the reviews of one product; three products were crawled this time, see the loop sketch right after the code):
- Parameters that need to be modified: agents, url, cookie, phone_id
- Adjust the format of the crawled data as needed
import urllib.request
import json
import random
import time as time0
import re, os
import pandas as pd

# Pool of User-Agent strings to rotate through
agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]


def product_reviews(product_id=None, p=0, maxPage=99):
    root_dir = '暢想_詳細字典'
    # Skip models that have already been crawled (one phone model shares its reviews
    # across colour and memory variants)
    os.makedirs(root_dir, exist_ok=True)
    phone_list = os.listdir(root_dir)
    phone_txt = str(product_id) + '.txt'
    if phone_txt in phone_list:
        print(product_id)
        return []

    # Crawl page by page; "maxPage" (around 45) is corrected from the first response
    k_head = 0
    while p < maxPage:
        # Comments for all SKUs of the product:
        # url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={}&score=0&sortType=5&page={}&pageSize={}&isShadowSku=0&rid=0&fold=1'
        # Comments for the current SKU only:
        url = 'https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98&productId={}&score=0&sortType=5&page={}&pageSize={}&isShadowSku=0&fold=1'
        url = url.format(product_id, p, maxPage)

        # Forge the request headers so the request looks like it comes from a browser.
        # Copy the Cookie string from your own logged-in browser and paste it here.
        cookie = ''
        headers = {
            'User-Agent': ''.join(random.sample(agents, 1)),
            'Referer': 'https://item.jd.com/',
            'Cookie': cookie
        }

        # Send the request
        request = urllib.request.Request(url=url, headers=headers)
        time0.sleep(2.5)
        # Read the response
        try:
            content = urllib.request.urlopen(request).read().decode('gbk')
        except:
            print('Error fetching the comments on page %d' % p)
            p = p + 1
            continue

        # Strip the JSONP wrapper so only the JSON body remains
        content = content.strip('fetchJSON_comment98vv995();')
        # Real maximum number of comment pages
        try:
            maxPage = int(re.findall('"maxPage":(.*?),"', content, re.S)[0])
        except:
            pass
        try:
            obj = json.loads(content)
        except:
            print('Bad response, trying again!')
            print([content])
            print(url)
            continue

        comments = obj['comments']
        # Overall review summary for the product
        productCommentSummary = obj['productCommentSummary']
        dict_pars_info = {}
        # Average score
        # dict_pars_info['平均分'] = str(productCommentSummary['averageScore'])
        # Good / general / poor rates
        dict_pars_info['好評率'] = str(productCommentSummary['goodRate'])
        dict_pars_info['中評率'] = str(productCommentSummary['generalRate'])
        dict_pars_info['差評率'] = str(productCommentSummary['poorRate'])
        # Good / general / poor counts
        dict_pars_info['好評數'] = str(productCommentSummary['score5Count'])
        dict_pars_info['中評數'] = str(productCommentSummary['score3Count'])
        dict_pars_info['差評數'] = str(productCommentSummary['score1Count'])

        if len(comments) > 0:
            for comment in comments:
                id = comment['id']
                guid = comment['guid']
                content = comment['content']
                creationTime = comment['creationTime']
                score = comment['score']
                nickname = comment['nickname']
                plusAvailable = comment['plusAvailable']
                days = comment['days']
                try:
                    mobileVersion = comment['mobileVersion']
                except:
                    mobileVersion = ''

                item = {
                    'id': id,
                    'guid': guid,
                    'content': content,
                    'creationTime': creationTime,
                    'score': score,
                    'nickname': nickname,
                    'plusAvailable': plusAvailable,
                    'mobileVersion': mobileVersion,
                    'days': days,
                }
                item.update(dict_pars_info)
                string = str(item)

                # 1. Save as CSV (write the header only on the first record)
                item_dataframe = pd.DataFrame([item])
                if k_head == 0:
                    item_dataframe.to_csv(root_dir + '/%d.csv' % product_id, mode='w', header=True, index=False, encoding='gbk')
                    k_head += 1
                else:
                    item_dataframe.to_csv(root_dir + '/%d.csv' % product_id, mode='a', header=False, index=False, encoding='gbk')

                # 2. Save as txt, one dict per line
                fp = open(root_dir + '/%d.txt' % product_id, 'a', encoding='gbk')
                fp.write(string + '\n')
                fp.close()
            print('%s-page---finish(%s/%s)' % (p, p, maxPage))
        else:
            return []
        p = p + 1


if __name__ == '__main__':
    phone_id = 100015154663
    # phone_id = 100026796994
    # phone_id = 100016944073
    product_reviews(product_id=phone_id)
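Since a single run only crawls one product, the three product IDs listed in the __main__ block can simply be looped over; a minimal usage sketch:

for phone_id in (100015154663, 100026796994, 100016944073):   # the three products crawled for this report
    product_reviews(product_id=phone_id)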
Screenshot of a run:
2. Data preprocessing: use MapReduce or Kettle to preprocess the source data, cleaning the large number of JSON files into structured text files. (I used Kettle for the data cleaning.)
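The cleaning itself was done in Kettle; purely for reference, below is a minimal Python sketch of an equivalent step (the output file name and the column mapping are assumptions), which flattens the str(dict) lines written by the crawler into comma-delimited rows roughly matching the pinglun table created in the next step:

import ast, csv, glob

# Sketch only: the actual cleaning was done in Kettle.
# Map the crawler's dict keys onto the pinglun column order; 'commentcount'
# has no direct counterpart in the crawled dict and is left empty here.
mapping = ['id', '', '好評數', '中評數', '差評數', '好評率', '中評率', '差評率',
           'guid', 'content', 'creationTime', 'score', 'nickname',
           'plusAvailable', 'mobileVersion', 'days']

with open('pinglun.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    for path in glob.glob('暢想_詳細字典/*.txt'):
        with open(path, encoding='gbk') as fp:
            for line in fp:
                record = ast.literal_eval(line.strip())   # each line is a str(dict)
                # remove commas and newlines inside the review text so the field delimiter stays unambiguous
                record['content'] = str(record.get('content', '')).replace(',', ' ').replace('\n', ' ')
                writer.writerow([record.get(k, '') for k in mapping])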
3. Data statistics: generate the Hive user-review data:
(1) Create a table in Hive named pinglun to hold the cleaned data (table-creation SQL statement); screenshot of successful creation and data import:
SQL statement:
create table pinglun(
  id string, commentcount int, goodcount int, generalcount int, poorcount int,
  goodrateshow float, generalrateshow float, poorrateshow float,
  guid string, content string, creationTime string, score int,
  nickname string, plusAvailable string, mobileVersion string, days int
) row format delimited fields terminated by ',';
Requirement 1: analyze whether users bought on the mobile side or the PC side, and the ratio of mobile to PC users; generate the ismobilehive table to store the statistics; table-creation SQL statement and screenshot of successful creation and data import
SQL statements (all results below are based on the data crawled by the code above):
create table ismobile(buylevel string,buynum int);
select count(*) from pinglun where mobileVersion='';
insert into ismobile(buylevel,buynum) values('0',136);
select count(*) from pinglun where mobileVersion!='';
insert into ismobile(buylevel,buynum) values('1',1853);
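The mobile/PC ratio itself follows directly from the two counts above (136 PC-side and 1853 mobile-side reviews); a quick check in Python:

pc, mobile = 136, 1853          # the two counts inserted into ismobile above
total = pc + mobile
print('mobile: %.1f%%, PC: %.1f%%' % (100 * mobile / total, 100 * pc / total))
# mobile: 93.2%, PC: 6.8%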
Requirement 2: analyze the review cycle (how long after receiving the goods users usually post a review); generate the dayssql table to store the statistics; table-creation SQL statement and screenshot of successful creation and data import
SQL statement: create table dayssql as select days,count(*) from pinglun group by days;
Requirement 3: analyze membership level (the level of the users who bought this product); generate the userlevelname_out table to store the statistics; table-creation SQL statement and screenshot of successful creation and data import
SQL statement: create table userlevelname_out as select plusAvailable,count(*) from pinglun group by plusAvailable;
Requirement 4: analyze the number of reviews per day; generate the creationtime_out table to store the statistics; table-creation SQL statement and screenshot of successful creation and data import
SQL statement: create table creationtime_out as select to_date(creationtime),count(*) from pinglun group by to_date(creationtime);
Requirement 5: before/after screenshots of the table once the date format has been standardized; the goal is to strip the hour/minute/second from the review time and keep only the year, month and day.
4. Use Sqoop to migrate the data to the MySQL database: (5 points)
Screenshots of the five tables imported into five tables in the MySQL database.
5. Data visualization: use JavaWeb + Echarts to present the data as charts (20 points)
Requirement 1: visualization screenshot
Requirement 2: visualization screenshot
Requirement 3: visualization screenshot
Requirement 4: visualization screenshot
6. Chinese word segmentation for user-review analysis. (20 points)
(1) This part analyzes the negative-review data in the product-review table and extracts the points users complain about, so as to know where the product falls short. (Screenshot of the filtered negative-review dataset)
SQL: create table poorpinglun as select * from pinglun where score < 4;
(2) Use Python's jieba segmentation to implement Chinese word segmentation and word-frequency statistics over the user-review texts; (screenshot after segmentation)
# -*- coding: utf-8 -*-
import pandas as pd
import pymysql
import jieba


def getdata():
    dbconn = pymysql.connect(host="127.0.0.1", database="sparktest", user="root",
                             password="lin0613", port=3306, charset='utf8')
    # SQL statement: pull the negative-review texts
    sqlcmd = "select content from poorpinglun limit 177"
    # Load the MySQL data through pandas
    titles = pd.read_sql(sqlcmd, dbconn)
    print(titles.values)
    for i in range(len(titles)):
        text = (",").join(titles.values[i])
        word_list = jieba.cut(text)
        keywords = list(word_list)
        for count in range(len(keywords)):
            if checkword(keywords[count]):
                # Insert the word on first sight, otherwise bump its counter
                flag = checkre(pymysql.connect(host="127.0.0.1", database="sparktest", user="root",
                                               password="lin0613", port=3306, charset='utf8'), keywords[count])
                if flag:
                    save_keywords(pymysql.connect(host="127.0.0.1", database="sparktest", user="root",
                                                  password="lin0613", port=3306, charset='utf8'), keywords[count])
                    print(keywords[count])
                else:
                    updatenum(pymysql.connect(host="127.0.0.1", database="sparktest", user="root",
                                              password="lin0613", port=3306, charset='utf8'), keywords[count])
            else:
                print("invalid word")


def checkword(word):
    # Filter out punctuation and other invalid tokens
    invalid_words = [',', '.', ',', '。', ':', '“', '”', '"', '?', '?', '《', '》',
                     '(', '{', ')', '}', '!', '%', '℃', '¥', '#']
    if word.lower() in invalid_words:
        return False
    else:
        return True


def save_keywords(db, keyword):
    # Get a cursor
    cursor = db.cursor()
    # SQL insert statement
    sql = "INSERT INTO key_pinglun1(keyword,num) VALUES ('%s',1)" % (keyword)
    try:
        cursor.execute(sql)
        print("true")
        db.commit()
    except:
        print("insert failed")
        # Roll back on error
        db.rollback()
    # Close the connection
    db.close()


def updatenum(db, keyword):
    cursor = db.cursor()
    # SQL update statement: increment the word's counter
    sql = "update key_pinglun1 set num=num+1 where keyword = '%s' " % keyword
    try:
        cursor.execute(sql)
        db.commit()
    except:
        print("update failed")
        db.rollback()
    db.close()


def checkre(db, keyword):
    cursor = db.cursor()
    ket = []
    # SQL query: has this word been stored already?
    sql = "select keyword from key_pinglun1 where keyword = '%s'" % keyword
    try:
        cursor.execute(sql)
        ket = list(cursor.fetchall())
        db.commit()
    except:
        print("query failed")
        db.rollback()
    db.close()
    if ket:
        return False
    else:
        return True


if __name__ == '__main__':
    getdata()
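The loop above opens a new MySQL connection and issues a query for every token, which is slow on a large review set. As a leaner alternative (a sketch, not the implementation used here), the frequencies can be accumulated in memory with collections.Counter and written in a single pass, reusing checkword from the script above and assuming key_pinglun1 starts out empty:

from collections import Counter

import jieba
import pandas as pd
import pymysql

db = pymysql.connect(host="127.0.0.1", database="sparktest", user="root",
                     password="lin0613", port=3306, charset='utf8')
texts = pd.read_sql("select content from poorpinglun", db)['content']

# Count word frequencies in memory, then write key_pinglun1 in one batch
counter = Counter()
for text in texts:
    counter.update(w for w in jieba.cut(str(text)) if checkword(w))

with db.cursor() as cursor:
    cursor.executemany("insert into key_pinglun1(keyword, num) values (%s, %s)",
                       list(counter.items()))
db.commit()
db.close()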
(3) Create a word-frequency statistics table in Hive and load the segmentation data;
(4) Bar chart visualizing the top ten categories of user negative reviews.
(5) Word cloud visualizing the segmented words from user negative reviews.
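For (4) and (5), one possible way to render both charts from the key_pinglun1 frequency table is matplotlib plus the wordcloud package (a sketch only; the font paths, output file names and styling are assumptions, separate from the JavaWeb + Echarts pages in section 5):

import pandas as pd
import pymysql
import matplotlib.pyplot as plt
from wordcloud import WordCloud

db = pymysql.connect(host="127.0.0.1", database="sparktest", user="root",
                     password="lin0613", port=3306, charset='utf8')
freq = pd.read_sql("select keyword, num from key_pinglun1 order by num desc", db)
db.close()

# (4) Bar chart of the ten most frequent negative-review words
top10 = freq.head(10)
plt.rcParams['font.sans-serif'] = ['SimHei']        # assumed font so Chinese labels render
plt.bar(top10['keyword'], top10['num'])
plt.title('Top 10 negative-review keywords')
plt.savefig('poor_top10_bar.png')

# (5) Word cloud over all negative-review words
wc = WordCloud(font_path='simhei.ttf', background_color='white', width=800, height=600)
wc.generate_from_frequencies(dict(zip(freq['keyword'], freq['num'])))
wc.to_file('poor_wordcloud.png')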
7. Use Spark for real-time data analysis. (20 points)
This experiment targets JD.com product reviews; the architecture is crawler + Flume + Kafka + Spark Streaming + MySQL, providing dynamic, real-time collection, analysis and display of the data.
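A minimal sketch of the Spark stage under this architecture, written with Spark Structured Streaming's Kafka source (the broker address, topic name and console sink are assumptions; the real pipeline would persist the results into MySQL):

from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col

# Assumed: a Kafka broker on localhost:9092 and a topic 'jd_comments' fed by Flume
spark = SparkSession.builder.appName("jd-comment-stream").getOrCreate()

lines = (spark.readStream
         .format("kafka")
         .option("kafka.bootstrap.servers", "localhost:9092")
         .option("subscribe", "jd_comments")
         .load()
         .selectExpr("CAST(value AS STRING) AS value", "timestamp"))

# Simple real-time statistic: number of incoming review records per minute
counts = lines.groupBy(window(col("timestamp"), "1 minute")).count()

query = (counts.writeStream
         .outputMode("complete")
         .format("console")          # swap for foreachBatch(...) to write into MySQL
         .start())
query.awaitTermination()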