Python 爬取大眾點評店鋪評論


  1 import parsel
  2 import pymysql
  3 from lxml import etree
  4 import re
  5 import requests
  6 def download_data(url,cookie):
  7     '''
  8     獲取加密網頁源碼
  9     獲取加密文件
 10     :return:
 11     '''
 12     headers = {
 13         "Cookie": cookie,
 14         "Referer": "http://www.dianping.com/",
 15         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
 16     }
 17     '''
 18     獲取原始網頁
 19     '''
 20     ret = requests.get(url=url, headers=headers).text
 21     with open('01 原始網頁_加密.html', 'w', encoding='utf-8') as f:
 22         f.write(ret)
 23 
 24     '''
 25     獲取css文件
 26     '''
 27     css_url = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', ret)
 28     css_url = 'https:' + css_url[0]
 29     css_response = requests.get(css_url).text
 30     with open('02 css樣式.css', 'w', encoding='utf-8') as f:
 31         f.write(css_response)
 32 
 33     '''
 34     獲取svg對照表
 35     '''
 36     svg_urls = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css_response)
 37     for svg_url in svg_urls:
 38         name, url = svg_url
 39         svg_url = 'https:' + url
 40         svg_response = requests.get(svg_url).text
 41         with open(F'03 svg對照表{name}.svg', 'w', encoding='utf-8') as f:
 42             f.write(svg_response)
 43 def crack_data():
 44     '''
 45     解密數據,破解svg對應關系
 46     :return:
 47     '''
 48     with open('03 svg對照表zpd.svg', 'r', encoding='utf-8') as f:#文件名稱根據獲取到的svg文件更換
 49         svg_html = f.read()
 50     sel = parsel.Selector(svg_html)
 51     texts = sel.css('textPath')
 52     paths = sel.css('path')
 53     path_dict = {}
 54     for path in paths:
 55         path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]
 56         # print(path.css('path::attr(id)').get())
 57         # print(path.css('path::attr(d)').get().split(' ')[1])
 58     count = 1
 59     zpd_svg_dict = {}  # y坐標和字符串的聯系
 60     for text in texts:
 61         zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
 62         count += 1
 63     print(zpd_svg_dict)
 64 
 65     with open('02 css樣式.css', 'r', encoding='utf-8') as f:
 66         css_html = f.read()
 67 
 68     css_paths = re.findall(r'''
 69     \.(zpd.*?) {
 70         background: -(\d+)\.0px -(\d+)\.0px;
 71     \}
 72     ''', css_html) # 正則表達式條件根據css文件類標簽更換
 73     print(css_paths)
 74     last_map = {}
 75     for css_path in css_paths:
 76         css_name, x, y = css_path
 77         index = int(int(x) / 14)
 78         for i in zpd_svg_dict:
 79             if int(y) > int(i):
 80                 pass
 81             else:
 82                 last_map[css_name] = zpd_svg_dict[i][index]
 83                 break
 84     return last_map
 85 
 86 def decryption(last_map):
 87     '''
 88     返回破解后的html
 89     :param last_map:
 90     :return:
 91     '''
 92 
 93     with open('01 原始網頁_加密.html', 'r', encoding='utf-8') as f:
 94         ret = f.read()
 95     svg_list = re.findall('<svgmtsi class="(.*?)"></svgmtsi>', ret)
 96     for svg in svg_list:
 97         print(svg, last_map[svg])
 98         ret = ret.replace(f'<svgmtsi class="{svg}"></svgmtsi>', last_map[svg])
 99     return ret
def write_data(ret):
    """Extract the reviews from the decoded page and insert them into MySQL.

    For every ``<li>`` of the review list the reviewer name, star score,
    review time, shop name and comment text are printed and inserted into
    the ``dianping`` table of the ``review`` database. Each row is committed
    on its own, so rows written before a failure are kept.

    :param ret: decoded page source as a string.
    :return: None
    :raises IndexError: if a review item lacks one of the expected nodes.
    """
    # This div is only present on some reviews and would shift the
    # positional XPath used for the comment below, so drop it first.
    ret = ret.replace(' <div class="richtitle">消費后評價</div>', '')
    etre = etree.HTML(ret)
    li_list = etre.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li')

    # NOTE(review): connection settings are hard-coded; consider moving them
    # to configuration.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='review',
                         charset='utf8mb4')
    sql = 'insert into dianping(name,score,time,shop_name,comment)values(%s,%s,%s,%s,%s)'
    try:
        with db.cursor() as cursor:
            for li in li_list:
                name = li.xpath('./div[@class="main-review"]/div[1]/a/text()')[0].strip()
                star_class = li.xpath('./div[1]/div[2]/span[1]/@class')[0]
                score = re.findall('sml-rank-stars sml-str(.*?) star', star_class)[0].strip()
                # Look the misc-info block up relative to this <li>; a
                # document-wide query indexed by a counter misaligns the
                # fields as soon as one review lacks the node.
                review_time = li.xpath('.//div[@class="misc-info clearfix"]/span[1]/text()')[0].strip()
                shop_name = li.xpath('.//div[@class="misc-info clearfix"]/span[2]/text()')[0].strip()
                comment = ','.join(
                    part.replace('\n', '').strip() for part in li.xpath('./div/div[4]/text()')
                )
                print(name, score, review_time, shop_name, comment)
                # Parameterized insert: values are passed separately from
                # the SQL text.
                cursor.execute(sql, (name, score, review_time, shop_name, comment))
                db.commit()
    finally:
        # Close the connection even when parsing or the insert fails.
        db.close()
133 
134 
if __name__ == '__main__':
    # The cookie expires from time to time and has to be replaced.
    # NOTE(review): this is a hard-coded session cookie; prefer loading it
    # from the environment or a local file that is not published.
    cookie = "s_ViewType=10; _lxsdk_cuid=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _lxsdk=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _hc.v=c4dfac1c-01af-6a87-d803-2cd6b8db107a.1605834485; fspop=test; ctu=ef0b64e4cabf67f148563284ea8c8d0555a008f7ca0dee097831c90b52822812; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1605834487,1605835298,1606093773; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606098153; expand=yes; _lxsdk_s=175f2cc7d23-6-9d5-75e%7C%7C532"
    # Review page of a single shop; replace with another shop if needed.
    url = 'http://www.dianping.com/shop/130096343/review_all'
    try:
        download_data(url, cookie)
    except Exception as exc:
        # Too many requests trigger a captcha page, which is not handled.
        # Show the underlying error as well, and keep going so that files
        # from an earlier download can still be processed.
        print('出現驗證碼驗證', repr(exc))
    try:
        map_dict = crack_data()
    except Exception as exc:
        # Without the class-to-character map the page cannot be decoded,
        # so stop here instead of failing later with a less helpful error.
        print('css類屬性發生變化', repr(exc))
        raise SystemExit(1)
    ret = decryption(map_dict)
    write_data(ret)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM