[Python3爬蟲從入門到精通]2.淘寶信息定向爬蟲實例分析


 1 import requests
 2 import re
 3 
 4 
 5 def getHTMLText(url):
 6     """
 7     headers = {
 8         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
 9     }
10     """
11     try:
12         #print(url)
13         r = requests.get(url, timeout=30)
14         r.raise_for_status() #不小心寫成r.raise_forstatus一直沒結果,以為需要添加heades才行呢,醉了醉了
15         #print(r.status_code)
16         r.encoding = r.apparent_encoding
17         return r.text
18     except:
19         return ""
20 
21 
def parsePage(ilt, html):
    """Extract price/title pairs from a result page and append them to *ilt*.

    The page is searched for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fragments. The n-th price is paired with the
    n-th title; if the two lists differ in length the extra entries are
    ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item.
            Both values are strings.
        html: Page text to scan. An empty value adds nothing.

    Returns:
        None. Results are appended to *ilt* in place.
    """
    import json  # local import: only used to decode string escapes below

    if not html:
        return

    # Capture groups return just the text between the quotes, so there is
    # no need to split on ':' (which truncated titles containing a colon)
    # or to eval() content that comes from an untrusted web page.
    #   [\d.]*              digits and dots, zero or more
    #   (?:[^"\\\n]|\\.)*   any run of ordinary characters or backslash
    #                       escapes, so an escaped quote (\") does not end
    #                       the title early
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escapes such as \uXXXX or \" the way a JSON string
            # literal would be decoded.
            title = json.loads('"' + raw_title + '"', strict=False)
        except ValueError:
            # Not a valid JSON string body: keep the text as found rather
            # than dropping the item.
            title = raw_title
        ilt.append([price, title])
40 
41 
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table with a header row.

    Args:
        ilt: Iterable of entries where index 0 is the price and index 1 is
            the title. Rows are numbered starting at 1.
    """
    # Minimum column widths: row number, price, title.
    row_format = '{:4}\t{:8}\t{:16}'
    header = row_format.format('序號', '價格', '商品名稱')
    print(header)
    for position, entry in enumerate(ilt, start=1):
        price, title = entry[0], entry[1]
        print(row_format.format(position, price, title))
49 
50 
def main(goods='書包', depth=2):
    """Crawl search-result pages for *goods* and print the items found.

    Args:
        goods: Search keyword appended to the query URL.
        depth: Number of result pages to fetch.
    """
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        # Presumably 's' is the item offset and each page holds 44 items
        # (verify against the site). The '=' is required: without it the
        # URL carried a bare key such as 's44' and no 's' value at all.
        url = start_url + '&s=' + str(44 * i)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Best effort: a failing page must not stop the remaining
            # pages, but the failure is reported instead of hidden.
            print('skipping page {}: {}'.format(i, exc))
            continue
    printGoodsList(infoList)
64 
65 
if __name__ == '__main__':
    # Run the crawler only when executed as a script, so that importing
    # this module does not trigger network requests.
    main()

50天後終於更了下一篇,繼續努力!


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM