import requests
import re


def getHTMLText(url):
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # I had mistyped this as r.raise_forstatus and kept getting nothing, so I assumed headers were required. Facepalm.
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""


def parsePage(ilt, html):
    try:
        plt = re.findall(r'"view_price":"[\d\.]*"', html)
        # \d  digit, same as [0-9]
        # *   match the preceding character zero or more times
        # []  any single character from the set
        tlt = re.findall(r'"raw_title":".*?"', html)
        # .   matches any character except the newline "\n"
        # *?  a ? after the quantifier makes it non-greedy, so the match stops at the
        #     first closing quote instead of running past the "raw_title":"..." field
        for i in range(len(plt)):
            # eval() evaluates the quoted literal, e.g. '"129.00"' -> '129.00',
            # which strips the surrounding quotes from the matched value
            price = eval(plt[i].split(':')[1])
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except Exception:
        print('')


def printGoodsList(ilt):
    tplt = '{:4}\t{:8}\t{:16}'  # output template: index, price, title
    print(tplt.format('序号', '价格', '商品名称'))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))


def main():
    goods = '书包'  # search keyword
    depth = 2       # how many result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # Taobao lists 44 items per page; s is the result offset
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            continue
    printGoodsList(infoList)


main()
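Note: with the headers dict left commented out, requests identifies itself as a plain Python client, and Taobao's search page nowadays tends to redirect anonymous requests to a login page, so the regexes may match nothing. Below is a minimal sketch of getHTMLText that passes a browser User-Agent and a logged-in cookie to requests.get; the cookie value is a placeholder you would copy from your own browser session, not something the original script provides.

import requests

def getHTMLText(url):
    # Assumption: both values are copied from your own logged-in browser session.
    # The cookie string here is a placeholder, not a working credential.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'cookie': '<paste the cookie value from your browser here>',
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

The rest of the script can stay as it is; only the request itself needs the extra headers.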
Finally posted the next entry after 50 days. Keep it up!