1. Implementation with re

import requests
from requests.exceptions import RequestException
import re, json
import xlwt

# scraped records
DATA = []
KEYWORD = 'python'
HEADERS = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/63.0.3239.132 Safari/537.36'}
MAX_PAGE = 10


# pick the target fields out of each auction item and collect them in DATA
def get_target(data_list):
    for item in data_list:
        temp = {
            'title': item['title'],
            'price': item['view_price'],
            'sales': item['view_sales'],
            # '否' (no) if the item has a shipping fee, '是' (yes) otherwise
            'isTmall': '否' if float(item['view_fee']) else '是',
            'area': item['item_loc'],
            'name': item['nick'],
            'url': item['detail_url']
        }
        DATA.append(temp)
    return True


# send the HTTP request and return the page source
def get_html(url, *args):
    try:
        if not args:
            response = requests.get(url, headers=HEADERS)
            global COOKIES
            COOKIES = response.cookies  # keep the cookies for the ajax request
        else:
            response = requests.get(url, headers=HEADERS, cookies=COOKIES)

        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        print('Failed to fetch the page source!')


# parse the source and extract the target data
def parse_html(html, *args):
    if not html:  # get_html() returns None when the request fails
        return
    if not args:
        pattern = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
        # strip the trailing ';'
        result = re.findall(pattern, html)[0].strip()[:-1]
        # parse the JSON; an online JSON viewer helps to inspect the structure
        content = json.loads(result)
        data_list = content['mods']['itemlist']['data']['auctions']
    else:
        pattern = re.compile(r'{.*}', re.S)
        result = re.findall(pattern, html)[0]
        content = json.loads(result)
        data_list = content['API.CustomizedApi']['itemlist']['auctions']

    get_target(data_list)


# write DATA to an .xls workbook
def save_to_excel():
    f_name = '淘宝%s数据' % KEYWORD
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet(f_name)
    sheet.write(0, 0, 'title')
    sheet.write(0, 1, 'price')
    sheet.write(0, 2, 'sales')
    sheet.write(0, 3, 'isTmall')
    sheet.write(0, 4, 'area')
    sheet.write(0, 5, 'name')
    sheet.write(0, 6, 'url')
    for i in range(len(DATA)):
        sheet.write(i + 1, 0, DATA[i]['title'])
        sheet.write(i + 1, 1, DATA[i]['price'])
        sheet.write(i + 1, 2, DATA[i]['sales'])
        sheet.write(i + 1, 3, DATA[i]['isTmall'])
        sheet.write(i + 1, 4, DATA[i]['area'])
        sheet.write(i + 1, 5, DATA[i]['name'])
        sheet.write(i + 1, 6, DATA[i]['url'])
    book.save('淘宝%s数据.xls' % KEYWORD)


def main():
    for offset in range(MAX_PAGE):
        if offset == 0:
            # the first page also has 12 items loaded asynchronously through an api call
            url1 = 'https://s.taobao.com/search?q={}&s={}'.format(KEYWORD, offset * 44)
            html = get_html(url1)
            parse_html(html)

            url2 = 'https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&' \
                   'stats_click=search_radio_all:1&q={}'.format(KEYWORD)
            html = get_html(url2, 2)
            parse_html(html, 2)
        else:
            url = 'https://s.taobao.com/search?q={}&s={}'.format(KEYWORD, offset * 44)
            html = get_html(url)
            parse_html(html)

    save_to_excel()
    print(len(DATA))


if __name__ == '__main__':
    main()
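The key step in parse_html() is pulling the g_page_config JSON blob out of the search page source with a regular expression and then walking the parsed structure. Below is a minimal, self-contained sketch of just that step, run against a made-up HTML snippet; the field names mirror the script above, but the live page layout may differ.

import re, json

# synthetic page fragment, only for illustration of the regex + json.loads step
sample_html = '''
<script>
    g_page_config = {"mods": {"itemlist": {"data": {"auctions": [
        {"title": "demo item", "view_price": "9.90", "view_sales": "100人付款",
         "view_fee": "0.00", "item_loc": "上海", "nick": "demo shop",
         "detail_url": "//item.taobao.com/item.htm?id=1"}
    ]}}}};
    g_srp_loadCss();
</script>
'''

pattern = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
raw = re.findall(pattern, sample_html)[0].strip()[:-1]   # drop the trailing ';'
data = json.loads(raw)
print(data['mods']['itemlist']['data']['auctions'][0]['title'])   # -> demo item

The non-greedy (.*?) together with re.S keeps the match to the JSON assignment only, even though it spans multiple lines; once json.loads() succeeds, the auctions list can be walked exactly as get_target() does.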