"""Taobao search-result crawler (course example).

Fetches `depth` pages of Taobao search results for a keyword, extracts
price, shipping location and title from the JSON-like data embedded in the
page's <script> block using regular expressions, and prints an aligned
table.

The result pages are addressed by an `s` offset that advances by 44 items
per page, e.g.:
    page 1: https://s.taobao.com/search?q=...            (s=0, implicit)
    page 2: https://s.taobao.com/search?q=...&s=44
    page 3: https://s.taobao.com/search?q=...&s=88
"""
import re

import requests
# NOTE(review): BeautifulSoup/bs4 are imported by the original file but
# never used here — kept in case another part of the project relies on
# this module importing them; confirm before removing.
from bs4 import BeautifulSoup
import bs4


def get_html_text(url):
    """Return the decoded body of *url*, or an error string on failure.

    Applies a 30-second timeout; non-2xx statuses raise via
    raise_for_status() and funnel into the except branch.  The encoding is
    guessed from the content (apparent_encoding) rather than trusted from
    the response headers.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors are no
        # longer silently swallowed; the caller-visible contract (an error
        # string instead of a raise) is unchanged.
        return '該網頁請求連接失敗'


def parse_page(list_info, html):
    """Append ``[price, location, title]`` rows found in *html* to *list_info*.

    The search page carries its data as script-embedded JSON text, so
    targeted regexes are simpler than a full HTML parse.  Best-effort: on
    any parsing problem a message is printed and the function returns,
    leaving *list_info* with whatever rows were appended so far.
    """
    try:
        # Capture groups extract the values directly.  The original code
        # used eval() on split(':') fragments, which (a) is a code
        # injection risk on scraped text and (b) crashed on any title
        # containing a ':' character.
        prices = re.findall(r'"view_price":"([\d.]*)"', html)
        titles = re.findall(r'"raw_title":"(.*?)"', html)
        locations = re.findall(r'"item_loc":"(.*?)"', html)
        # zip() stops at the shortest list, so a partially matched page can
        # no longer raise IndexError the way the index-parallel loop could.
        for price, title, location in zip(prices, titles, locations):
            list_info.append([price, location, title])
    except Exception:
        print('解析網頁出現異常')


def print_goods_info(list_info):
    """Print *list_info* as a numbered, tab-separated table.

    Each row of *list_info* is ``[price, location, title]``; rows are
    numbered from 1 in the first column.
    """
    tplt = '{:4}\t{:8}\t{:12}\t{:20}\t'
    print(tplt.format('序號', '商品價格', '發貨地址', '商品名稱'))
    for count, goods in enumerate(list_info, start=1):
        print(tplt.format(count, goods[0], goods[1], goods[2]))


if __name__ == '__main__':
    goods = '書包'   # search keyword
    depth = 2        # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    list_info = []
    for page in range(depth):
        try:
            # Each result page advances the `s` offset by 44 items.
            url = start_url + '&s=' + str(44 * page)
            html = get_html_text(url)
            parse_page(list_info, html)
        except Exception:
            # A broken page is skipped so the remaining pages still run.
            continue
    print_goods_info(list_info)
