豆瓣上有圖書的排行榜,所以這次寫了一個豆瓣的爬蟲。
首先是分析排行榜的url
根據這個可以很容易地知道,不同類別圖書的排行榜網址就是在網站網址後面加上 /tag/【類別】,所以我們首先要獲得圖書的類別信息。
這裡可以將豆瓣讀書首頁的熱門標籤爬下來。
爬取標籤內容並不難,代碼如下:
def getLabel(url):
    """Return the '/tag/...' link paths found on the page at *url*.

    The page is fetched with getHTMLText() and every <a> element that has
    an href attribute is inspected. For each href containing '/tag/', the
    substring from '/tag/' to the end of the href is kept.

    Returns:
        A list of strings such as '/tag/<name>', in page order. The list is
        empty when the page could not be fetched or contains no such links.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    label_list = []
    # href=True skips anchors without an href, so no exception handling is
    # needed for missing attributes.
    for anchor in soup.find_all('a', href=True):
        match = re.search(r'/tag/.*', anchor['href'])
        if not match:
            continue
        path = match[0]
        # Skip a bare '/tag/' (nothing after the prefix) and links where the
        # prefix is followed by '?', i.e. a query string instead of a tag name.
        if len(path) > 5 and path[5] != '?':
            label_list.append(path)
    return label_list
接下來是進入排行榜頁面進行信息爬取,
代碼如下:
def _parse_book(book):
    """Extract (name, author, price, score, number) from one 'info' div.

    Every field falls back to '' when the corresponding element or pattern
    is missing, so one incomplete entry cannot shift the columns of the
    other books or abort the whole run.
    """
    # The <a> that carries a title attribute holds the book name.
    title_link = book.find('a', attrs={'title': re.compile('.*')})
    name = (title_link.get('title') or '') if title_link else ''

    pub_div = book.find('div', attrs={'class': 'pub'})
    pub = pub_div.get_text().strip().replace('\n', '') if pub_div else ''
    # The author is the text before the first '/'; when there is no '/',
    # the whole line is used as a best effort.
    author = pub.split('/', 1)[0].strip()

    # Keep exactly one price per book. Several whitespace-separated tokens
    # may look like "digits.something" (e.g. a dotted date), so take the
    # last one -- assumes the price is the last such field; TODO confirm
    # against the page layout.
    price = ''
    for token in pub.split():
        match = re.search(r'\d.*\..*', token)
        if match:
            price = match[0]

    # First <span class="pl"> holds the "N people rated" text.
    count_span = book.find('span', attrs={'class': 'pl'})
    count_text = count_span.get_text().strip() if count_span else ''
    digits = re.findall(r'\d+', count_text)
    number = digits[0] if digits else ''

    score_span = book.find('span', attrs={'class': 'rating_nums'})
    score = score_span.get_text().strip() if score_span else ''

    return name, author, price, score, number


def getBookInfo():
    """Ask for a category, scrape two result pages and print a book table.

    The category list comes from getLabel(); the user picks one through
    get_label(). Two pages are requested with start=0 and start=20, each
    'info' div is parsed into one record, and the records are printed as
    tab-separated rows. Returns None.
    """
    label_list = getLabel('https://book.douban.com/')
    if not label_list:
        # Nothing to choose from (e.g. the home page could not be fetched).
        print('未能獲取圖書類別,請稍後再試。')
        return
    label = get_label(label_list)
    page = label_list[label - 1]  # get_label returns a 1-based number

    books = []
    for i in range(2):
        html = getHTMLText('https://book.douban.com' + page + '?start=' + str(i*20) + '&type=T')
        soup = BeautifulSoup(html, 'html.parser')
        # Each <div class="info"> describes one book.
        for book in soup.find_all('div', attrs={'class': 'info'}):
            books.append(_parse_book(book))

    tplt = "{:3}\t{:15}\t{:15}\t{:10}\t{:4}\t{:7}"  # output row format
    print(tplt.format("序號", "書籍", "作者", "價格", "評分", "評價人數"))
    for count, (name, author, price, score, number) in enumerate(books, start=1):
        print(tplt.format(count, name, author, price, score, number))
最終的總代碼為:
import re

import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The request uses a 30-second timeout and the text is decoded with the
    encoding detected from the content. Returns '' when the request fails
    or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat an empty page as "nothing found".
        return ""


def getLabel(url):
    """Return the '/tag/...' link paths found on the page at *url*.

    The page is fetched with getHTMLText() and every <a> element that has
    an href attribute is inspected. For each href containing '/tag/', the
    substring from '/tag/' to the end of the href is kept.

    Returns:
        A list of strings such as '/tag/<name>', in page order. The list is
        empty when the page could not be fetched or contains no such links.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    label_list = []
    # href=True skips anchors without an href, so no exception handling is
    # needed for missing attributes.
    for anchor in soup.find_all('a', href=True):
        match = re.search(r'/tag/.*', anchor['href'])
        if not match:
            continue
        path = match[0]
        # Skip a bare '/tag/' (nothing after the prefix) and links where the
        # prefix is followed by '?', i.e. a query string instead of a tag name.
        if len(path) > 5 and path[5] != '?':
            label_list.append(path)
    return label_list


def get_label(label_list):
    """Print the numbered categories and return the number the user picks.

    Each entry is shown without its leading '/tag/' prefix. The prompt is
    repeated until the input is an integer between 1 and len(label_list).

    Returns:
        The chosen 1-based position in *label_list* as an int.

    Raises:
        ValueError: if *label_list* is empty, because no input could ever
            be valid and the prompt would repeat forever.
    """
    if not label_list:
        raise ValueError('label_list is empty; there is nothing to choose from')
    for index, label in enumerate(label_list, start=1):
        print(str(index) + ': ' + label[5:] + '\t', end='')

    prompt = '\n\n請輸入你想查詢的圖書類別:'
    while True:
        try:
            choice = int(input(prompt))
        except ValueError:
            choice = 0  # non-numeric input is treated as out of range
        if 1 <= choice <= len(label_list):
            return choice
        prompt = '\n請輸入正確的類別編號:'


def _parse_book(book):
    """Extract (name, author, price, score, number) from one 'info' div.

    Every field falls back to '' when the corresponding element or pattern
    is missing, so one incomplete entry cannot shift the columns of the
    other books or abort the whole run.
    """
    # The <a> that carries a title attribute holds the book name.
    title_link = book.find('a', attrs={'title': re.compile('.*')})
    name = (title_link.get('title') or '') if title_link else ''

    pub_div = book.find('div', attrs={'class': 'pub'})
    pub = pub_div.get_text().strip().replace('\n', '') if pub_div else ''
    # The author is the text before the first '/'; when there is no '/',
    # the whole line is used as a best effort.
    author = pub.split('/', 1)[0].strip()

    # Keep exactly one price per book. Several whitespace-separated tokens
    # may look like "digits.something" (e.g. a dotted date), so take the
    # last one -- assumes the price is the last such field; TODO confirm
    # against the page layout.
    price = ''
    for token in pub.split():
        match = re.search(r'\d.*\..*', token)
        if match:
            price = match[0]

    # First <span class="pl"> holds the "N people rated" text.
    count_span = book.find('span', attrs={'class': 'pl'})
    count_text = count_span.get_text().strip() if count_span else ''
    digits = re.findall(r'\d+', count_text)
    number = digits[0] if digits else ''

    score_span = book.find('span', attrs={'class': 'rating_nums'})
    score = score_span.get_text().strip() if score_span else ''

    return name, author, price, score, number


def getBookInfo():
    """Ask for a category, scrape two result pages and print a book table.

    The category list comes from getLabel(); the user picks one through
    get_label(). Two pages are requested with start=0 and start=20, each
    'info' div is parsed into one record, and the records are printed as
    tab-separated rows. Returns None.
    """
    label_list = getLabel('https://book.douban.com/')
    if not label_list:
        # Nothing to choose from (e.g. the home page could not be fetched).
        print('未能獲取圖書類別,請稍後再試。')
        return
    label = get_label(label_list)
    page = label_list[label - 1]  # get_label returns a 1-based number

    books = []
    for i in range(2):
        html = getHTMLText('https://book.douban.com' + page + '?start=' + str(i*20) + '&type=T')
        soup = BeautifulSoup(html, 'html.parser')
        # Each <div class="info"> describes one book.
        for book in soup.find_all('div', attrs={'class': 'info'}):
            books.append(_parse_book(book))

    tplt = "{:3}\t{:15}\t{:15}\t{:10}\t{:4}\t{:7}"  # output row format
    print(tplt.format("序號", "書籍", "作者", "價格", "評分", "評價人數"))
    for count, (name, author, price, score, number) in enumerate(books, start=1):
        print(tplt.format(count, name, author, price, score, number))


if __name__ == '__main__':
    print('豆瓣圖書綜合排序查詢\n')
    getBookInfo()
最後的運行效果:
首先是類別表:
輸入圖書類別的編號後,就可以顯示該類別的圖書信息了:
我這裡只爬取了兩頁的圖書信息。
因為有些書的信息並不完整,所以爬取時可能會出現錯誤。我的正則表達式寫得也不夠嚴謹,不少地方都可能匹配出錯,比如價格的提取。