Python爬蟲案例-爬取當當網數據


  輸入關鍵字,爬取當當網中商品的基本數據,代碼如下:

 1 # Author:K
 2 import requests
 3 from lxml import etree
 4 from fake_useragent import UserAgent
 5 import re
 6 import csv
 7 
 8 
 9 def get_page(key):
10     for page in range(1,50):
11         url = 'http://search.dangdang.com/?key=%s&act=input&page_index=%s' % (key,page)
12         headers = {
13             'User-Agent':UserAgent().random
14         }
15         response = requests.get(url = url,headers = headers)
16         parse_page(response)
17         print('page %s over!!!' % page)
18 
def _first_match(node, path):
    """Return the first result of evaluating ``path`` on ``node``, or '' if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


def parse_page(response):
    """Turn each search-result item into one CSV row and pass it to ``save_data``.

    Each row always holds seven fields in this order: title, link, price,
    author, publication date, comment count, description. A field that is
    absent from an item is written as an empty string, so the columns stay
    aligned with the header row.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.
    """
    tree = etree.HTML(response.text)
    for li in tree.xpath('//ul[@class="bigimg"]/li'):
        title = _first_match(li, './p[@class="name"]/a/@title').strip()
        commodity_url = _first_match(li, './p[@class="name"]/a/@href')
        price = _first_match(li, './p[@class="price"]/span[1]/text()')
        # The author span may contain nested nodes, so join all of its text.
        author = ''.join(
            li.xpath('./p[@class="search_book_author"]/span[1]//text()')
        ).strip()
        # Drop the '/' separator that precedes the publication date.
        raw_pub_time = _first_match(
            li, './p[@class="search_book_author"]/span[2]/text()'
        )
        pub_time = raw_pub_time.replace('/', '').strip()
        comment_count = _first_match(li, './p[@class="search_star_line"]/a/text()')
        # Some items have no description; it then stays ''.
        commodity_detail = _first_match(li, './p[@class="detail"]/text()')

        save_data([
            title,
            commodity_url,
            price,
            author,
            pub_time,
            comment_count,
            commodity_detail,
        ])
51 
def save_data(data):
    """Write ``data`` as a single row through the module-level CSV ``writer``."""
    row = data
    writer.writerow(row)
54 
55 
def main():
    """Start the crawl with the fixed search keyword."""
    # The keyword is hard-coded; it could be read interactively instead.
    get_page('python')
59 
# Open the output file in a ``with`` block so it is closed even when the
# crawl raises. ``writer`` stays a module-level name because ``save_data``
# looks it up globally.
with open('當當網.csv', 'w', encoding='utf-8-sig', newline='') as fp:
    writer = csv.writer(fp)
    header = ['標題', '鏈接', '價格', '作者', '出版時間', '評論數', '簡介']
    writer.writerow(header)
    main()

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM