Scraping Website Data with Python


1. Scraping the website data

The overall approach: use the requests module to fetch the page source, work around the site's anti-scraping checks by adding headers that mimic a real browser visit, then use the re module to extract and split out the information I need, and finally assemble it into lists for the next step.
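Before writing the full parser, it is worth confirming that the headers actually get past the block. A minimal sketch with a trimmed-down header set (the status-code check and timeout are my additions, not part of the original post):

import requests

url = 'https://b.faloo.com/y_0_1.html'
# A realistic User-Agent is usually the key field for this kind of check
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/96.0.4664.45 Safari/537.36'
}
resp = requests.get(url, headers=headers, timeout=10)
print(resp.status_code)   # expect 200 if the request is accepted
print(resp.text[:200])    # peek at the HTML to confirm it is the real page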

import re
import requests

# Request the page
url = 'https://b.faloo.com/y_0_1.html'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Cookie': 'host4chongzhi=http%3a%2f%2fb.faloo.com%2f; Hm_lvt_6d308f6626f6d0864b6bb4f348f2b5e5=1648727329; curr_url=https%3A//b.faloo.com/y_0_1.html; Hm_lpvt_6d308f6626f6d0864b6bb4f348f2b5e5=1648727414',
    'Host': 'b.faloo.com',
    'Referer': 'https://b.faloo.com/Rank_1.html',
    'Upgrade-Insecure-Requests': '1',  # header values must be strings
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# Pass headers=headers, otherwise the browser disguise above is never sent
response = requests.get(url, headers=headers).text

# Filter out the data we need
print("Starting to scrape the site!")
a_pattern = re.compile(r'<h1 class="fontSize17andHei" title="(.*?)">')
title = re.findall(a_pattern, response)
# print(title)
print("Novel titles scraped!")

author_pattern = re.compile(r'<span class="fontSize14andsHui">(.*?)</span>')
span_text = re.findall(author_pattern, response)
# print(span_text)

# Some author cells wrap the name in an <img> link, so the name has to be
# pulled from the <a> tag's title attribute instead of the link text
author_list = []
a_title_pattern = re.compile(r'<a.*?>(.*?)</a>')
image_pattern = re.compile(r'<img.*?')
for item in span_text:
    a_title = re.findall(a_title_pattern, item)
    # print(a_title)
    if image_pattern.match(a_title[0]) is not None:
        author = re.findall(r'<a.*title="(.*?)" target="_blank">', a_title[0])[0]
    else:
        author = a_title[0]
    author_list.append(author)
# print(len(author_list))
print("Author info scraped!")

novel_type_pattern = re.compile(r'<span class="fontSize14andHui">(.*?)</span>')
novel_type_list = re.findall(novel_type_pattern, response)
novel_type_li = []
for novel_type_name in novel_type_list:
    novel_type = re.findall(r'<a.*title="(.*?)".*', novel_type_name)
    novel_type_li.append(novel_type[0])
# print(len(novel_type_li))
print("Genre info scraped!")

novel_date = re.compile(r'<font color=".*?">(.*?)</font>')
novel_date_list = re.findall(novel_date, response)
# print(novel_date_list)
print("Update dates scraped!")

novel_click = re.compile(r'<span>周點擊:(.*?)</span>')
novel_click_list = re.findall(novel_click, response)
# print(novel_click_list)
print("Weekly click counts scraped!")

novel_number = re.compile(r'<span>字數:(.*?)</span>')
novel_number_list = re.findall(novel_number, response)
# print(novel_number_list)
print("Word counts scraped!")

# Assemble one row per novel: title, author, genre, weekly clicks, word count, date
lis = []
for i in range(len(novel_date_list)):
    li = [title[i], author_list[i], novel_type_li[i],
          novel_click_list[i], novel_number_list[i], novel_date_list[i]]
    lis.append(li)
# print(lis)
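One fragile spot: the final loop indexes all six lists by the length of novel_date_list, so a single missed regex match shifts every later row out of alignment. A hedged alternative using the same variables, where zip() keeps the rows aligned and a quick length check exposes any mismatch:

fields = [title, author_list, novel_type_li,
          novel_click_list, novel_number_list, novel_date_list]
print([len(f) for f in fields])  # all six counts should be equal

# zip() stops at the shortest list, so every row stays internally consistent
lis = [list(row) for row in zip(*fields)]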

2. Saving the scraped data to an Excel spreadsheet

import xlwt

print("Creating the Excel workbook!")
f = xlwt.Workbook()
sheet1 = f.add_sheet('小說', cell_overwrite_ok=True)
# Column order matches the rows built above:
# title, author, genre, weekly clicks, word count, date
row0 = ["書名", "作者", "小說類型", "周點擊數", "字數", "日期"]
for i in range(len(row0)):
    sheet1.write(0, i, row0[i])
# enumerate() gives a reliable row index; lis.index() would return the
# wrong row whenever two novels happened to share identical data
for row_num, row in enumerate(lis):
    for col_num in range(len(row)):
        sheet1.write(row_num + 1, col_num, row[col_num])
# Save once after all rows are written, not inside the loop
f.save('飛盧小說信息網.xls')
print("Excel file created!")
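xlwt can only produce the legacy .xls format, which caps a sheet at 65,536 rows. If a modern .xlsx file is wanted instead, here is a minimal sketch using openpyxl as a substitute (my swap, not part of the original post), reusing the lis rows from step one:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = '小說'
# Header row, then one append() per novel
ws.append(["書名", "作者", "小說類型", "周點擊數", "字數", "日期"])
for row in lis:
    ws.append(row)
wb.save('飛盧小說信息網.xlsx')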

3. Writing the data into a MySQL database

import pymysql

# Table schema used below (run once in MySQL):
# create table novel(id int(8) not null auto_increment primary key,
#     book_name varchar(100), author varchar(25), book_type varchar(8),
#     week_clicks varchar(25), word_counts varchar(30), book_date varchar(10));
con = pymysql.connect(host='192.168.135.165', user='root',
                      password='123456', database='xiaoshuo')
cur = con.cursor()
print("Database connection established!")
for i in lis:
    # Parameterized query: pymysql escapes the values itself, which also
    # avoids the SQL injection risk of %-formatting them into the statement
    cur.execute("insert into novel values(NULL, %s, %s, %s, %s, %s, %s)",
                (i[0], i[1], i[2], i[3], i[4], i[5]))
cur.execute('select * from novel')
# a = cur.fetchall()
# print(a)
print("Data written to the database!")
con.commit()
con.close()
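Since every row in lis has the same six-field shape, the insert loop can also be collapsed into a single executemany() call, which lets pymysql send the rows as one batch. A minimal sketch, assuming the con and lis objects from above:

sql = "insert into novel values(NULL, %s, %s, %s, %s, %s, %s)"
with con.cursor() as cur:
    # executemany() runs the same statement once per row in lis
    cur.executemany(sql, lis)
    print(f"{cur.rowcount} rows inserted")
con.commit()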

4. Results

Putting all of the code above together gives the complete scraper: one run fills both the Excel file and the novel table with the ranking data.
