Scraping Website Information with Python


 1. Scraping the Website Data

The general approach: use the requests module to fetch the page source, work around the site's anti-scraping check by adding headers that mimic a normal browser visit, then use the re module to split the markup and pull out the fields I need, and finally assemble everything into lists for the next step.

import re
import requests

# Request the page
url = 'https://b.faloo.com/y_0_1.html'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Cookie': 'host4chongzhi=http%3a%2f%2fb.faloo.com%2f; Hm_lvt_6d308f6626f6d0864b6bb4f348f2b5e5=1648727329; curr_url=https%3A//b.faloo.com/y_0_1.html; Hm_lpvt_6d308f6626f6d0864b6bb4f348f2b5e5=1648727414',
    'Host': 'b.faloo.com',
    'Referer': 'https://b.faloo.com/Rank_1.html',
    'Upgrade-Insecure-Requests': '1',  # header values must be strings
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# Pass the headers so the request looks like an ordinary browser visit
response = requests.get(url, headers=headers).text

# Filter the data
print("Starting to scrape the site data!")
title_pattern = re.compile(r'<h1 class="fontSize17andHei" title="(.*?)">')
title = re.findall(title_pattern, response)
print("Novel titles scraped!")

author_pattern = re.compile(r'<span class="fontSize14andsHui">(.*?)</span>')
span_text = re.findall(author_pattern, response)

# Some author cells wrap an <img> tag inside the link; in that case the
# name has to be pulled from the link's title attribute instead.
a_title_pattern = re.compile(r'<a.*?>(.*?)</a>')
image_pattern = re.compile(r'<img')
author_list = []
for item in span_text:
    a_title = re.findall(a_title_pattern, item)
    if image_pattern.match(a_title[0]) is not None:
        author = re.findall(r'<a.*title="(.*?)" target="_blank">', a_title[0])[0]
    else:
        author = a_title[0]
    author_list.append(author)
print("Author information scraped!")

novel_type_pattern = re.compile(r'<span class="fontSize14andHui">(.*?)</span>')
novel_type_list = re.findall(novel_type_pattern, response)
novel_type_li = []
for novel_type_name in novel_type_list:
    novel_type = re.findall(r'<a.*title="(.*?)".*', novel_type_name)
    novel_type_li.append(novel_type[0])
print("Genre information scraped!")

novel_date = re.compile(r'<font color=".*?">(.*?)</font>')
novel_date_list = re.findall(novel_date, response)
print("Update dates scraped!")

novel_click = re.compile(r'<span>周点击:(.*?)</span>')
novel_click_list = re.findall(novel_click, response)
print("Weekly click counts scraped!")

novel_number = re.compile(r'<span>字数:(.*?)</span>')
novel_number_list = re.findall(novel_number, response)
print("Word counts scraped!")

# One row per novel: title, author, genre, weekly clicks, word count, date
lis = []
for i in range(len(novel_date_list)):
    lis.append([title[i], author_list[i], novel_type_li[i],
                novel_click_list[i], novel_number_list[i], novel_date_list[i]])
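Because every field comes from its own regex, the six lists can silently fall out of step if the page markup changes for one field but not the others. A small sanity check before assembling the rows catches this early; a minimal sketch, using the variable names from the script above:

# Sketch: confirm all six field lists have the same length before zipping.
field_lists = {
    'title': title,
    'author': author_list,
    'type': novel_type_li,
    'clicks': novel_click_list,
    'words': novel_number_list,
    'date': novel_date_list,
}
lengths = {name: len(values) for name, values in field_lists.items()}
if len(set(lengths.values())) != 1:
    raise ValueError(f"field lists out of sync: {lengths}")

# zip() builds the same rows as the loop above in one expression
lis = [list(row) for row in zip(*field_lists.values())]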

 2. Saving the Scraped Data to an Excel Spreadsheet

import xlwt

print("Creating the Excel workbook!")
f = xlwt.Workbook()
sheet1 = f.add_sheet('小说', cell_overwrite_ok=True)
# Header row in the same column order as the rows in lis:
# title, author, genre, weekly clicks, word count, date
row0 = ["书名", "作者", "小说类型", "周点击数", "字数", "日期"]
for i in range(len(row0)):
    sheet1.write(0, i, row0[i])
# enumerate avoids the lis.index() lookup, which breaks on duplicate rows
for row_index, row in enumerate(lis, start=1):
    for col_index, value in enumerate(row):
        sheet1.write(row_index, col_index, value)
# Save once after the loop instead of rewriting the file on every row
f.save('飞卢小说信息网.xls')
print("Excel workbook saved!")
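Note that xlwt can only write the legacy .xls format, which caps a sheet at 65,536 rows. If a modern .xlsx file is preferred, openpyxl is a straightforward alternative; a minimal sketch, assuming openpyxl is installed and lis comes from part 1:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = '小说'
# Header row in the same column order as the rows in lis
ws.append(["书名", "作者", "小说类型", "周点击数", "字数", "日期"])
for row in lis:
    ws.append(row)  # each append() writes one row
wb.save('飞卢小说信息网.xlsx')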

 3. Writing the Data into the Database

import pymysql

# Table schema used below:
# create table novel(id int(8) not null auto_increment primary key,
#     book_name varchar(100), author varchar(25), book_type varchar(8),
#     week_clicks varchar(25), word_counts varchar(30), book_date varchar(10));
con = pymysql.connect(host='192.168.135.165', user='root',
                      password='123456', database='xiaoshuo')
cur = con.cursor()
print("Database connection established!")
# Parameterized query: the driver quotes and escapes each value safely
sql = "insert into novel values(NULL, %s, %s, %s, %s, %s, %s)"
for row in lis:
    cur.execute(sql, row)
print("Data written!")
con.commit()
con.close()
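For larger result sets, one round trip per row adds up. pymysql's executemany sends the whole batch in a single call; a short sketch, reusing sql, cur, con, and lis from above:

cur.executemany(sql, lis)  # one batched call instead of one execute per row
con.commit()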

 4. Results

Integrating the three parts above gives the complete scraper: running it fills both the Excel spreadsheet and the novel table in the database.

