一.爬取網站數據
大體思路,采用requests模塊爬取頁面源代碼,處理網頁反爬機制(加入headers模擬人工訪問瀏覽器),再采用re模塊進行信息處理分割,取得我所需要的信息。整合為列表方便下一步處理。
import re
import requests

# Scrape the Faloo novel ranking page. The browser-like headers below are
# the site's anti-bot workaround described in the surrounding text.
url = 'https://b.faloo.com/y_0_1.html'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate,br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Cookie': 'host4chongzhi=http%3a%2f%2fb.faloo.com%2f; Hm_lvt_6d308f6626f6d0864b6bb4f348f2b5e5=1648727329; curr_url=https%3A//b.faloo.com/y_0_1.html; Hm_lpvt_6d308f6626f6d0864b6bb4f348f2b5e5=1648727414',
    'Host': 'b.faloo.com',
    'Referer': 'https://b.faloo.com/Rank_1.html',
    # Fix: header values must be str, not int — requests raises on non-string values.
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# Fix: the headers dict was built but never passed to requests.get(), so the
# anti-scraping measure had no effect. Also add a timeout so a stalled
# connection cannot hang the script indefinitely.
response = requests.get(url, headers=headers, timeout=30).text

print("開始爬取網站數據!")

# Book titles come from the <h1 title="..."> attribute of each entry.
title = re.findall(r'<h1 class="fontSize17andHei" title="(.*?)">', response)
print("小說標題爬取完成!")

# Each author cell is a <span> wrapping an <a>. Some links wrap an <img>
# instead of plain text; for those the author name is in the link's
# title="..." attribute.
span_text = re.findall(r'<span class="fontSize14andsHui">(.*?)</span>', response)
# Patterns hoisted out of the loop instead of recompiling per item.
a_pattern = re.compile(r'<a.*?>(.*?)</a>')
img_pattern = re.compile(r'<img.*?')
author_attr_pattern = re.compile(r'<a.*title="(.*?)" target="_blank">')
author_list = []
for item in span_text:
    a_title = a_pattern.findall(item)
    if img_pattern.match(a_title[0]) is not None:
        author = author_attr_pattern.findall(a_title[0])[0]
    else:
        author = a_title[0]
    author_list.append(author)
print("作者信息爬取完成!")

# Novel genre/category, again from the <a title="..."> attribute.
type_attr_pattern = re.compile(r'<a.*title="(.*?)".*')
novel_type_li = [
    type_attr_pattern.findall(cell)[0]
    for cell in re.findall(r'<span class="fontSize14andHui">(.*?)</span>', response)
]
print("文章類型信息爬取完成!")

# Last-update date, weekly click count, and word count.
novel_date_list = re.findall(r'<font color=".*?">(.*?)</font>', response)
print("文章更新爬取完成!")
novel_click_list = re.findall(r'<span>周點擊:(.*?)</span>', response)
print("文章周點擊數爬取完成!")
novel_number_list = re.findall(r'<span>字數:(.*?)</span>', response)
print("文章字數爬取完成!")

# One row per novel: [title, author, type, clicks, words, date].
# zip pairs the parallel lists directly instead of indexing by range(len(...)).
lis = [
    list(row)
    for row in zip(title, author_list, novel_type_li,
                   novel_click_list, novel_number_list, novel_date_list)
]
二.將爬取數據存入Excel表格
import xlwt

# Dump the scraped rows in `lis` into an .xls workbook.
print("開始創建Execl表格!")
f = xlwt.Workbook()
sheet1 = f.add_sheet('小說', cell_overwrite_ok=True)
# Fix: the original header order was 書名/作者/小說類型/日期/字數/周點擊數, but each
# row of `lis` is [title, author, type, clicks, words, date], so the 日期 and
# 周點擊數 columns were mislabelled. Header now matches the data layout.
row0 = ["書名", "作者", "小說類型", "周點擊數", "字數", "日期"]
for col, caption in enumerate(row0):
    sheet1.write(0, col, caption)
# Fix: lis.index(row) finds the FIRST equal row, so two identical novels
# would be written to the same Excel row (and each lookup is O(n)).
# enumerate gives the true, unique row number.
for row_idx, row in enumerate(lis, start=1):
    for col_idx, value in enumerate(row):
        sheet1.write(row_idx, col_idx, value)
f.save('飛盧小說信息網.xls')
print("Execl表格創建完成!")
三.將數據寫入數據庫中
import pymysql

# Persist the scraped rows in `lis` to MySQL. Expected schema:
# create table novel(id int(8) not null auto_increment primary key,
#   book_name varchar(100), author varchar(25), book_type varchar(8),
#   week_clicks varchar(25), word_counts varchar(30), book_date varchar(10));
con = pymysql.connect(host='192.168.135.165', user='root',
                      password='123456', database='xiaoshuo')
print("數據庫連接成功!")
try:
    with con.cursor() as cur:
        # Fix: the original spliced values in with %-string formatting, which
        # breaks on any title containing a quote and is SQL-injectable. Use a
        # parameterized executemany so the driver escapes every value.
        # (Also drops the no-op `for j in range(0, 1)` inner loop and the
        # redundant per-row `select * from novel`.)
        cur.executemany(
            "insert into novel values(NULL,%s,%s,%s,%s,%s,%s)",
            [tuple(row) for row in lis],
        )
    print("數據寫入完成!")
    con.commit()
finally:
    # Fix: close the connection even if an insert fails.
    con.close()
四.結果
將以上代碼整合,就是如下效果