#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/8/28 10:05
# @Author : aqiong
# @Site :
# @File : 貼吧爬蟲.py
# @Software: PyCharm
#https://tieba.baidu.com/f?kw=友誼已走到盡頭
#后綴:
#1.請求網頁,獲取數據,2,解析數據 3.存儲數據
import re
import xlwt#存儲數據
from bs4 import BeautifulSoup#分析網頁
import urllib.request,urllib.error#請求網頁
# Pre-compiled regexes that pull individual fields out of one thread's
# HTML fragment on the Tieba thread-list page.
# Thread title
findTitle = re.compile(r'<a class="j_th_tit".*?title=".*?".*?>(.*?)</a>',re.S)
# Thread author
findauthor = re.compile(r'<span class="tb_icon_author" data-field=.*?title="(.*?)"><i class=.*?</span>')
# Post time
findTime = re.compile(r'<div class="threadlist_abs threadlist_abs_onlyline">(.*?)</span>')
# Post content (abstract line)
findContent = re.compile(r'<div class="threadlist_abs threadlist_abs_onlyline">(.*?)</div>',re.S)
# Last replier
findLastReader = re.compile(r'<span class="tb_icon_author_rely j_replyer" title="(.*?)">')
# Last reply time
findLastTime=re.compile(r'<span class="threadlist_reply_date pull_right j_reply_data" title="最后回復時間">(.*?)</span>',re.S)
# Cover image URL(s) — taken from the lazy-load "bpic" attribute
#findImgSrc = re.compile(r' <ul class="threadlist_media j_threadlist_media clearfix">*?src="(.*?).*?</ul>',re.S)
findImgSrc = re.compile(r'<img attr=.*?bpic="(.*?)".*?class="threadlist_pic j_m_pic".*?src=""/>',re.S)
# Reply count
findReadNum = re.compile(r'<span class="threadlist_rep_num center_text" title="回復">(\d*?)</span>')
def main():
    """Scrape the Tieba forum thread list and dump it into an Excel file."""
    # Target forum URL; the kw query parameter is the URL-encoded forum name.
    base_url = 'https://tieba.baidu.com/f?kw=%E5%8F%8B%E8%B0%8A%E5%B7%B2%E8%B5%B0%E5%88%B0%E5%B0%BD%E5%A4%B4'
    # Total number of thread entries to fetch (pages hold 50 each).
    total = 200
    rows = getdata(base_url, total)
    saveData(rows, '貼吧帖子.xls')
def _first(pattern, text, default=''):
    # Return the first match of a compiled pattern in text, or default
    # when nothing matches (avoids IndexError on findall(...)[0]).
    found = pattern.findall(text)
    return found[0] if found else default


def _parse_item(item):
    # Extract one spreadsheet row from a single thread's HTML fragment.
    # Column order matches the header row written by saveData().
    data = [
        _first(findTitle, item),
        _first(findauthor, item, ' '),
        _first(findTime, item),
        _first(findContent, item),
        _first(findLastReader, item),
        _first(findLastTime, item),
        # A single string, not the raw findall() list — writing a list
        # into an xlwt cell is what made the sheet come out wrong.
        _first(findReadNum, item),
    ]
    # Join every image URL into one cell so the row keeps exactly the
    # 8 columns the header declares (no duplicate first image either).
    imglist = findImgSrc.findall(item)
    data.append('\n'.join(imglist) if imglist else ' ')
    return data


def getdata(baseUrl, n):
    """Fetch up to n thread entries from the forum, 50 per page.

    Returns a list of 8-element rows: title, author, post time, content,
    last replier, last reply time, reply count, image URL(s).
    """
    datalist = []
    for index in range(0, n, 50):
        # pn is the thread offset of the page being requested.
        html = askURL(baseUrl + '&ie=utf-8&pn=' + str(index))
        if not html:
            continue  # request failed — skip this page instead of crashing
        # The thread list is wrapped in HTML comment markers; strip them
        # so BeautifulSoup actually parses the content.
        html = html.replace('<!--', '').replace('-->', '')
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE: 'cleafix' is how the class is really spelled in the page.
        for item in soup.find_all('div', class_='t_con cleafix'):
            datalist.append(_parse_item(str(item)))
    return datalist
def askURL(baseUrl):
    """GET the given URL and return the response body decoded as UTF-8.

    Returns an empty string when the request fails (the error is printed).
    """
    html = ''
    # Spoof a desktop browser so Tieba serves the full desktop page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36","Accept-Language": "zh-CN,zh;q=0.9"}
    request = urllib.request.Request(url=baseUrl, headers=headers)
    try:
        # Context manager closes the connection even if .read() raises —
        # the original leaked the response object.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        print(e)
    return html
def saveData(datalist, name):
    """Write the scraped rows into an .xls workbook saved at path *name*."""
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet('帖子內容', cell_overwrite_ok=True)
    headers = ('標題','主題作者','發布時間','發布內容','最后回復人','最后回復時間','回復數','鎮樓圖')
    # Header row at row 0.
    for col_idx, title in enumerate(headers):
        worksheet.write(0, col_idx, title)
    # Data rows start at row 1, directly below the header.
    for row_idx, row in enumerate(datalist, start=1):
        for col_idx, cell in enumerate(row):
            worksheet.write(row_idx, col_idx, cell)
    workbook.save(name)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()