# encoding: utf-8
#
# author: wuhao
#
# ******
# Crawls book data from dangdang.com; no scraping framework is used.
# main is the entry-point script.
# KindLinks.py and 獲取數據信息.py are the two helper classes.
# KindLinks has a single method; it returns listUrl -- a list of
# {subcategory name: subcategory URL} dicts -- and LB -- the list of
# top-level categories.
# 獲取數據信息 (book info) has two methods: getPage() returns the number of
# result pages for a category, and getInfo() returns the details of every
# book on a page (title, comment count, author, publisher, price,
# publication date). Titles are not parsed any further, so they may be messy.
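#
# For orientation, the shapes these helpers return look roughly like this
# (an illustration only -- the subcategory names, URLs, and values below are
# invented, not real dangdang.com data):
#
#   listUrl = [{"小說": "http://category.dangdang.com/cp01.03.00.00.00.00.html",
#               "文學": "http://category.dangdang.com/cp01.05.00.00.00.00.html"}]
#   LB = ["圖書"]            # one top-level category per dict in listUrl
#   getPage(url)  ->  100    # total number of result pages for that category
#   getInfo(url)  ->  [["title", "1234", "author", "publisher", "¥59.00", "2017-01-01"], ...]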
# Category links on dangdang.com: fetches the URLs for every book subcategory.
from bs4 import BeautifulSoup

class _FirstPageLinkToGetUrl():
    def __init__(self, opener):
        self.opener = opener
        self.url = "http://category.dangdang.com/?ref=www-0-C"

    def getDifferentSeriesBookUrl(self):
        html = self.opener.open(self.url).read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        LB = []       # top-level categories
        listUrl = []  # one {subcategory name: URL} dict per top-level category
        count = []    # number of subcategory URLs per top-level category (not returned)
        # outsideDiv -- the outer div of one top-level category; _li -- one subcategory <li>
        for outsideDiv in soup.find("div", class_="classify_books", id="floor_1").find_all("div", class_="classify_kind"):
            LB.append(outsideDiv.div.a.string)
            temp = 0
            dictUrl = {}
            for _li in outsideDiv.find("ul").find_all("li"):
                if _li.a.string == "更多":  # skip the "more" link
                    continue
                temp += 1
                dictUrl[_li.a.string] = _li.a.get("href")
            count.append(temp)
            listUrl.append(dictUrl)
        return listUrl, LB
# Extracts the book information contained in a category page.
from bs4 import BeautifulSoup
import re

class _GetBookInfo():
    def __init__(self, opener):
        self.opener = opener

    def getPage(self, url):
        # Page data; the raw HTML is also dumped to test.txt for debugging.
        html = self.opener.open(url).read().decode("gbk")
        with open("test.txt", "w") as f:
            f.write(html)
        regex = re.compile(r"<span>/\d+</span>")
        valueNum = re.findall(r"\d+", regex.findall(html)[0])
        return int(valueNum[0])

    def getInfo(self, url):
        html = self.opener.open(url).read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        ulTag = soup.find("ul", class_="list_aa listimg", id=True)
        liTag = ulTag.find_all("li", id=True)
        data1 = []
        # Walk every <li>; skip any book whose markup is missing a field.
        for li in liTag:
            data = []
            try:
                data.append(li.find("p", class_="name").string)
                data.append(li.find("p", class_="star").a.string)
                data.append(li.find("p", class_="author").a.string)
                data.append(li.find("p", class_="publishing").a.string)
                data.append(li.find("p", class_="price").span.string)
                data.append(re.findall(r"/ .+ ", str(li.find("p", class_="publishing_time")))[0]
                            .replace(" ", "").replace("/", ""))
                data1.append(data)
            except Exception:
                continue
        return data1
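# The publication-date extraction in getInfo() assumes markup roughly like the
# hypothetical snippet below (the real dangdang.com HTML may differ):
#
#   p = '<p class="publishing_time">出版時間:/ 2017-01-01 </p>'
#   re.findall(r"/ .+ ", p)[0].replace(" ", "").replace("/", "")  # -> "2017-01-01"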
# encoding: utf-8
from 當當網圖書爬取 import 獲取數據信息 as bookInfo
from 當當網圖書爬取 import KindLinks as kls
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import http.cookiejar
import re
import xlwt
import xlrd
def getCorrectUrl(url, page):
    # Page 0 means the plain category URL (its first page).
    if page == 0:
        return url
    # Insert "pgN-" right after ".com/" to address page N.
    url = url.replace("m/", "m/pg" + str(page) + "-")
    return url
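# Example of the rewrite (the category URL is a made-up illustration):
#   getCorrectUrl("http://category.dangdang.com/cp01.03.00.00.00.00.html", 3)
#   -> "http://category.dangdang.com/pg3-cp01.03.00.00.00.00.html"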
# URL of dangdang.com's all-categories page.
url="http://category.dangdang.com/?ref=www-0-C"
# Build a cookie jar...
Cookie=http.cookiejar.CookieJar()
# ...a handler around it...
CookieHandle=urllib.request.HTTPCookieProcessor(Cookie)
# ...and a cookie-aware opener.
opener=urllib.request.build_opener(CookieHandle)
# Masquerade as a browser via the User-Agent header.
header=\
{
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}
head = []
for key, value in header.items():
    head.append((key, value))
opener.addheaders = head
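# Equivalent one-liner, since addheaders just expects a list of
# (name, value) tuples:
#   opener.addheaders = list(header.items())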
# Open the page once so the opener acquires its cookies.
opener.open(url)
# First fetch the category links via KindLinks.
_kls=kls._FirstPageLinkToGetUrl(opener)
# Link data for the books.
bdata = _kls.getDifferentSeriesBookUrl()
bdata_url = bdata[0]  # all the subcategory URLs we need
bdata_gd = bdata[1]   # the top-level category names
# bdata_count = bdata[2]  # URLs per top-level category; one workbook each (unused)
# Flatten the dicts into parallel lists of names and URLs.
bdata_url_name = []
bdata_url_url = []
print((list(bdata_url[0].values())))
for key in range(len(bdata_url)):
    bdata_url_url.append(list(bdata_url[key].values()))
    bdata_url_name.append(list(bdata_url[key].keys()))
print(bdata_url_name)
print(bdata_url_url[0])
# Instantiate the page scraper.
bio = bookInfo._GetBookInfo(opener)
# Column headers used in the Excel sheets: title, comment count, author,
# publisher, price, publication date.
StyleinfoInExcel = ["書名", "評論數", "作者", "出版社", "價格", "出版日期"]
book = xlwt.Workbook(encoding="utf-8")
# Running total of books written.
count = 0
for _gd in range(len(bdata_url)):
    for _bdata in range(len(bdata_url_name[_gd])):
        page = bio.getPage(bdata_url_url[_gd][_bdata])  # number of result pages
        sheetname = bdata_url_name[_gd][_bdata].replace("/", "-")
        try:
            sheet = book.add_sheet(sheetname=sheetname)
        except Exception:
            continue
        print(sheetname + " is being written...")
        for i in range(len(StyleinfoInExcel)):
            sheet.write(0, i, StyleinfoInExcel[i])
        # Read the data and write it into the sheet.
        temp = 0
        for CurrentPage in range(1, page + 1):  # crawl pages 1..page
            try:
                data = bio.getInfo(getCorrectUrl(bdata_url_url[_gd][_bdata], CurrentPage))
                # Write one row per book.
                for i in range(len(data)):
                    temp += 1
                    for j in range(len(data[i])):
                        sheet.write(temp, j, data[i][j])
                    count += 1
            except Exception:
                continue
        print(str(count) + " books written so far")
        print(sheetname + " finished...\r\n")
        if _bdata == len(bdata_url_name[_gd]) - 1:
            # One workbook per top-level category; save it and start a fresh one.
            book.save(bdata_gd[_gd].replace("/", "-") + ".xls")
            book = xlwt.Workbook(encoding="utf-8")
            print("-------- finished " + bdata_gd[_gd])
#
print("All done, " + str(count) + " books in total")
# No concurrency is used, so the crawl takes a very long time. Roughly
# 1,280,000 books were written; some categories were still unfinished, and
# the program needs further polishing.
# This is just a small practice program, so the data has not been cleaned
# very thoroughly.
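# One possible direction for the missing concurrency: fan the subcategory URLs
# out over a thread pool. A minimal sketch under two assumptions --
# fetch_category is a hypothetical wrapper around the per-category
# getPage()/getInfo() loop above, and xlwt workbooks are not thread-safe, so
# the Excel writing must stay on the main thread.
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_category(name, url):
    # Hypothetical helper: crawl every page of one subcategory, return its rows.
    rows = []
    for p in range(1, bio.getPage(url) + 1):
        rows += bio.getInfo(getCorrectUrl(url, p))
    return name, rows

# with ThreadPoolExecutor(max_workers=8) as pool:
#     futures = [pool.submit(fetch_category, name, url)
#                for cat in bdata_url for name, url in cat.items()]
#     for future in as_completed(futures):
#         name, rows = future.result()  # write rows to Excel here, single-threaded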
