爬蟲之獲取當當網全部圖書

本文轉載自查看原文 2017-04-02 17:10 3071 python/ crawl

#encoding:utf-8
#
#author:wuhao
#

#******

#爬取當當網圖書，未使用框架

#main是主函數

#KindLinks.py和獲取數據信息.py 是2個封裝的類

#KindLinks只有一個方法，它返回的是 listUrl---(name（小分類名稱）,url（小分類對應的鏈接）) LB---(總的分類)

#獲取數據信息有2個方法，---getpage(),getinfo() getpage()返回的是頁碼數，getinfo()返回的是每本書中的信息（書名，評論數，作者，出版社，價格，出版日期）書名我沒有進行進一步的解析，可能比較雜亂

#當當網商品種類鏈接，獲取不同種類的所有圖書

from bs4 import BeautifulSoup

class _FirstPageLinkToGetUrl():
    """Collect the book sub-category links from the Dangdang category index page.

    Attributes are set in ``__init__``:
      opener -- object whose ``open(url)`` returns a response with ``read()``
      url    -- address of the category index page that gets scraped
    """

    def __init__(self,opener):
        self.opener=opener
        self.url="http://category.dangdang.com/?ref=www-0-C"

    def getDifferentSeriesBookUrl(self):
        """Scrape the category index and return ``(listUrl, LB)``.

        Returns:
            listUrl: list with one dict per top-level category, mapping each
                sub-category name to the ``href`` of its link.
            LB: list of the top-level category names, in the same order as
                ``listUrl``.

        Raises:
            AttributeError: if the expected ``div.classify_books#floor_1``
                container (or a category header / ``<ul>``) is not in the page.
        """
        html=self.opener.open(self.url).read().decode("gbk")
        soup=BeautifulSoup(html,"html.parser")

        LB=[]          # top-level category names
        listUrl=[]     # one {sub-category name: href} dict per top-level category

        floor=soup.find("div", class_="classify_books", id="floor_1")
        # outsideDiv is the outer <div> of one top-level category.
        for outsideDiv in floor.find_all("div", class_="classify_kind"):
            LB.append(outsideDiv.div.a.string)
            dictUrl={}
            for _li in outsideDiv.find("ul").find_all("li"):
                anchor=_li.a
                if anchor is None:
                    # An <li> without a link carries no sub-category; skip it
                    # instead of failing on ``None.string``.
                    continue
                name=anchor.string
                if name is None:
                    # ``.string`` is None when the anchor wraps more than one
                    # child; fall back to its text so the key is never None.
                    name=anchor.get_text()
                if name == "更多":
                    # The "more" entry is a navigation link, not a category.
                    continue
                dictUrl[name]=anchor.get("href")
            listUrl.append(dictUrl)
        return listUrl,LB

#獲取網頁中包含的圖書的信息
from bs4 import BeautifulSoup
import re
class _GetBookInfo():
    """Fetch Dangdang listing pages and extract page counts and book rows.

    ``opener`` is any object whose ``open(url)`` returns a response with
    ``read()`` yielding GBK-encoded bytes.
    """

    def __init__(self,opener):
        self.opener=opener

    def getPage(self,url):
        """Return the page count advertised by the listing page at *url*.

        The count is read from the first ``<span>/N</span>`` fragment in the
        page. When no such fragment exists 1 is returned instead of raising
        IndexError -- this assumes a listing without a pager is a single
        page; TODO confirm against the live site.
        """
        html = self.opener.open(url).read().decode("gbk")  # page source as text
        # NOTE(review): this dump of the last fetched page looks like leftover
        # debugging output. It is kept, but written as UTF-8 explicitly so a
        # platform default encoding that cannot represent the text no longer
        # raises UnicodeEncodeError.
        with open("test.txt","w",encoding="utf-8") as f:
            f.write(html)
        pager = re.search(r"<span>/(\d+)</span>", html)
        if pager is None:
            return 1
        return int(pager.group(1))

    def getInfo(self,url):
        """Return one row per book found on the listing page at *url*.

        Each row is ``[name, star link text, author, publisher, price,
        publication date]``, taken from the ``<p>`` elements with classes
        ``name``, ``star``, ``author``, ``publishing``, ``price`` and
        ``publishing_time`` inside each ``<li id=...>`` of the
        ``ul.list_aa.listimg`` list. Books missing any of these elements are
        skipped. An empty list is returned when the list itself is absent.
        """
        html = self.opener.open(url).read().decode("gbk")
        soup = BeautifulSoup(html,"html.parser")

        ulTag = soup.find("ul",class_="list_aa listimg",id=True)
        if ulTag is None:
            # No book list on this page: nothing to extract.
            return []

        data1 = []
        for li in ulTag.find_all("li",id=True):
            try:
                data = [
                    li.find("p",class_="name").string,
                    li.find("p",class_="star").a.string,
                    li.find("p",class_="author").a.string,
                    li.find("p",class_="publishing").a.string,
                    li.find("p",class_="price").span.string,
                    # The date sits between "/ " and the following space in
                    # the publishing_time markup; strip the delimiters.
                    re.findall(r"/ .+ ",str(li.find("p", class_="publishing_time")))[0].replace(" ","").replace("/",""),
                ]
            except (AttributeError, IndexError):
                # A missing element (find() returned None) or a date that does
                # not match: drop this book, keep the rest of the page.
                continue
            data1.append(data)
        return data1


#
'''
    def getDifferentSeriesBookUrl(self):
        html=self.opener.open(self.url).read().decode("gbk")

        soup=BeautifulSoup(html)
        #類別
        LB = []
        # 字典存儲小類別對應的URL
        dictUrl = {}
        #outside  ---外層的div
        #_li      ---li層
        for outsideDiv in soup.find("div", class_="classify_books", id="floor_1").find_all("div", class_="classify_kind"):
            LB.append(outsideDiv.div.a.string)
            for _li in outsideDiv.find("ul").find_all("li"):
                if _li.a.string == "更多":
                    continue
                else:
                   # print(s.a.get("href"), s.a.string)
                    dictUrl[_li.a.string] = _li.a.get("href")

        return dictUrl,LB
'''

#-encoding:utf-8
from 當當網圖書爬取 import 獲取數據信息 as bookInfo
from 當當網圖書爬取 import KindLinks as kls
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import http.cookiejar
import re
import xlwt
import xlrd

def getCorrectUrl(url,page):
    """Return *url* rewritten to address listing page number *page*.

    For ``page == 0`` the URL is returned unchanged. Otherwise the marker
    ``pg<page>-`` is inserted directly after the first ``"m/"`` in *url*
    (presumably the end of the ``.com/`` host part -- verify against the
    URLs the caller passes). Only the first occurrence is rewritten, so a
    later path segment that happens to end in ``m/`` is left alone.
    """
    if page==0:
        return url
    return url.replace("m/","m/pg"+str(page)+"-",1)


# Index page that lists every Dangdang product category.
url="http://category.dangdang.com/?ref=www-0-C"

# Cookie jar -> cookie-aware handler -> opener used for every request.
Cookie=http.cookiejar.CookieJar()
CookieHandle=urllib.request.HTTPCookieProcessor(Cookie)
opener=urllib.request.build_opener(CookieHandle)

# Present the requests as coming from a desktop browser.
header={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}
head=list(header.items())
opener.addheaders=head

# Open the index page once so the opener picks up the site's cookies.
opener.open(url)

# Fetch the sub-category links through the KindLinks helper.
_kls=kls._FirstPageLinkToGetUrl(opener)
bdata=_kls.getDifferentSeriesBookUrl()

bdata_url=bdata[0]          # one {sub-category name: url} dict per top-level category
bdata_gd=bdata[1]           # top-level category names

# Split every dict into parallel lists of names and urls.
print(list(bdata_url[0].values()))
bdata_url_url=[list(links.values()) for links in bdata_url]
bdata_url_name=[list(links.keys()) for links in bdata_url]
print(bdata_url_name)
print(bdata_url_url[0])

# Page scraper bound to the same opener.
bio=bookInfo._GetBookInfo(opener)

# Column headers written to the first row of every sheet.
StyleinfoInExcel=["書名","評論數","作者","出版社","價格","出版日期"]
book=xlwt.Workbook(encoding="utf-8")

# Running total of books written across all workbooks.
count=0

# Crawl every sub-category of every top-level category. Each top-level
# category becomes one .xls workbook and each sub-category one sheet in it.
for _gd in range(len(bdata_url)):
    # Number of sheets created in the current workbook; saving is skipped
    # when it stays 0.
    sheets_added=0
    for _bdata in range(len(bdata_url_name[_gd])):
        page = bio.getPage(bdata_url_url[_gd][_bdata])           # total number of listing pages
        sheetname=bdata_url_name[_gd][_bdata].replace("/", "-")
        try:
            sheet=book.add_sheet(sheetname=sheetname)
        except Exception as exc:
            # The sheet could not be created: skip this sub-category, but
            # report it instead of failing silently.
            print(sheetname+"建立工作表失敗，已跳過："+repr(exc))
            continue
        sheets_added+=1
        print(sheetname+"正在寫入...")
        for i, title in enumerate(StyleinfoInExcel):
            sheet.write(0,i,title)
        # Read each listing page and append its rows to the sheet.
        temp=0                                                   # last row index written in this sheet
        # Pages are numbered 1..page inclusive, so the range must end at
        # page+1; stopping at page would never fetch the last page.
        for CurrentPage in range(1,page+1):
            try:
                data=bio.getInfo(getCorrectUrl(bdata_url_url[_gd][_bdata],CurrentPage))
                for row in data:
                    temp+=1
                    for j, cell in enumerate(row):
                        sheet.write(temp,j,cell)
                    count+=1
            except Exception as exc:
                # Best effort: a page that fails to download, parse or write
                # is skipped. ``Exception`` (not a bare except) keeps Ctrl-C
                # working during the long crawl.
                print(sheetname+"第"+str(CurrentPage)+"頁處理失敗，已跳過："+repr(exc))
                continue
        print("已寫入"+str(count)+"本書")
        print(sheetname+"寫入完成...\r\n")

    # Save once per top-level category, after all of its sub-categories.
    # Doing this here (rather than on the last inner iteration) means a failed
    # add_sheet on the last sub-category can no longer skip the save and leak
    # sheets into the next category's workbook.
    if sheets_added:
        book.save(bdata_gd[_gd].replace("/","-")+".xls")
        print("--------已完成"+bdata_gd[_gd])
    book = xlwt.Workbook(encoding="utf-8")

print("寫入完成，共計"+str(count)+"本書")

#沒有進行並發處理，所以在爬取過程中耗時很長，這裡的圖書寫入的數量一共是1280000萬冊，還有部分未爬取完，此程序尚需進一步的完善。
#此程序只是本人作為練手而寫的小程序，數據並沒有清洗得很乾淨

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 當當網爬蟲 Python網絡爬蟲——當當網【python爬蟲】爬取當當網TOP500圖書暢銷榜 java爬蟲，爬取當當網數據 python爬蟲案例-爬取當當網數據爬取當當網圖書銷售排行榜（Python） Python爬蟲庫Scrapy入門1--爬取當當網商品數據 scrapy爬取當當網 python爬蟲06 | 你的第一個爬蟲，爬取當當網 Top 500 本五星好評書籍 Dubbox：來自當當網的SOA服務框架