python爬取豌豆莢中的詳細信息並存儲到SQL Server中

本文轉載自查看原文 2017-09-16 16:41 1160 爬蟲/ python學習

　　買了本書《精通Python網絡爬蟲》，看完了第6章，我感覺我好像可以幹點什麼了；學的不多，其中的筆記我放到了GitHub上：https://github.com/NSGUF/PythonLeaning/blob/master/examle-urllib.py 。因為我用的是python3.0，所以在爬取數據的時候只用到了一個包：urllib。該博文的源碼：https://github.com/NSGUF/PythonLeaning/blob/master/APPInfo.py

　　思路：首先，進入豌豆莢的首頁可以看到（如圖1），應用主要分為安卓軟件和安卓游戲兩大類，所以只需得到這裡面所有的分類鏈接即可，如影音播放、系統工具等；

圖1

　　當點擊隨意一個鏈接時，顯示圖2，如圖可見，該頁面會顯示每個軟件的基本信息，並且會鏈接到其詳細信息上，這時，如果能獲取到詳細信息的鏈接就能得到所需的基本信息了；

圖2

　　由於該網站是分頁的，所以必須得到頁數。由圖可見，每個分類顯示的最大頁碼都是42，而實際頁數往往沒有到42，超出實際頁數的頁面會如圖4所示，顯示「沒有更多內容了」，所以可以固定循環42次；

圖3

圖4

　　綜上所述，首先要獲取圖1中畫了下劃線的鏈接，同樣也包括安卓游戲下的這類鏈接：

def getAllLinks(url: str) -> list:
    """Return the category links found on the page at *url*.

    The page is downloaded and the ``href`` value of every
    ``<a class="cate-link" href="http://...">`` anchor is collected in
    document order (duplicates are kept).

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``http://`` URL strings; empty when no anchor matches.

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the page cannot be
        fetched (for example ``urllib.error.URLError``).
    """
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as response:
        # Decode the body instead of calling str() on the bytes object: str()
        # yields the "b'...'" repr, where newlines and non-ASCII bytes appear
        # as escape sequences.  Undecodable bytes are replaced rather than
        # fatal because only the ASCII href values are of interest here.
        page = response.read().decode('utf-8', errors='replace')
    # Capture the href value up to (not including) its closing quote.
    return re.findall(r'<a class="cate-link" href="(http://[^"]+)"', page)

　　獲取圖2中圈起來的鏈接，因為其有頁碼，所以得加上頁碼

def getAllDescLinks(url: str, page) -> list:
    """Return the app detail-page links listed on one page of a category.

    The address ``url + '/' + str(page)`` is printed and downloaded.  Inside
    the region that starts at ``<ul id="j-tag-list" class="app-box clearfix">``
    and ends at the last ``<div class="pagination">``, the ``href`` of the
    first anchor in every ``<div class="icon-wrap">`` is collected.

    Args:
        url: Category URL without the page suffix.
        page: Page number (anything ``str()`` can render).

    Returns:
        The distinct links in first-seen order; an empty list when the page
        has no app list.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on a failed download, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    url = url + '/' + str(page)
    print(url)
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as response:
        html1 = response.read().decode('utf-8')

    listing = re.search(
        r'<ul id="j-tag-list" class="app-box clearfix">[\s\S]*<div class="pagination">',
        html1,
    )
    if listing is None:
        return []

    # Match on the real markup with flexible whitespace rather than on a
    # space-stripped repr of the match list, so CR/LF, tabs or additional
    # attributes on the anchor cannot corrupt the extracted URL.
    hrefs = re.findall(
        r'<div\s+class="icon-wrap">\s*<a\s+href="([^"]*)"',
        listing.group(0),
    )
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(hrefs))

　　獲取詳細信息中的信息：

def getAppName(html: str) -> str:
    """Return the app name from the detail page, or ``''`` when absent.

    The name is the text that directly follows the first
    ``<span class="title" itemprop="name">`` opening tag, up to the next
    ``<``.  A ``</span>`` must occur somewhere after the opening tag.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The name exactly as it appears in the markup (no trimming).
    """
    found = re.search(r'<span class="title" itemprop="name">[\s\S]*</span>', html)
    if found is None:
        return ''
    # Work on the matched text itself, not on str(list_of_matches): the repr
    # would escape quotes, backslashes and newlines inside the name.
    # The match starts with the opening tag, so the text after its '>' and
    # before the next '<' is the span's own content.
    return found.group(0).split('>', 1)[1].split('<', 1)[0]
def getDownNumber(html: str) -> str:
    """Return the download-count text from the detail page, or ``''``.

    The value is the text content that follows the first
    ``<i itemprop="interactionCount" ...>`` opening tag, up to the next ``<``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The count as displayed on the page (a string, not a number).
    """
    found = re.search(r'<i itemprop="interactionCount"[\s\S]*</i>', html)
    if found is None:
        return ''
    # Slice the matched text directly; going through the list repr would
    # leak escape sequences into the value.  The match always contains '>'
    # because it ends with '</i>'.
    return found.group(0).split('>', 1)[1].split('<', 1)[0]
def getScore(html: str) -> str:
    """Return the positive-rating text (e.g. ``'36.5%'``), or ``''``.

    The page must contain ``<span class="item love">`` followed later by an
    ``<i>`` element and the label ``好評率</b>``.  The returned value is the
    text of the first ``<i>`` after the span's opening tag.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The rating text as displayed, or ``''`` when the section is missing.
    """
    # NOTE(review): the label literal is kept exactly as written in this
    # file; confirm it matches the characters the site actually serves.
    found = re.search(r'<span class="item love">[\s\S]*<i>[\s\S]*好評率</b>', html)
    if found is None:
        return ''
    # Locate the <i> element explicitly.  The previous approach split the
    # list repr on the letter "i" and relied on how many times that letter
    # happened to occur before the value.
    return found.group(0).partition('<i>')[2].partition('<')[0]
def getIconLink(html: str) -> str:
    """Return the URL of the app's PNG icon, or ``''`` when unavailable.

    The first ``http://`` URL after ``<div class="app-icon"`` is examined and
    returned truncated right after its first ``.png``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        A URL ending in ``.png``; ``''`` when the icon block is missing, holds
        no ``http://`` URL, or that URL does not contain ``.png``.  (The
        earlier version raised ``IndexError`` or returned unrelated text with
        ``.png`` appended in the latter two cases.)
    """
    block = re.search(r'<div class="app-icon"[\s\S]*</div>', html)
    if block is None:
        return ''
    # A URL token ends at the first quote, whitespace or angle bracket.
    url = re.search(r'http://[^"\'\s<>]*', block.group(0))
    if url is None:
        return ''
    head, ext, _ = url.group(0).partition('.png')
    return head + ext if ext else ''
def getVersion(html: str) -> str:
    """Return the app version string, or ``''`` when it cannot be found.

    Looks at the region between ``版本</dt>`` and the last ``<dt>要求`` and
    returns the text between the first ``&nbsp;`` and the following ``</dd>``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The version text.  ``''`` is returned when the region or the
        ``&nbsp;`` marker is missing; the earlier version returned the
        literal string ``'[]'`` or raised ``IndexError`` in those cases.
    """
    # NOTE(review): the label literals are kept exactly as written in this
    # file; confirm they match the characters the site actually serves.
    found = re.search(r'版本</dt>[\s\S]*<dt>要求', html)
    if found is None:
        return ''
    _, marker, after = found.group(0).partition('&nbsp;')
    if not marker:
        return ''
    return after.partition('</dd>')[0]
def getSize(html: str) -> str:
    """Return the package size text (e.g. ``'36.21MB'``), or ``''``.

    Looks at the region between ``大小</dt>`` and the last ``<dt>分類``, takes
    the text after the first ``<dd>`` up to ``<meta`` (or ``</dd>`` if that
    comes first) and removes all whitespace from it.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The size with whitespace removed.  ``''`` is returned when the region
        or the ``<dd>`` is missing; the earlier version returned the literal
        string ``'[]'`` or raised ``IndexError`` in those cases.
    """
    # NOTE(review): the label literals are kept exactly as written in this
    # file; confirm they match the characters the site actually serves.
    found = re.search(r'大小</dt>[\s\S]*<dt>分類', html)
    if found is None:
        return ''
    _, opener, after = found.group(0).partition('<dd>')
    if not opener:
        return ''
    # Stop at the embedded <meta> tag, or at the end of the <dd> when there
    # is no <meta>, so trailing markup never ends up in the value.
    raw = re.split(r'<meta|</dd>', after, maxsplit=1)[0]
    # Drop every whitespace character (spaces, newlines, tabs, ...).
    return ''.join(raw.split())

def getImages(html: str) -> list:
    """Return the screenshot URLs (``.jpg``) found on the detail page.

    The search region starts at ``<div data-length="5" class="overview">``
    and runs to the last ``</div>`` of the document.  Every ``http://`` URL
    in that region that contains ``.jpg`` is returned, truncated right after
    ``.jpg``, in document order.

    Differences from the earlier version, which parsed a doubly nested list
    repr: the ``http://`` scheme is kept so each entry is a usable link
    (as ``getIconLink`` already does), and URLs that are not JPEGs are
    skipped instead of being turned into bogus ``....jpg`` entries.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        A list of URL strings; empty when the screenshot block is missing.
    """
    # NOTE(review): data-length="5" is hard-coded; pages whose overview div
    # carries another value will yield no screenshots - confirm intent.
    block = re.search(r'<div data-length="5" class="overview">[\s\S]*</div>', html)
    if block is None:
        return []
    # A URL token cannot contain quotes, whitespace or angle brackets; the
    # dot before "jpg" is escaped so it only matches a literal dot.
    return re.findall(r'http://[^"\'\s<>]*?\.jpg', block.group(0))
def getAbstract(html: str) -> str:
    """Return the app description text, or ``''`` when it cannot be found.

    The description block opens with
    ``<div data-originheight="100" class="con" itemprop="description">`` and
    must be followed by ``<div class="change-info">`` or, failing that, by
    ``<div class="all-version">``.  The text up to the block's first
    ``</div>`` is returned with ``<br>`` / ``<br />`` tags removed.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The description.  Line breaks in the markup are returned as real
        newlines; the earlier version returned them as the two characters
        ``\\n`` and returned the literal string ``'[]'`` when nothing matched.
    """
    opening = r'<div data-originheight="100" class="con" itemprop="description">'
    found = None
    # Try the usual follower first, then the fallback used by pages that
    # have no change-info section.
    for closing in (r'<div class="change-info">', r'<div class="all-version">'):
        found = re.search(opening + r'[\s\S]*' + closing, html)
        if found is not None:
            break
    if found is None:
        return ''
    body = found.group(0).split('description">', 1)[1].split('</div>', 1)[0]
    return body.replace('<br>', '').replace('<br />', '')
def getUpdateTime(html: str) -> str:
    """Return the last-update date text, or ``''`` when it cannot be found.

    The value is the text content that follows the first
    ``<time id="baidu_time" itemprop="datePublished" ...>`` opening tag, up
    to the next ``<``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The date as displayed on the page.  ``''`` is returned when the
        element is missing; the earlier version returned the literal string
        ``'[]'`` in that case.
    """
    found = re.search(r'<time id="baidu_time" itemprop="datePublished"[\s\S]*</time>', html)
    if found is None:
        return ''
    # The match always contains '>' because it ends with '</time>'.
    return found.group(0).split('>', 1)[1].split('<', 1)[0]
def getUpdateCon(html: str) -> str:
    """Return the "what's new" text, or ``''`` when it cannot be found.

    Looks at the region between ``<div class="change-info">`` and the last
    ``<div class="all-version">`` and returns the text that follows the first
    ``"con">`` up to the next ``</div>``, with ``<br>`` / ``<br />`` removed.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The update notes.  ``''`` is returned when the region or the
        ``"con">`` marker is missing; the earlier version returned the
        literal string ``'[]'`` or raised ``IndexError`` in those cases.
    """
    found = re.search(r'<div class="change-info">[\s\S]*<div class="all-version">', html)
    if found is None:
        return ''
    _, marker, after = found.group(0).partition('"con">')
    if not marker:
        return ''
    notes = after.split('</div>', 1)[0]
    return notes.replace('<br>', '').replace('<br />', '')
def getCompany(html: str) -> str:
    """Return the developer / company name, or ``''`` when it is absent.

    The value is the text that follows the first
    ``<span class="dev-sites" itemprop="name">`` opening tag, up to the next
    ``<``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The name as displayed.  ``''`` is returned when the element is
        missing; the earlier version returned the literal string ``'[]'``.
    """
    found = re.search(r'<span class="dev-sites" itemprop="name">[\s\S]*</span>', html)
    if found is None:
        return ''
    # The match begins with the opening tag, so '"name">' is always present.
    return found.group(0).split('"name">', 1)[1].split('<', 1)[0]
def getClass(html: str) -> list:
    """Return the category names the app is filed under.

    Looks at the region between ``<dd class="tag-box">`` and the last
    ``<dt>TAG</dt>`` and collects, for every occurrence of ``appTag">``, the
    text that follows it up to the next ``<``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        Category names in document order; an empty list when the region is
        missing.
    """
    found = re.search(r'<dd class="tag-box">[\s\S]*<dt>TAG</dt>', html)
    if found is None:
        return []
    # Search the matched text itself so quotes or backslashes in a category
    # name are not returned in escaped (repr) form.
    return re.findall(r'appTag">([^<]*)', found.group(0))
def getTag(html: str) -> list:
    """Return the tag labels attached to the app.

    Looks at the region between ``<div class="side-tags clearfix">`` and the
    last ``<dt>更新</dt>`` and returns the text of every ``<a ...>text</a>``
    anchor inside it, trimmed of surrounding whitespace.

    Unlike the earlier version, spaces *inside* a label are preserved (it
    deleted every space in the region to make tag matching easier) and an
    anchor without plain text content is skipped instead of raising
    ``IndexError``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        Tag labels in document order; an empty list when the region is
        missing.
    """
    # NOTE(review): the label literal is kept exactly as written in this
    # file; confirm it matches the characters the site actually serves.
    found = re.search(r'<div class="side-tags clearfix">[\s\S]*<dt>更新</dt>', html)
    if found is None:
        return []
    labels = re.findall(r'<a\b[^>]*>([^<]*)</a>', found.group(0))
    return [label.strip() for label in labels]
def getDownLink(html: str) -> str:
    """Return the app's download URL, or ``''`` when it cannot be found.

    Looks at the region between ``<div class="qr-info">`` and the last
    ``<div class="num-list">`` and returns the value of the first
    ``href="http://..."`` attribute in it.

    Differences from the earlier version: the ``http://`` scheme is kept so
    the result is a usable link (as ``getIconLink`` already does); the value
    ends at the attribute's closing quote even when ``rel="nofollow"`` does
    not follow immediately; and ``''`` is returned instead of the literal
    string ``'[]'`` or an ``IndexError`` when nothing is found.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        The download URL including its scheme, or ``''``.
    """
    found = re.search(r'<div class="qr-info">[\s\S]*<div class="num-list">', html)
    if found is None:
        return ''
    href = re.search(r'href="(http://[^"]*)"', found.group(0))
    return href.group(1) if href is not None else ''
def getComment(html: str) -> list:
    """Return the user comments shown on the detail page.

    Only the comments embedded in the page itself are available.  The region
    between ``<ul class="comments-list">`` and the last
    ``<div class="hot-tags">`` is split at every ``<li class="normal-li">``;
    from each item three values are read:

    * ``userName`` - text after ``name">``
    * ``time``     - text of the ``<span>`` that directly follows a ``</span>``
    * ``evalDesc`` - text of the ``<span>`` that directly follows ``content">``

    Differences from the earlier version: every list item is examined (its
    loop bound ``len - 1`` skipped the last one); spaces inside user names
    and comment text are preserved, with only surrounding whitespace trimmed;
    and items missing any of the three values are skipped instead of raising
    ``IndexError``.

    Args:
        html: Decoded HTML of an app detail page.

    Returns:
        A list of dicts with the keys ``userName``, ``time`` and
        ``evalDesc``; empty when the page has no comment list.
    """
    found = re.search(r'<ul class="comments-list">[\s\S]*<div class="hot-tags">', html)
    if found is None:
        return []

    # Whitespace between adjacent tags is tolerated by the patterns instead
    # of being deleted from the whole region beforehand.
    field_patterns = (
        re.compile(r'name">([^<]*)'),
        re.compile(r'</span>\s*<span>([^<]*)'),
        re.compile(r'content">\s*<span>([^<]*)'),
    )

    eval_descs = []
    # The first chunk is the markup that precedes the first <li>.
    items = re.split(r'<li\s+class="normal-li">', found.group(0))[1:]
    for item in items:
        hits = [pattern.search(item) for pattern in field_patterns]
        if any(hit is None for hit in hits):
            continue
        user_name, posted_at, text = (hit.group(1).strip() for hit in hits)
        eval_descs.append({'userName': user_name, 'time': posted_at, 'evalDesc': text})
    return eval_descs

　　將信息插入SQL Server數據庫。這裡要注意：pyodbc的execute使用的參數占位符是「?」；之前我看了很多其他的資料，用的是%s，結果報錯了，最無語的是報錯信息居然還是亂碼。

def insertAllInfo(name,num,icon,score,appversion,size,images,abstract,updateTime,updateCon,com,classfy,tag,downLink,comm):
    """Insert one app record into the ``tb_wandoujia`` table.

    A new connection is opened for every call and always closed afterwards.
    On success the row is committed and ``成功`` is printed; on any error the
    exception text is printed and the function returns normally (best-effort
    insert, the crawl continues).

    Args:
        name, num, icon, score, appversion, size, images, abstract,
        updateTime, updateCon, com, classfy, tag, downLink, comm: Values for
            the same-named columns, bound positionally to the ``?``
            placeholders of the statement.

    Returns:
        None.
    """
    import pyodbc
    # NOTE(security): the server address and the sa password are hard-coded
    # here; move them to configuration before using this outside a local
    # test setup.
    conn = pyodbc.connect('DRIVER={SQL Server};SERVER=127.0.0.1,1433;DATABASE=Test;UID=sa;PWD=123')
    try:
        # Create the cursor inside the try block so that the connection is
        # still closed by ``finally`` if cursor creation itself fails.
        cursor = conn.cursor()
        cursor.execute('insert into tb_wandoujia(name,num,icon,score,appversion,size,images,abstract,updateTime,updateCon,com,classfy,tag,downLink,comm) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)',(name,num,icon,score,appversion,size,images,abstract,updateTime,updateCon,com,classfy,tag,downLink,comm))
        # Without an explicit commit the insert is not persisted.
        conn.commit()
        print('成功')
    except Exception as e:
        print(str(e))
    finally:
        conn.close()

　　數據庫創建代碼如下：

-- Database the crawler connects to (DATABASE=Test in its connection string).
create database Test
GO

-- Switch to the new database so that the table below is created inside it
-- and not in whichever database the session was using before.
USE [Test]
GO

-- One row per crawled app.  Every scraped value is stored as text; the
-- column names match the INSERT statement issued by insertAllInfo.
CREATE TABLE [dbo].[tb_wandoujia](
    [Id] [int] IDENTITY(1,1) NOT NULL,      -- surrogate key, auto-incremented
    [name] [varchar](100) NULL,
    [num] [varchar](100) NULL,
    [icon] [varchar](200) NULL,
    [score] [varchar](10) NULL,
    [appversion] [varchar](20) NULL,
    [size] [varchar](20) NULL,
    [images] [varchar](2000) NULL,
    [abstract] [varchar](2000) NULL,
    [updateTime] [varchar](20) NULL,
    [updateCon] [varchar](2000) NULL,
    [com] [varchar](50) NULL,
    [classfy] [varchar](200) NULL,
    [tag] [varchar](300) NULL,
    [downLink] [varchar](200) NULL,
    [comm] [varchar](5000) NULL,
PRIMARY KEY CLUSTERED 
(
    [Id] ASC
)WITH (PAD_INDEX  = OFF, STATISTICS_NORECOMPUTE  = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS  = ON, ALLOW_PAGE_LOCKS  = ON) ON [PRIMARY]
) ON [PRIMARY]

GO

SET ANSI_PADDING OFF
GO

　　調用獲取所有信息、打印並插入數據庫：

def getAllInfo(url: str) -> None:
    """Download one app detail page, print its fields and store them.

    The page at *url* is fetched and decoded as UTF-8, each field is
    extracted with the corresponding ``get*`` function, printed with its
    label, converted to ``str`` and finally passed to ``insertAllInfo``.
    When no app name can be extracted, the page is treated as not being an
    app page and nothing else is extracted or stored.

    Args:
        url: Address of the app detail page.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on a failed download, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as response:
        html1 = response.read().decode('utf-8')

    name = getAppName(html1)
    print('名稱:', name)
    if name == '':
        return

    num = str(getDownNumber(html1))
    print('下載次數:', num)
    icon = str(getIconLink(html1))
    print('log鏈接:', icon)
    score = str(getScore(html1))
    print('評分:', score)
    version = str(getVersion(html1))
    print('版本:', version)
    size = str(getSize(html1))
    print('大小:', size)
    # List results (screenshots, categories, tags, comments) are stored as
    # their str() rendering because every table column is a text column.
    images = str(getImages(html1))
    print('截圖:', images)
    abstract = str(getAbstract(html1))
    print("簡介:", abstract)
    updateTime = str(getUpdateTime(html1))
    print('更新時間:', updateTime)
    updateCon = str(getUpdateCon(html1))
    print('更新內容:', updateCon)
    com = str(getCompany(html1))
    print('公司:', com)
    classfy = str(getClass(html1))
    print('分類:', classfy)
    tag = str(getTag(html1))
    print('Tag:', tag)
    downLink = str(getDownLink(html1))
    print('下載鏈接:', downLink)
    comm = str(getComment(html1))
    print('評價:', comm)

    # The name is known to be non-empty here because of the early return.
    insertAllInfo(name, num, icon, score, version, size, images, abstract,
                  updateTime, updateCon, com, classfy, tag, downLink, comm)

　　最后，循環調用，獲取全部的信息：

# Crawl every category, every listing page of it, and every app on each page.
# NOTE(review): `url` (the start page) is not defined in the visible code; it
# has to be assigned before this loop runs.
MAX_PAGES = 42  # the site's pager never shows more than 42 pages per category
for link in getAllLinks(url):
    print(link)
    # range() excludes its stop value, so "+ 1" is required to include page
    # 42 itself (the earlier range(1, 42) stopped at page 41).  Categories
    # with fewer pages are expected to return empty pages quickly.
    for i in range(1, MAX_PAGES + 1):
        print(i)
        for descLink in getAllDescLinks(link, i):
            print(descLink)
            getAllInfo(descLink)

　　最后打印的結果如下圖：

　　存儲到sql數據庫的圖片如下：

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 以豌豆莢為例，用 Scrapy 爬取分類多級頁面豌豆莢逆向分析【Python實戰】Scrapy豌豆莢應用市場爬蟲利用Scrapy爬取所有知乎用戶詳細信息並存至MongoDB 豌豆莢進程與adb端口沖突 ADB server didn't ACK fail to start daemon（安裝豌豆莢之后）【雜症】一個豌豆莢引發的血案——關於ADB server didn't ACK的問題 Python爬取疫情數據並存入mysql中 python爬蟲：爬取鏈家深圳全部二手房的詳細信息爬取京東上商品的所有詳細信息