使用Python爬取mobi格式電紙書


最近做了個微信推送kindle電子書的公眾號:kindle免費書庫

不過目前電子書不算非常多,所以需要使用爬蟲來獲取足夠書籍。

於是,寫了以下這個爬蟲,來爬取kindle114的電子書。

值得注意的地方:

當爬取數過大時,由於對方開啟了防抓取機制,會返回一段javascript而非原始的html,所以我使用PyV8來執行這段js,從而拿到真正的地址。

目前存在的問題:

正則式寫得還不夠好,畢竟是第一次正式寫爬蟲:)

無法下載需要購買的附件

爬蟲為單線程,爬完整個網站速度慢。我有試過轉成多進程,但是貌似由於同一帳號不能同時登錄,大多數爬蟲進程都無法正常爬取@@

# -*- coding: utf-8 -*-
import urllib2
import re
import requests
import os
import hashlib

def fuckJS(js):
    """Evaluate the site's anti-crawler JavaScript and return the value it yields.

    Instead of HTML the server may answer with a short obfuscated script.
    The script first aliases ``window``, ``location`` and the property names
    ``'assign'``/``'href'``/``'replace'`` to throw-away variables, then
    navigates to the real address through those aliases.  This function
    undoes the aliasing, strips the navigation call so that the address
    expression is the last thing evaluated, and runs the remainder in PyV8.

    Args:
        js: Raw response body.  The first 31 and the last 9 characters are
            discarded as the enclosing ``<script ...>`` / ``</script>`` tags.

    Returns:
        The result of evaluating the rewritten script.  ``spiderThread``
        appends it to the site root, so it is expected to be a relative URL
        string.
    """
    import PyV8
    import re

    # Drop the surrounding <script> tags (fixed-width slice).
    js = js[31:-9]
    for st in ['window', 'location', "'assign'", "'href'", "'replace'"]:
        # Find the alias assignment, e.g. ``_ab = window;`` (may be absent).
        equal = re.findall('[_A-Za-z0-9 =]+%s;' % st, js)
        if not equal:
            continue
        equal = equal[0]
        # The alias name is whatever precedes the first '='.
        var = equal.split('=')[0].strip()
        # Remove the assignment itself ...
        js = js.replace(equal, '')
        # ... and substitute the alias with what it really stands for.
        # An empty alias must be skipped: str.replace('', x) would insert
        # x between every character and destroy the script.
        if var:
            js = js.replace(var, st)
        # Turn ['xx'] subscript access into .xx attribute access.
        js = js.replace("['%s']" % st.strip("'"), '.%s' % st.strip("'"))

    # Discard everything from ``window.href=`` onwards, because PyV8 only
    # reports the value of the last expression it evaluates.
    trailing = re.findall(r'window\.href=.+', js)
    if trailing:
        js = js.replace(trailing[0], '')

    # Strip the navigation call so only the address expression remains.
    js = (js.replace('location.href=', '')
            .replace('location.replace', '')
            .replace('location.assign', ''))

    # NOTE(review): this evaluates script text received from the remote
    # server; it is only as trustworthy as that server.
    # The ``with`` block enters the context and always leaves it again; the
    # original entered a fresh context on every call and never left it.
    with PyV8.JSContext() as ctxt:
        trueAddr = ctxt.eval(js)
    print(trueAddr)
    return trueAddr

def downloadMobi(name, url):
    """Log in to kindle114 and save the attachment at *url* under *name*.

    The file is written to ``D:\\Kindle114SpiderDownload\\``.  Nothing is
    downloaded when a file of that name already exists, when the request
    fails, or when the server answers with an HTML page instead of the
    attachment.

    Args:
        name: Desired file name; characters that are illegal in Windows
            file names are removed before use.
        url: Attachment URL; anything after the first space is ignored.

    Returns:
        None.  Progress and failures are reported on stdout.
    """
    # Remove characters that are not allowed in Windows file names.
    unlawName = '<>/\\|:""*?'
    for i in unlawName:
        name = name.replace(i, '')
    # Work around the loose link regex: some captured names carry trailing
    # image markup (already stripped of its illegal characters above).
    if name.count(' &nbsp;img src=templateyeei_dream1cssyeeidigest_1.gif class=vm alt= title= ') > 0:
        name = name.split(' &nbsp')[0] + '.mobi'

    target = 'D:\\Kindle114SpiderDownload\\' + name
    # Avoid downloading the same book twice.
    if os.path.exists(target):
        print('already have %s' % name)
        return

    url = url.split(' ')[0]
    s = requests.session()
    username = '你的用戶名'
    password = '你的密碼'
    # The login form is posted with the MD5 hex digest of the password.
    passwordMd5 = hashlib.md5(password).hexdigest()
    data = {'formhash': '23cd6c29', 'referer': '', 'username': username,
            'password': passwordMd5, 'questionid': '0', 'answer': ''}
    s.post('http://www.kindle114.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=LYn7n&inajax=1', data)

    try:
        res = s.get(url, timeout=200)
    except requests.exceptions.RequestException:
        # Stop here: the original fell through and wrote the *login*
        # response to disk as if it were the book.
        print('time out for  %s' % name)
        return

    # An HTML page instead of binary data means the attachment is not free.
    if '<!DOCTYPE html' in res.content:
        print('!!!!!!!!!!!!!!!!!not a mobi, this file need gold coin!!!!!!!!!!!!!!!')
        return

    try:
        with open(target, "wb") as code:
            code.write(res.content)
    except IOError:
        # Raised when the file cannot be opened or written, e.g. because
        # the cleaned-up name is still not a valid file name.
        print('!!!!!!!!!!!!!!!!!!!!!遇到不合法文件名!!!!!!!!!!!!!!!!!! %s' % name)

def _readPage(url, timeout=10):
    """Return the body of *url*, always closing the response afterwards."""
    req = urllib2.urlopen(url, timeout=timeout)
    try:
        return req.read()
    finally:
        req.close()


def spiderThread(url, threadName):
    """Download every ``.mobi`` attachment linked from one forum thread.

    Args:
        url: Absolute URL of the thread page.
        threadName: Thread title; used as the file name when the thread
            holds exactly one book.

    Returns:
        None.  Each attachment found is handed to ``downloadMobi``.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network failure;
        the caller is expected to handle it.
    """
    text = _readPage(url)
    if text.count('<!DOCTYPE html') == 0:
        # Not HTML: the server sent its anti-crawler script.  Evaluate it to
        # obtain the real address and fetch that instead.
        trueURL = 'http://www.kindle114.com/' + fuckJS(text)
        print('trueURL %s' % trueURL)
        # Same timeout as the first request; the original had none here and
        # could block indefinitely.
        text = _readPage(trueURL)

    href_re = re.compile('<a href="(.*?)".*?target="_blank">(.*?)</a>')
    # Keep only links whose text mentions .mobi; undo the &amp; escaping in
    # the href so the query string is usable.
    books = [(target.replace('amp;', ''), label)
             for target, label in href_re.findall(text)
             if label.count('.mobi') > 0]

    bookSum = len(books)
    if bookSum == 0:
        print('!!!bookSum = 0!!!! %s' % text[:100])
    if bookSum == 1:
        print('only one book in this thread')
        # A lone attachment is saved under the thread title.
        link = books[0][0]
        bookFileName = threadName + '.mobi'
        print('%s %s' % (link, bookFileName))
        downloadMobi(bookFileName, link)
    else:
        print('%s in this thread' % bookSum)
        # Several attachments: each keeps the name shown in its link text.
        for link, bookFileName in books:
            print('%s %s' % (link, bookFileName))
            downloadMobi(bookFileName, link)


# Walk listing pages 1..124 of the forum and crawl every thread found.
# The pattern captures (relative thread URL, thread title); it is compiled
# once instead of once per page.
href = '<h4><a href="(.*?)" target="_blank" class="xst">(.*?)<span class="xi1">'
href_re = re.compile(href)
for pageNum in range(1, 125):
    url = 'http://www.kindle114.com/forum.php?mod=forumdisplay&fid=2&filter=sortid&sortid=1&searchsort=1&geshi=1&page=' + str(pageNum)
    print('=============url %s ===============' % url)
    try:
        req = urllib2.urlopen(url, timeout=10)
        try:
            text = req.read()
        finally:
            req.close()
    except Exception:
        # Skip this page.  The original fell through to req.read(), which
        # is a NameError on the first page and otherwise re-reads the
        # previous page's already-consumed response.
        print('page time out %s' % url)
        continue
    href_info = href_re.findall(text)
    for i in href_info:
        print('%s %s' % (i[0], i[1]))
        url = 'http://www.kindle114.com/' + i[0]
        threadName = i[1]
        try:
            spiderThread(url, threadName)
        except Exception as e:
            # Best effort: one broken thread must not stop the whole crawl.
            print('!!!!!!!!!!!!! Error with  %s %s !!!!!!!!!!!!!!!!' % (threadName, url))
            print(e)
# Keep the console window open until the user presses Enter.
raw_input('finish all!!!')

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM