Scraping comics with Python


The comics are scraped from sf互動傳媒 (comic.sfacg.com).

The idea came from a Zhihu thread where someone mentioned scraping comics, so I decided to give it a try myself.

On the homepage, each comic's URL is stored in markup like this:

<tr>
     <td height="30" align="center" bgcolor="#FFFFFF">
       <a href="http://comic.sfacg.com/HTML/KOL/" target="_blank">K.O.I 偶像之王</a>
     </td>
</tr>

The usable anchors are extracted with lxml via the CSS selector tr > td > a. That selector also picks up plenty of unrelated links and page furniture, so the results are filtered by keeping only URLs that contain "/mh/" or "/HTML/".

A rather crude way to do it, but it works.
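
As a minimal sketch of that step (the helper name extract_comic_links is my own; the full script at the bottom does the same thing inline):

import lxml.html

def extract_comic_links(html):
    # Parse the homepage and grab every <a> nested in <tr><td>.
    tree = lxml.html.fromstring(html)
    links = []
    for a in tree.cssselect('tr > td > a'):
        href = a.get('href')
        # Keep only comic pages; everything else is navigation noise.
        if href and ('/mh/' in href or '/HTML/' in href) and a.text_content().strip():
            links.append((href, a.text_content().strip()))
    return links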

Each comic that survives the filter is stored as an object holding its URL and name, and the objects are collected in a list:

class Cartoon():
    url = None
    name = None
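
For example, populated with the comic from the homepage snippet above (values are illustrative; cartoon_list is the same list the full script builds):

cartoon_list = []
cartoon = Cartoon()
cartoon.url = 'http://comic.sfacg.com/HTML/KOL/'
cartoon.name = 'K.O.I 偶像之王'
cartoon_list.append(cartoon)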

Let's take one comic as an example: 勇者赫魯庫.

A comic has many chapters, and all of the chapter links are contained in the following tag:

<ul class="serialise_list Blue_link2">....</ul>

Each chapter's link and title are then extracted with BeautifulSoup; a chapter's full URL looks like this: http://comic.sfacg.com/HTML/YZHLK/096/
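
A minimal sketch of that step (the helper name get_chapter_links is my own; it is essentially what get_section_url in the full script does):

import re
from bs4 import BeautifulSoup

def get_chapter_links(html):
    # The chapter list lives in <ul class="serialise_list Blue_link2">.
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all('ul', attrs={'class': 'serialise_list Blue_link2'})
    # Each match is a (relative href, chapter title) pair.
    return re.findall(r'<a.*?href="([^"]*)".*?>([\s\S]*?)</a>', str(results))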

Each chapter has many pages, and the page content is loaded via Ajax; under Inspect -> Network you can see a request for a .js file.

The response to that request contains all of the chapter's images. So for each chapter page, you only need to locate that .js endpoint, parse the image URLs out of the response, and the images can then be saved locally.
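
Judging from the parsing code in the full script, the .js response is just a series of assignments of the form picAy[i] = "/relative/path.jpg";, and the final image URL is that path prefixed with http://coldpic.sfacg.com. A standalone sketch of the parsing (parse_pic_js is my own name; the payload format is inferred from the code, not documented by the site):

def parse_pic_js(js_text):
    # Each page is one statement of the form: picAy[i] = "/relative/path.jpg";
    urls = []
    for stmt in js_text.split(';'):
        if 'picAy[' in stmt:
            path = stmt.split('=')[1].strip().strip('"')
            urls.append('http://coldpic.sfacg.com' + path)
    return urls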

# -*- coding: utf-8 -*-
# Python 2 script (urllib2, print statements). Only the imports that are
# actually used are kept.
import os
import re
import sys

import requests
import urllib2
import lxml.html
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf8')

URL = 'http://comic.sfacg.com'
picture = 'http://coldpic.sfacg.com'


class Cartoon():
    # Simple record: one comic's index-page URL and its display name.
    url = None
    name = None

def download(url, user_agent='wswp', num_try=2):
    # Fetch a page, retrying on 5xx server errors and giving up on 403.
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error', e.reason
        html = None
        if num_try > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_try - 1)
            elif hasattr(e, 'code') and e.code == 403:
                return None
    return html

def get_section_url(url):
    # Return (relative href, chapter title) pairs for every chapter.
    html = download(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all(name='ul', attrs={'class': 'serialise_list Blue_link2'})
    res = r'<a.*?href="([^"]*)".*?>([\S\s]*?)</a>'
    links = re.findall(res, str(results), re.S | re.M)
    return links


def get_section_page(url):
    # Find the chapter's .js endpoint and return the full image URLs.
    html = download(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all(name='script', attrs={'type': 'text/javascript'})
    js = results[-1]
    mm = js.get('src')
    if mm is None:
        result = soup.find_all(name='script', attrs={'language': 'javascript'})
        mm = result[1].get('src')
    html1 = download(URL + mm)
    if html1 is None:
        return None
    # The .js file assigns one image path per page: picAy[i] = "/path.jpg";
    pages = []
    for each in html1.split(';'):
        if 'picAy[' in each:
            src = each.split('=')
            pages.append(picture + src[1][2:-1])
    return pages


def download_cartoon(url, cartoon_name, Section, num):
    # Save one page image under <base path>/<comic name>/<chapter>/<num>.jpg
    path = "your-download-path/" + cartoon_name  # set the base path yourself
    if not os.path.exists(path):
        os.mkdir(path)
    path = path + "/" + Section
    if not os.path.exists(path):
        os.mkdir(path)
    content = requests.get(url).content
    with open(path + '/' + str(num) + '.jpg', 'wb') as f:
        f.write(content)
    print "Downloaded " + path + '/' + str(num) + '.jpg'

if __name__ == '__main__':
    cartoon_list = []

    html = download(URL)
    tree = lxml.html.fromstring(html)
    results = tree.cssselect('tr > td > a')
    for each in results:
        ti = each.get('href')
        # Keep only comic links and skip anchors with no text.
        if ti and ('/mh/' in ti or '/HTML/' in ti):
            if each.text_content() != "":
                cartoon = Cartoon()
                cartoon.url = ti
                cartoon.name = each.text_content().replace(' ', '')
                cartoon_list.append(cartoon)

    for each in cartoon_list:
        print each.url
        print each.name
        links = get_section_url(each.url)
        if links is None:
            continue
        # Process chapters in reverse of the page order.
        links = list(reversed(links))
        section = 0
        for link in links:
            ul = URL + link[0]
            pages = get_section_page(ul)
            section = section + 1
            Section = str(section)
            if pages is None:
                continue
            num = 1
            for mm in pages:
                download_cartoon(mm, each.name, Section, num)
                num = num + 1
            print each.name + " chapter " + Section + " done: " + str(num - 1) + " pages"
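
Note that running the script as-is walks every comic on the homepage, chapter by chapter, so it can take a very long time and use a lot of disk space; set the base directory in download_cartoon to something sensible before starting.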

 

