Python Study Notes --- Scraping the One Piece Comic


 

Here is a crawler script I put together recently. It can automatically scrape any comic from Tencent Comics (ac.qq.com). The approach is as follows:

1. First get the URL of the comic you want to download; getUrls is used here to grab the latest chapter of the comic directly.

2. Then open that chapter and collect the URLs of the images to download.

3. Download the images to the local disk.

import os
import random
import time
from random import randint

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
import urllib.request as urllib2

ROOT_URL = "http://ac.qq.com"
target_url = [
    ROOT_URL + "/Comic/comicInfo/id/505430",  # One Piece
]
ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]

user_agent=random.choice(ua_list)
dir_path="D:/py/海賊王/"

def getImageUrls(comic_url):
    '''
    Fetch the dynamically generated image URLs via Selenium and PhantomJS.
    '''
    urls = []

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/4.0 (compatible; MSIE 5.5; windows NT)")
    browser = webdriver.PhantomJS(executable_path=r"E:\py\littlepy\tencent_cartoon\phantomjs-2.1.1-windows\bin\phantomjs.exe",
                                  desired_capabilities=dcap)
    browser.get(comic_url)

    imgs = browser.find_elements_by_xpath("//div[@id='mainView']/ul[@id='comicContain']//img")
    for i in range(0, len(imgs) - 1):
        if i == 1:  # skip the ad image
            continue
        urls.append(imgs[i].get_attribute("src"))
        # Scroll down so the lazily loaded pages get their real src attributes
        js = 'window.scrollTo( 800 ,' + str((i + 1) * 1280) + ')'
        browser.execute_script(js)
        time.sleep(randint(3, 6))

    browser.quit()
    print("urls=",urls)
    return urls

def getUrls(comic_url):
    '''
    Fetch the comic's chapter-list page and return the title and URL of the latest chapter.
    '''
    result = dict()
    req = urllib2.Request(comic_url)
    req.add_header('User-Agent', user_agent)
    print("url=",comic_url)
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response, "lxml")
    #print("soup=",soup)
    # The chapter list block ("chapter-page-new works-chapter-list");
    # the last <a> element points to the newest chapter
    page = soup.find(attrs={"class": "chapter-page-new works-chapter-list"}).find_all("a")
    title = page[-1]['title']
    result[title] = ROOT_URL + page[-1]['href']
    print("title=",title)
    print("result=",result[title])
    return title,result[title]

def downloadComics(dir_path, urls):
    '''
    Save every image URL in urls into dir_path, using part of the URL as the file name.
    '''
    for url in urls:
        urllib2.urlretrieve(url, dir_path + url[-8:-2])
        #print("url=",url[-9:-2])

if __name__ == "__main__":
    title,result_url=getUrls(target_url[0])
    urls=getImageUrls(result_url)
    path=dir_path+title+"/"
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
        print(path + '    created successfully')
    downloadComics(path,urls)
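
A side note on the download step: urlretrieve sends Python's default User-Agent, which the image host may reject, and the url[-8:-2] slice assumes a fixed URL layout. Below is a minimal alternative sketch; the helper name download_comics_with_ua and the file-name handling are my own additions rather than part of the original script, and it reuses the ua_list defined above.

import os
import random
import urllib.request
from urllib.parse import urlparse

def download_comics_with_ua(dir_path, urls):
    '''
    Sketch: download each image with an explicit User-Agent header
    (ua_list is the list defined in the script above).
    '''
    for i, url in enumerate(urls):
        # Number the files so the pages stay in reading order,
        # and derive the name from the URL path instead of slicing fixed offsets.
        name = "{:03d}_{}".format(i, os.path.basename(urlparse(url).path) or "page.jpg")
        req = urllib.request.Request(url, headers={"User-Agent": random.choice(ua_list)})
        with urllib.request.urlopen(req) as resp, open(os.path.join(dir_path, name), "wb") as f:
            f.write(resp.read())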

  


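Also note that PhantomJS is no longer maintained, and recent Selenium releases (4.x) have dropped support for it. If the PhantomJS setup above does not work, roughly the same getImageUrls logic can be reproduced with headless Chrome. The sketch below is an assumption-laden alternative, not the original author's code: it assumes Selenium 4 and a chromedriver on the PATH, and the function name is hypothetical.

import time
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def get_image_urls_headless_chrome(comic_url):
    '''
    Sketch: same scrolling/collection idea as getImageUrls, but with headless Chrome.
    '''
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options)  # assumes chromedriver is on the PATH
    browser.get(comic_url)

    urls = []
    imgs = browser.find_elements(By.XPATH, "//div[@id='mainView']/ul[@id='comicContain']//img")
    for i, img in enumerate(imgs):
        if i == 1:  # skip the ad slot, as in the original script
            continue
        urls.append(img.get_attribute("src"))
        # Scroll down so the lazily loaded pages get their real src attributes
        browser.execute_script("window.scrollTo(800, {})".format((i + 1) * 1280))
        time.sleep(randint(3, 6))

    browser.quit()
    return urls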