Here is some crawler code I tidied up recently in my spare time. It can automatically scrape any comic from Tencent Comics (ac.qq.com). The approach:
1. First get the URL of the chapter to download. getUrls is used here, which grabs the comic's latest chapter directly (a sketch that walks every chapter follows the code below).
2. Then open that chapter and collect the URLs of the images to download.
3. Download the images to local disk.
import os
import random
import time
from random import randint

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
import urllib.request as urllib2

ROOT_URL = "http://ac.qq.com"
target_url = [
    ROOT_URL + "/Comic/comicInfo/id/505430",  # One Piece (海賊王)
]
# A few desktop User-Agents to rotate through, so requests look less bot-like
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
]
user_agent = random.choice(ua_list)
dir_path = "D:/py/海賊王/"


def getImageUrls(comic_url):
    '''Collect the dynamically generated image URLs via Selenium and PhantomJS.'''
    urls = []
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    browser = webdriver.PhantomJS(
        executable_path=r"E:\py\littlepy\tencent_cartoon\phantomjs-2.1.1-windows\bin\phantomjs.exe",
        desired_capabilities=dcap)
    browser.get(comic_url)
    imgs = browser.find_elements_by_xpath("//div[@id='mainView']/ul[@id='comicContain']//img")
    for i in range(0, len(imgs) - 1):  # stops at len(imgs) - 1, so the last <img> is skipped
        if i == 1:  # skip the ad image
            continue
        urls.append(imgs[i].get_attribute("src"))
        # Scroll down one viewport per image so the lazily loaded pages render
        js = 'window.scrollTo(800, ' + str((i + 1) * 1280) + ')'
        browser.execute_script(js)
        time.sleep(randint(3, 6))  # random pause to avoid hammering the server
    browser.quit()
    print("urls=", urls)
    return urls


def getUrls(comic_url):
    '''Parse the comic's index page and return (title, url) of its latest chapter.'''
    result = dict()
    req = urllib2.Request(comic_url)
    req.add_header('User-Agent', user_agent)
    print("url=", comic_url)
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response, "lxml")
    # The page lists the most recent ~20 chapters; the full chapter list uses
    # the same "chapter-page-new works-chapter-list" class. The last <a> in it
    # is the newest chapter.
    page = soup.find(attrs={"class": "chapter-page-new works-chapter-list"}).find_all("a")
    title = page[-1]['title']
    result[title] = ROOT_URL + page[-1]['href']
    print("title=", title)
    print("result=", result[title])
    return title, result[title]


def downloadComics(dir_path, urls):
    for url in urls:
        # Use the tail of the image URL as the local file name
        urllib2.urlretrieve(url, dir_path + url[-8:-2])


if __name__ == "__main__":
    title, result_url = getUrls(target_url[0])
    urls = getImageUrls(result_url)
    path = dir_path + title + "/"
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
    downloadComics(path, urls)
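The intro says the script can crawl any comic, yet getUrls returns only the newest chapter. Below is a minimal sketch that walks the entire chapter list instead. getAllUrls is a hypothetical helper of mine, not part of the original script; the CSS selector is the one the original code already uses, so it may break if Tencent changes the page layout.

def getAllUrls(comic_url):
    # Hypothetical helper: return every (title, url) pair in the chapter list
    # instead of only the last one, reusing urllib2/user_agent/ROOT_URL above.
    req = urllib2.Request(comic_url)
    req.add_header('User-Agent', user_agent)
    soup = BeautifulSoup(urllib2.urlopen(req), "lxml")
    chapters = soup.find(attrs={"class": "chapter-page-new works-chapter-list"})
    # Each <a> carries the chapter title and a site-relative link
    return [(a['title'], ROOT_URL + a['href']) for a in chapters.find_all("a")]

Looping over its result and calling getImageUrls and downloadComics once per chapter would extend the script to full-series downloads.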
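One caveat with downloadComics: urlretrieve sends no custom headers, and image CDNs often reject requests that lack a browser User-Agent or Referer. Here is a sketch of one workaround, installing a global opener; whether ac.qq.com's image host actually checks these headers is an assumption on my part.

# Assumption: the image host checks User-Agent/Referer. Installing a global
# opener makes every later urlretrieve/urlopen call send these headers.
opener = urllib2.build_opener()
opener.addheaders = [('User-Agent', user_agent), ('Referer', ROOT_URL)]
urllib2.install_opener(opener)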
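Finally, PhantomJS has been discontinued and newer Selenium releases dropped its driver, so the same scroll-and-collect loop can run on headless Chrome instead. This is only a sketch: it assumes chromedriver is on PATH and a Selenium version that supports By-style lookups (both 3.x and 4.x do), and getImageUrlsChrome is a hypothetical replacement for getImageUrls, not the original author's code.

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def getImageUrlsChrome(comic_url):
    # Hypothetical drop-in replacement for getImageUrls using headless Chrome
    opts = Options()
    opts.add_argument('--headless')  # no visible browser window
    browser = webdriver.Chrome(options=opts)
    browser.get(comic_url)
    urls = []
    imgs = browser.find_elements(By.XPATH, "//ul[@id='comicContain']//img")
    for i, img in enumerate(imgs):
        urls.append(img.get_attribute("src"))
        # Same trick as above: scroll one viewport per image to trigger lazy loading
        browser.execute_script("window.scrollTo(0, %d)" % ((i + 1) * 1280))
        time.sleep(randint(2, 4))
    browser.quit()
    return urls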