A crawler script I put together recently in my spare time; it can automatically scrape any title on Tencent Comics (ac.qq.com). The idea:
1. First get the URL of the comic you want to download; getUrls is used here to fetch the link to the comic's latest chapter directly.
2. Then open that chapter and collect the URLs of the images to download.
3. Download the images to a local folder.
The full script:
import os
import random
import time
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
import urllib.request as urllib2
ROOT_URL = "http://ac.qq.com"
target_url = [
    ROOT_URL + "/Comic/comicInfo/id/505430",  # One Piece (海賊王)
]
# pool of User-Agent strings; one is picked at random per run
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
user_agent = random.choice(ua_list)
dir_path = "D:/py/海賊王/"  # local root folder for downloaded chapters
def getImageUrls(comic_url):
    '''
    Fetch the dynamically generated image URLs via Selenium and PhantomJS.
    '''
    urls = []
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/4.0 (compatible; MSIE 5.5; windows NT)")
    browser = webdriver.PhantomJS(executable_path=r"E:\py\littlepy\tencent_cartoon\phantomjs-2.1.1-windows\bin\phantomjs.exe",
                                  desired_capabilities=dcap)
    browser.get(comic_url)
    imgs = browser.find_elements_by_xpath("//div[@id='mainView']/ul[@id='comicContain']//img")
    for i in range(0, len(imgs) - 1):
        if i == 1:  # skip the ad image
            continue
        urls.append(imgs[i].get_attribute("src"))
        # scroll down so the next lazy-loaded image gets rendered
        js = 'window.scrollTo(800, ' + str((i + 1) * 1280) + ')'
        browser.execute_script(js)
        time.sleep(randint(3, 6))  # random pause to let the page load (and look less bot-like)
    browser.quit()
    print("urls=", urls)
    return urls
def getUrls(comic_url):
    '''
    Parse the comic's index page and return the title and URL of the latest chapter.
    '''
    result = dict()
    req = urllib2.Request(comic_url)
    req.add_header('User-Agent', user_agent)
    print("url=", comic_url)
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response, "lxml")
    # the "chapter-page-new works-chapter-list" block holds the full chapter list
    page = soup.find(attrs={"class": "chapter-page-new works-chapter-list"}).find_all("a")
    title = page[-1]['title']  # the last <a> is the newest chapter
    result[title] = ROOT_URL + page[-1]['href']
    print("title=", title)
    print("result=", result[title])
    return title, result[title]
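# Aside (not in the original post) — a minimal sketch: the same
# "chapter-page-new works-chapter-list" block holds a link for every chapter,
# so collecting them all instead of only page[-1] could look like this.
# getAllChapterUrls is a hypothetical helper name; it reuses the globals above.
def getAllChapterUrls(comic_url):
    req = urllib2.Request(comic_url)
    req.add_header('User-Agent', user_agent)
    soup = BeautifulSoup(urllib2.urlopen(req), "lxml")
    links = soup.find(attrs={"class": "chapter-page-new works-chapter-list"}).find_all("a")
    # map each chapter title to its absolute URL, in page order
    return {a['title']: ROOT_URL + a['href'] for a in links}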
def downloadComics(dir_path, urls):
    for url in urls:
        # derive a short file name from the tail of the image URL (naming scheme from the original)
        urllib2.urlretrieve(url, dir_path + url[-8:-2])
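# Aside (not in the original post): urlretrieve sends urllib's default
# "Python-urllib/x.y" User-Agent, which some image hosts reject. A hedged
# alternative that reuses the randomized user_agent above (downloadWithUA is
# a hypothetical name):
def downloadWithUA(url, file_path):
    req = urllib2.Request(url)
    req.add_header('User-Agent', user_agent)
    with open(file_path, 'wb') as f:
        f.write(urllib2.urlopen(req).read())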
if __name__ == "__main__":
    title, result_url = getUrls(target_url[0])
    urls = getImageUrls(result_url)
    path = dir_path + title + "/"
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created successfully')
    downloadComics(path, urls)
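One caveat: PhantomJS development has been suspended and recent Selenium releases no longer ship a PhantomJS driver, so the script above only runs on older Selenium versions. A minimal sketch of the same scraping step on headless Chrome instead (assuming the Selenium 3.x API and a chromedriver binary on PATH; getImageUrlsChrome is a name I made up):

import time
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def getImageUrlsChrome(comic_url):
    opts = Options()
    opts.add_argument("--headless")
    browser = webdriver.Chrome(options=opts)
    browser.get(comic_url)
    urls = []
    imgs = browser.find_elements_by_xpath("//div[@id='mainView']/ul[@id='comicContain']//img")
    for i in range(0, len(imgs) - 1):
        if i == 1:  # skip the ad slot, same as the PhantomJS version
            continue
        urls.append(imgs[i].get_attribute("src"))
        # scroll to trigger the next lazy-loaded image, then pause
        browser.execute_script("window.scrollTo(800, %d)" % ((i + 1) * 1280))
        time.sleep(randint(3, 6))
    browser.quit()
    return urls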
