python_crawler: batch-downloading files


This is my first Python 3 web crawler, written with reference to the book *Python网络数据采集* (the Chinese edition of *Web Scraping with Python*). Its main job is to crawl a target site and batch-download every .rar, .doc, .docx, and .zip file it links to.

Planned improvements: recognize downloadable files by their extension (a sketch follows below); handle large sites, which will require a Bloom filter to deduplicate visited URLs (see the sketch after the script); and study the target site's anti-crawling mechanisms.
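As a first cut at the extension-based idea, the link filter might look like the following. This is only a minimal sketch: the helper name is_downloadable and the extension list are my own illustration, not part of the original script.

    from urllib.parse import urlparse

    # Hypothetical helper: keep only links whose path ends in a wanted extension.
    WANTED_EXTS = (".rar", ".doc", ".docx", ".zip")

    def is_downloadable(href):
        # Strip any query string before checking the extension.
        path = urlparse(href).path.lower()
        return path.endswith(WANTED_EXTS)

With that in place, the findAll call in getLinks could become my_files = [a for a in bsObj.findAll("a", href=True) if is_downloadable(a["href"])], instead of matching on the /uploads/attachments/ path.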

# -*- coding: utf-8 -*-

import os
import re
from urllib.request import urlopen, urlretrieve
from urllib.parse import quote

from bs4 import BeautifulSoup

downloadDirectory = "downloaded"
baseUrl = "http://computer.hdu.edu.cn"

def contains_chinese(text):
    # True if any character falls in the CJK range U+2E80..U+FE4F.
    return any(u'\u2E80' <= ch <= u'\uFE4F' for ch in text)

def getAbsoluteURL(baseUrl, source):
    # Normalize relative and protocol-less links to absolute URLs on baseUrl.
    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        url = baseUrl + source
    if baseUrl not in url:
        # Skip links that point off-site.
        return None
    return url

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    # Mirror the URL's path under downloadDirectory, creating directories as needed.
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print(path)
    return path


pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen(baseUrl + pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    try:
        print(bsObj.h1.get_text())
        print(bsObj.h2.get_text())
        print(bsObj.h3.get_text())
        # To match .doc attachments only:
        # my_files = bsObj.findAll("a", {"href": re.compile(r"/uploads/attachments/.*\.doc")})
        my_files = bsObj.findAll("a", {"href": re.compile(r"/uploads/attachments/")})

        for my_file in my_files:
            # Percent-encode hrefs that contain Chinese characters,
            # otherwise urlretrieve fails on the non-ASCII URL.
            if contains_chinese(my_file["href"]):
                my_file["href"] = quote(my_file["href"])
            url = getAbsoluteURL(baseUrl, my_file["href"])
            print(url)
            if url is not None:
                urlretrieve(url, getDownloadPath(baseUrl, url, downloadDirectory))
    except AttributeError:
        print("This page is missing something! No worries though!")

    # Recurse into internal pages under /index.php/.
    for link in bsObj.findAll("a", href=re.compile(r"^(/index\.php/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                print("----------------\n" + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")

