最近簡單地看了下python爬蟲的視頻。便自己嘗試寫了下爬蟲操作,計划的是把某一個網站上的美女圖全給爬下來,不過經過計算,差不多有好幾百G的樣子,還是算了。就首先下載一點點先看看。
本次爬蟲使用的是python2.7的版本,並且本次的目標網站並沒有采用js來加載圖片,所以沒有涉及對js腳本的解析,都是通過分析html文件,用正則來一步步提取圖片網址,然后存起來。
首先這個網站有很多分類,到美女圖這個子網頁,可以發現有很多頁,同時每頁有多個相冊,每個相冊點進去就會有多個頁,每頁有多張照片
流程大概是這樣
找到所有頁數
----遍歷所有的頁數
----遍歷當前頁的所有相冊(給每個相冊建立一個目錄)
----遍歷當前相冊的所有圖片(遍歷此相冊的所有頁(遍歷當前頁的所有照片並找到圖片的url))
----獲得圖片url就存起來
不說了,直接上代碼
這個版本是windows上的運行版本
import urllib
import re
import os
import time
import socket
def get_html(url):
    """Fetch *url* and return its HTML re-encoded from GBK to UTF-8 bytes.

    Raises IOError/socket.timeout on network failure and UnicodeError when
    the page is not valid GBK (the site serves GBK-encoded pages).
    """
    # Guard against servers that hang: fail after 10 seconds.
    socket.setdefaulttimeout(10)
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        # Close the connection; the original leaked the socket.
        page.close()
    # Normalize to UTF-8 so downstream regexes/paths work uniformly (Python 2).
    html = unicode(html, "gbk").encode("utf8")
    return html
def get_img(html):
    """Download every <img src="..."> in *html* into the .\\photo\\ directory.

    Files are numbered with the module-level counter ``x``, which is advanced
    per image. (Helper kept from an earlier iteration; the main script uses
    get_photo instead.)
    """
    imgre = re.compile(r'<img src="(.*?)"')
    imglist = re.findall(imgre, html)
    # Declare once, outside the loop (the original re-declared it per iteration).
    global x
    for imgurl in imglist:
        print(imgurl)
        # '\%' and '\\%' are the same string in Python; use the explicit form.
        urllib.urlretrieve(imgurl, '.\\photo\\%05d.jpg' % x)
        x += 1
        print("正在下載第%s張圖片" % x)
def get_tag_list(html):
    """Return all tag-page URLs (http://www.5442.com/tag/*.html) linked in *html*."""
    # Dots escaped so '.' matches a literal dot (the original regex was too loose).
    szurlre = re.compile(r'<a href="(http://www\.5442\.com/tag/.*?\.html)" class')
    return re.findall(szurlre, html)
def get_page_num(html):
    """Return the listing's total page count parsed from the '末頁' (last page) link.

    Returns 0 when the marker is not present in *html*.
    """
    # '\.' instead of '.' so e.g. '12xhtml' is not accepted.
    szurlre = re.compile(r'(\d+)\.html\'>末頁')
    szresult = re.findall(szurlre, html)
    page_num = int(szresult[0]) if szresult else 0
    print(page_num)
    return page_num
def get_page_num2(html):
    """Return an album's page count parsed from the '共N頁' text, or 0 if absent."""
    szurlre = re.compile(r'共(\d+)頁')
    szresult = re.findall(szurlre, html)
    page_num = int(szresult[0]) if szresult else 0
    print(page_num)
    return page_num
# Albums linked on a single listing page.
def get_ablum_list(html):
    """Return album URLs (http://www.5442.com/meinv/2*/N.html) found in *html*."""
    # Dots escaped to match literally; '2\d+' is the year-based directory.
    szurlre = re.compile(r'(http://www\.5442\.com/meinv/2\d+/\d+\.html)" target=')
    return re.findall(szurlre, html)
# Album title, taken from the page <title>.
def get_ablum_name(html):
    """Return the page's <title> text, or '' when none exists.

    The original indexed [0] unconditionally and raised IndexError on pages
    without a title.
    """
    szurlre = re.compile(r'<title>(\S+)</title>')
    ablum_name = re.findall(szurlre, html)
    return ablum_name[0] if ablum_name else ''
# Photos on a single album page.
def get_photo(html, dir, photo_num):
    """Download every full-size jpg linked on one album page.

    dir       -- album sub-directory name (UTF-8 bytes)
    photo_num -- index of the next file; the updated value is returned so the
                 caller can continue numbering across pages.
    """
    # Dot before 'jpg' escaped to match literally.
    imgre = re.compile(r'點擊圖片進入下一頁\' ><img src=\'(http://\S+\.jpg)\' alt=')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        try:
            # Short timeout: skip dead image hosts quickly rather than hanging.
            socket.setdefaulttimeout(2)
            urllib.urlretrieve(imgurl, unicode('.\\photo\\%s\\%05d.jpg' % (dir, photo_num), "utf8"))
            print("正在下載第%s張圖片" % photo_num)
            photo_num = photo_num + 1
        except IOError:
            # Best-effort: one failed download must not abort the album.
            continue
    return photo_num
# ---- main (Windows paths) -------------------------------------------------
url = "http://www.5442.com/meinv/"
baseurl = "http://www.5442.com"
html = get_html(url)
page_num = get_page_num(html)
print("一共有%s頁" % page_num)
ablum_num = 0
try:
    os.mkdir("photo")
except OSError:
    print("目錄已經存在,繼續下載")
# Walk every listing page, 1..page_num inclusive
# (the original range(1, page_num) silently dropped the last page).
for i in range(1, page_num + 1):
    if i != 1:
        # Page 1 is the bare index; later pages use list_1_N.html.
        url = "http://www.5442.com/meinv/list_1_%s.html" % i
    try:
        html = get_html(url)
    except (IOError, UnicodeError):
        continue
    ablum_list = get_ablum_list(html)
    # Every album on the current listing page.
    for ablum_url in ablum_list:
        ablum_num = ablum_num + 1
        try:
            photo_html = get_html(ablum_url)
        except (IOError, UnicodeError):
            continue
        url_part = ablum_url[0:-5]  # strip '.html'; page N of the album is url_part + '_N.html'
        photo_page_num = get_page_num2(photo_html)
        # Title parsing is unreliable, so albums are numbered sequentially instead.
        #ablum_name = get_ablum_name(photo_html)
        ablum_name = "編程資料" + "%05d" % ablum_num
        print(ablum_name)
        photo_num = 0
        # Create the album directory; skip the whole album when it already exists.
        ui_ablum_name = unicode(ablum_name, "utf8")
        try:
            os.mkdir(".\\photo\\" + ui_ablum_name)
        except OSError:
            continue
        # Album pages, 1..photo_page_num inclusive; 'j' instead of reusing
        # (and shadowing) the outer loop's 'i'.
        for j in range(1, photo_page_num + 1):
            if j != 1:
                ablum_url = url_part + "_%d" % j + ".html"
                try:
                    photo_html = get_html(ablum_url)
                except (IOError, UnicodeError):
                    continue
            photo_num = get_photo(photo_html, ablum_name, photo_num)
運行效果截圖:
這樣就運行成功了。
以下是linux下的運行代碼,主要是編碼和存儲的路徑格式不一樣
#!/usr/bin/python
# -*- coding:utf8 -*-
import urllib
import re
import os
import time
import socket
def get_html(url):
    """Fetch *url* and return its HTML re-encoded from GBK to UTF-8 bytes.

    Raises IOError/socket.timeout on network failure and UnicodeError when
    the page is not valid GBK (the site serves GBK-encoded pages).
    """
    # Fail fast on slow servers (the Linux version uses a 2 s timeout).
    socket.setdefaulttimeout(2)
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        # Close the connection; the original leaked the socket.
        page.close()
    html = unicode(html, "gbk").encode("utf8")
    return html
def get_img(html):
    """Download every <img src="..."> in *html* into ./photo/.

    Files are numbered with the module-level counter ``x``. (Helper kept from
    an earlier iteration; the main script uses get_photo instead.)
    """
    imgre = re.compile(r'<img src="(.*?)"')
    imglist = re.findall(imgre, html)
    # Declare once, outside the loop.
    global x
    for imgurl in imglist:
        print(imgurl)
        # POSIX path — the original left a Windows '.\photo\' path in the Linux version.
        urllib.urlretrieve(imgurl, './photo/%05d.jpg' % x)
        x += 1
        print("正在下載第%s張圖片" % x)
def get_tag_list(html):
    """Return all tag-page URLs (http://www.5442.com/tag/*.html) linked in *html*."""
    # Dots escaped so '.' matches a literal dot (the original regex was too loose).
    szurlre = re.compile(r'<a href="(http://www\.5442\.com/tag/.*?\.html)" class')
    return re.findall(szurlre, html)
def get_page_num(html):
    """Return the listing's total page count parsed from the '末頁' (last page) link.

    Returns 0 when the marker is not present in *html*.
    """
    # '\.' instead of '.' so e.g. '12xhtml' is not accepted.
    szurlre = re.compile(r'(\d+)\.html\'>末頁')
    szresult = re.findall(szurlre, html)
    page_num = int(szresult[0]) if szresult else 0
    print(page_num)
    return page_num
def get_page_num2(html):
    """Return an album's page count parsed from the '共N頁' text, or 0 if absent."""
    szurlre = re.compile(r'共(\d+)頁')
    szresult = re.findall(szurlre, html)
    page_num = int(szresult[0]) if szresult else 0
    print(page_num)
    return page_num
# Albums linked on a single listing page.
def get_ablum_list(html):
    """Return album URLs (http://www.5442.com/meinv/2*/N.html) found in *html*."""
    # Dots escaped to match literally; '2\d+' is the year-based directory.
    szurlre = re.compile(r'(http://www\.5442\.com/meinv/2\d+/\d+\.html)" target=')
    return re.findall(szurlre, html)
# Album title, taken from the page <title>.
def get_ablum_name(html):
    """Return the page's <title> text, or '' when none exists.

    The original indexed [0] unconditionally and raised IndexError on pages
    without a title.
    """
    szurlre = re.compile(r'<title>(\S+)</title>')
    ablum_name = re.findall(szurlre, html)
    return ablum_name[0] if ablum_name else ''
# Photos on a single album page.
def get_photo(html, dir, photo_num):
    """Download every full-size jpg linked on one album page.

    dir       -- album sub-directory name
    photo_num -- index of the next file; the updated value is returned so the
                 caller can continue numbering across pages.
    """
    # Dot before 'jpg' escaped to match literally.
    imgre = re.compile(r'點擊圖片進入下一頁\' ><img src=\'(http://\S+\.jpg)\' alt=')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        try:
            # Short timeout: skip dead image hosts quickly rather than hanging.
            socket.setdefaulttimeout(2)
            # Single separators — the original wrote './photo//%s//%05d.jpg'.
            urllib.urlretrieve(imgurl, './photo/%s/%05d.jpg' % (dir, photo_num))
            print("正在下載第%s張圖片" % photo_num)
            photo_num = photo_num + 1
        except IOError:
            # Best-effort: one failed download must not abort the album.
            continue
    return photo_num
# ---- main (POSIX paths) ---------------------------------------------------
url = "http://www.5442.com/meinv/"
baseurl = "http://www.5442.com"
html = get_html(url)
page_num = get_page_num(html)
print("一共有%s頁" % page_num)
ablum_num = 0
try:
    os.mkdir("./photo")
except OSError:
    print("目錄已經存在")
# Walk every listing page, 1..page_num inclusive
# (the original range(1, page_num) silently dropped the last page).
for i in range(1, page_num + 1):
    if i != 1:
        # Page 1 is the bare index; later pages use list_1_N.html.
        url = "http://www.5442.com/meinv/list_1_%s.html" % i
    try:
        html = get_html(url)
    except (IOError, UnicodeError):
        continue
    ablum_list = get_ablum_list(html)
    for ablum_url in ablum_list:
        ablum_num = ablum_num + 1
        try:
            photo_html = get_html(ablum_url)
        except (IOError, UnicodeError):
            continue
        url_part = ablum_url[0:-5]  # strip '.html'; page N of the album is url_part + '_N.html'
        photo_page_num = get_page_num2(photo_html)
        ablum_name = "編程資料" + "%05d" % ablum_num
        print(ablum_name)
        photo_num = 0
        # Create the album directory; skip the whole album when it already exists.
        ui_ablum_name = ablum_name
        try:
            os.mkdir("./photo/" + ui_ablum_name)
        except OSError:
            continue
        # Album pages, 1..photo_page_num inclusive; 'j' instead of reusing
        # (and shadowing) the outer loop's 'i'.
        for j in range(1, photo_page_num + 1):
            if j != 1:
                ablum_url = url_part + "_%d" % j + ".html"
                try:
                    photo_html = get_html(ablum_url)
                except (IOError, UnicodeError):
                    continue
            photo_num = get_photo(photo_html, ablum_name, photo_num)
運行效果:
保存目錄