python爬蟲學習-爬取某個網站上的所有圖片


最近簡單地看了下python爬蟲的視頻。便自己嘗試寫了下爬蟲操作,計划的是把某一個網站上的美女圖全給爬下來,不過經過計算,差不多有好幾百G的樣子,還是算了。就首先下載一點點先看看。
本次爬蟲使用的是python2.7的版本,並且本次的目標網站並沒有采用js來加載圖片,所以沒有涉及對js腳本的解析,都是通過分析html文件,用正則來一步步提取圖片網址,然后存起來。
首先這個網站有很多分類,到美女圖這個子網頁,可以發現有很多頁,同時每頁有多個相冊,每個相冊點進去就會有多個頁,每頁有多張照片
流程大概是這樣
找到所有頁數
----遍歷所有的頁數
----遍歷當前頁的所有相冊(給每個相冊建立一個目錄)
----遍歷當前相冊的所有圖片(遍歷此相冊的所有頁(遍歷當前頁的所有照片並找到圖片的url))
----獲得圖片url就存起來
不說了,直接上代碼
這個版本是windows上的運行版本

import urllib
import re
import os
import time
import socket
def get_html(url):
    """Fetch *url* and return its HTML transcoded from GBK to UTF-8.

    The target site serves GBK-encoded pages, while the regexes in this
    file are written against UTF-8 text, so the payload is re-encoded here.
    Network errors/timeouts propagate to the caller (callers catch broadly).
    """
    # global default timeout so urlopen cannot hang forever on a dead host
    socket.setdefaulttimeout(10)
    papg = urllib.urlopen(url)
    try:
        html = papg.read()
    finally:
        papg.close()  # fix: the response object was previously leaked
    html = unicode(html, "gbk").encode("utf8")
    return html

def get_img(html):
    # Legacy/unused helper: grabs every <img src="..."> URL on a page and
    # saves each one as .\photo\NNNNN.jpg, numbered by the module-global
    # counter x.
    # NOTE(review): x is never initialised anywhere in this file, so calling
    # this function would raise NameError -- the main script uses
    # get_photo() instead. Kept as-is for reference.
    imgre = re.compile(r'<img src="(.*?)"')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        print imgurl
        global x
        urllib.urlretrieve(imgurl, '.\\photo\%05d.jpg'%x)
        x += 1
        print("正在下載第%s張圖片"%x)

def get_tag_list(html):
    """Return every tag-page URL (http://www.5442.com/tag/*.html) in *html*.

    fix: the dots in the pattern were unescaped and therefore matched any
    character; they now match literal "." only, so near-miss URLs such as
    "...tagXhtml" can no longer slip through.
    """
    szurlre = re.compile(r'<a href="(http://www\.5442\.com/tag/.*?\.html)" class')
    tag_list = re.findall(szurlre, html)
    return tag_list
    
def get_page_num(html):
    """Return the listing's page count parsed from its "末頁" (last page) link.

    The pager ends with <a href='N.html'>末頁</a> where N is the total page
    count. Returns 0 when no such link is found.
    fix: escaped the dot before "html" (it previously matched any character)
    and switched to the parenthesised print form, which prints the same
    thing on Python 2 and is also valid Python 3 syntax.
    """
    szurlre = re.compile(r'(\d+)\.html\'>末頁')
    szresult = re.findall(szurlre, html)
    if len(szresult) == 0:
        page_num = 0
    else:
        page_num = int(szresult[0])
    print(page_num)
    return page_num

def get_page_num2(html):
    """Return an album's page count parsed from its "共N頁" marker, 0 if absent."""
    match = re.search(r'共(\d+)頁', html)
    page_num = int(match.group(1)) if match else 0
    print(page_num)
    return page_num

# get every album URL on one listing page
def get_ablum_list(html):
    """Return all album URLs (http://www.5442.com/meinv/2*/N.html) in *html*.

    fix: escaped the literal dots in the pattern (an unescaped "." matches
    any character) and removed a stray trailing semicolon.
    """
    szurlre = re.compile(r'(http://www\.5442\.com/meinv/2\d+/\d+\.html)" target=')
    ablum_list = re.findall(szurlre, html)
    return ablum_list
# get the album's name from its <title> element
def get_ablum_name(html):
    """Return the text of the first <title> element in *html*.

    fix: the old pattern required the title to contain no whitespace, so any
    title with a space failed to match and the [0] lookup crashed with
    IndexError (the entry script even notes that title extraction was
    broken). The pattern now accepts any text up to the closing tag. It
    still raises IndexError when the page has no <title> at all.
    """
    szurlre = re.compile(r'<title>([^<]+)</title>')
    ablum_name = re.findall(szurlre, html)
    return ablum_name[0]
# download every photo listed on one album page
def get_photo(html, dir, photo_num):
    # Pull each full-size image URL out of an album page and save it as
    # .\photo\<dir>\NNNNN.jpg, numbering on from photo_num.
    # Returns the next unused number so the caller can keep counting across
    # the album's remaining pages.
    # NOTE(review): `dir` shadows the builtin of the same name, and the bare
    # except silently skips any photo whose download fails or times out
    # (deliberate best-effort policy -- one dead link must not stop the album).
    imgre = re.compile(r'點擊圖片進入下一頁\' ><img src=\'(http://\S+.jpg)\' alt=')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        try:
            # 2-second timeout per download so one stalled image cannot hang the crawl
            socket.setdefaulttimeout(2)
            urllib.urlretrieve(imgurl, unicode('.\\photo\\%s\%05d.jpg'%(dir, photo_num), "utf8"))
            print("正在下載第%s張圖片"%photo_num)
            photo_num = photo_num + 1
        except:
            continue
    return photo_num

url = "http://www.5442.com/meinv/"
baseurl = "http://www.5442.com"
html = get_html(url)
page_num = get_page_num(html)
print ("一共有%s頁"%page_num)
ablum_num = 0
try:
    os.mkdir("photo")
except:
    print "目錄已經存在,繼續下載"
#遍歷所有的頁
for i in range(1, page_num):
    if i != 1:
        url = "http://www.5442.com/meinv/list_1_%s.html"%i
        try:
            html = get_html(url)
        except:
            continue
    ablum_list = get_ablum_list(html)
    #遍歷當前頁的所有相冊
    for ablum_url in ablum_list:
        ablum_num = ablum_num + 1
        try:
            photo_html = get_html(ablum_url)
        except:
            continue
        url_part = ablum_url[0:-5]
        photo_page_num = get_page_num2(photo_html)
        #獲取相冊名有點問題,直接以數字來創建更加方便,便於分
        #ablum_name = get_ablum_name(photo_html)
        ablum_name = "編程資料" + "%05d" % ablum_num
        print ablum_name
        photo_num = 0
        #創建相冊對應的目錄
        ui_ablum_name = unicode(ablum_name, "utf8")
        try:
            os.mkdir(".\\photo\\"+ui_ablum_name)
        except:
            continue
        for i in range(1, photo_page_num):
            if i != 1:
                ablum_url = url_part + "_%d"%i + ".html"
                try:
                    photo_html = get_html(ablum_url)
                except:
                    continue
            #進行存儲操作
            photo_num = get_photo(photo_html, ablum_name, photo_num)

運行效果截圖:

這樣就運行成功了。

以下是linux下的運行代碼,主要是編碼和存儲的路徑格式不一樣

#!/usr/bin/python
# -*- coding:utf8 -*-

import urllib
import re
import os
import time
import socket
def get_html(url):
    """Fetch *url* and return its HTML transcoded from GBK to UTF-8.

    The target site serves GBK-encoded pages, while the regexes in this
    file are written against UTF-8 text, so the payload is re-encoded here.
    Network errors/timeouts propagate to the caller (callers catch broadly).
    """
    # global default timeout so urlopen cannot hang forever on a dead host
    socket.setdefaulttimeout(2)
    papg = urllib.urlopen(url)
    try:
        html = papg.read()
    finally:
        papg.close()  # fix: the response object was previously leaked
    html = unicode(html, "gbk").encode("utf8")
    return html

def get_img(html):
    # Legacy/unused helper: grabs every <img src="..."> URL on a page and
    # saves it, numbered by the module-global counter x.
    # NOTE(review): x is never initialised anywhere in this file, so calling
    # this would raise NameError; also the save path is still Windows-style
    # ("\\") -- looks like a leftover copy from the Windows version. The main
    # script uses get_photo() instead.
    imgre = re.compile(r'<img src="(.*?)"')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        print imgurl
        global x
        urllib.urlretrieve(imgurl, '.\\photo\%05d.jpg'%x)
        x += 1
        print("正在下載第%s張圖片"%x)

def get_tag_list(html):
    """Return every tag-page URL (http://www.5442.com/tag/*.html) in *html*.

    fix: the dots in the pattern were unescaped and therefore matched any
    character; they now match literal "." only, so near-miss URLs such as
    "...tagXhtml" can no longer slip through.
    """
    szurlre = re.compile(r'<a href="(http://www\.5442\.com/tag/.*?\.html)" class')
    tag_list = re.findall(szurlre, html)
    return tag_list

def get_page_num(html):
    """Return the listing's page count parsed from its "末頁" (last page) link.

    The pager ends with <a href='N.html'>末頁</a> where N is the total page
    count. Returns 0 when no such link is found.
    fix: escaped the dot before "html" (it previously matched any character)
    and switched to the parenthesised print form, which prints the same
    thing on Python 2 and is also valid Python 3 syntax.
    """
    szurlre = re.compile(r'(\d+)\.html\'>末頁')
    szresult = re.findall(szurlre, html)
    if len(szresult) == 0:
        page_num = 0
    else:
        page_num = int(szresult[0])
    print(page_num)
    return page_num

def get_page_num2(html):
    """Return an album's page count from its "共N頁" marker (0 when missing)."""
    pattern = re.compile(r'共(\d+)頁')
    hits = pattern.findall(html)
    if hits:
        page_num = int(hits[0])
    else:
        page_num = 0
    print(page_num)
    return page_num

# get every album URL on one listing page
def get_ablum_list(html):
    """Return all album URLs (http://www.5442.com/meinv/2*/N.html) in *html*.

    fix: escaped the literal dots in the pattern (an unescaped "." matches
    any character) and removed a stray trailing semicolon.
    """
    szurlre = re.compile(r'(http://www\.5442\.com/meinv/2\d+/\d+\.html)" target=')
    ablum_list = re.findall(szurlre, html)
    return ablum_list
# get the album's name from its <title> element
def get_ablum_name(html):
    """Return the text of the first <title> element in *html*.

    fix: the old pattern required the title to contain no whitespace, so any
    title with a space failed to match and the [0] lookup crashed with
    IndexError. The pattern now accepts any text up to the closing tag. It
    still raises IndexError when the page has no <title> at all.
    """
    szurlre = re.compile(r'<title>([^<]+)</title>')
    ablum_name = re.findall(szurlre, html)
    return ablum_name[0]
# download every photo listed on one album page
def get_photo(html, dir, photo_num):
    """Download every full-size photo found on one album page.

    html      -- UTF-8 page source of one album page
    dir       -- album directory name under ./photo (created by the caller)
    photo_num -- first index to number the saved files with
    Returns the next unused index so the caller can keep counting across
    the album's remaining pages.
    """
    # fix: escaped the dot in "\.jpg" -- previously "." matched any
    # character, so e.g. "...Xjpg" could also have matched
    imgre = re.compile(r'點擊圖片進入下一頁\' ><img src=\'(http://\S+\.jpg)\' alt=')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        try:
            # 2-second timeout per download so one dead link cannot stall the crawl
            socket.setdefaulttimeout(2)
            # fix: build the path with os.path.join instead of doubled "//" separators
            urllib.urlretrieve(imgurl, os.path.join('.', 'photo', dir, '%05d.jpg' % photo_num))
            print("正在下載第%s張圖片"%photo_num)
            photo_num = photo_num + 1
        except Exception:
            # best-effort: skip a photo that fails or times out and keep the
            # rest, but no longer swallow KeyboardInterrupt/SystemExit
            continue
    return photo_num

url = "http://www.5442.com/meinv/"
baseurl = "http://www.5442.com"
html = get_html(url)
page_num = get_page_num(html)
print ("一共有%s頁"%page_num)
ablum_num = 0
try:
    os.mkdir("./photo")
except:
    print "目錄已經存在"
for i in range(1, page_num):
    if i != 1:
        url = "http://www.5442.com/meinv/list_1_%s.html"%i
        try:
            html = get_html(url)
        except:
            continue
    ablum_list = get_ablum_list(html)
    for ablum_url in ablum_list:
        ablum_num = ablum_num + 1
        try:
            photo_html = get_html(ablum_url)
        except:
            continue
        url_part = ablum_url[0:-5]
        photo_page_num = get_page_num2(photo_html)
        ablum_name = "編程資料" + "%05d" % ablum_num
        print ablum_name
        photo_num = 0
        #創建相冊對應的目錄
        ui_ablum_name = ablum_name
        try:
            os.mkdir("./photo/"+ui_ablum_name)
        except:
            continue
        for i in range(1, photo_page_num):
            if i != 1:
                ablum_url = url_part + "_%d"%i + ".html"
                try:
                  photo_html = get_html(ablum_url)
                except:
                  continue
            photo_num = get_photo(photo_html, ablum_name, photo_num)

運行效果:

保存目錄


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM