使用Python編寫一個壁紙網站的簡單爬蟲


目標網站:http://www.netbian.com/

目的:實現對壁紙各分類的第一頁壁紙的獲取

 一:分析網站,編寫代碼:

(ps:源代碼在文章的最後)

1.獲取網站目錄部分的一大段代碼,下一步再進行仔細匹配網址與標題.

 1 #coding=gbk
 2 # Goal: download the wallpapers (full-size images) of every category
 3 __author__ = 'CQC'
 4 import urllib2
 5 import urllib
 6 import re
 7 import os
 8 
 9 # Create the wallpaper download folder if it does not exist yet
10 path = 'd:\\彼岸壁紙'
11 if not os.path.isdir(path):
12     os.makedirs(path)
13 # Category titles
14 big_title = []
15 
16 # Open the home page; the request carries a browser-like User-agent header
17 url = 'http://www.netbian.com/' 
18 headers = {'User-agent' : 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}
19 request = urllib2.Request(url,headers = headers)
20 response = urllib2.urlopen(request)
21 
22 # Cut the category menu markup out of the home page (code_menu is None when the pattern does not match)
23 pat_menu = re.compile('<ul class="menu">(.*?)</a></div>',re.S)
24 code_menu = re.search(pat_menu,response.read())

 如圖:

2.進行分類的標題與鏈接的匹配。

 1 # Category titles: the title attribute of every <a> tag inside the menu markup
 2 pat_menu_title = re.compile('<a href=".*?" title="(.*?)">',re.S)
 3 menu_title = re.findall(pat_menu_title,code_menu.group(1))
 4 for a_item in menu_title:
 5     big_title.append(a_item)
 6     print a_item
 7     
 8 # Category links: the href attribute of the same <a> tags, in the same order
 9 pat_menu_link = re.compile('<a href="(.*?)" title=".*?">',re.S)
10 menu_link = re.findall(pat_menu_link,code_menu.group(1))

 如下圖所示:

3.從爬取到的目錄進入,獲得該目錄下所有壁紙的標題與鏈接.

 1 # Visit every category page
 2 j = 0  # index of the current category in big_title
 3 for b_item in menu_link:
 4     url_menu = 'http://www.netbian.com/' + b_item
 5     request_son = urllib2.Request(url_menu,headers = headers)
 6     response_son = urllib2.urlopen(request_son)
 7     # Collect the title and the link of every picture of this category
 8     
 9     # Picture titles: the alt attribute of the <img> tags that carry a data-src attribute
10     title_son = []
11     pat_title_son = re.compile('<img src=".*?" data-src=".*?" alt="(.*?)"/>',re.S)
12     res_title = re.findall(pat_title_son,response_son.read())
13     for c_item in res_title:
14         title_son.append(c_item)
15 
16     # Isolate the first <ul>...</ul> list; the same category page is requested a second time here
17     pat_code_son = re.compile('<ul>(.*?)</ul>',re.S)
18     middle_pattern = urllib2.Request(url_menu,headers = headers)
19     middle_response = urllib2.urlopen(middle_pattern)
20     res_code_son = re.search(pat_code_son,middle_response.read())
21     
22     # Links of the picture pages inside that list; the full-size page links are built from them
23     pat_link_son = re.compile('<li><a href="(.*?)" target="_blank"><img',re.S)
24     res_link = re.findall(pat_link_son,res_code_son.group(1))

 如下圖所示:

4.根據上一步爬取到的鏈接,合成真正的1080p壁紙鏈接.

因為我們從上圖標題點進去後是這樣:

還需要點擊下載按鈕才能打開1080p壁紙的鏈接。為了方便,我們直接合成1080p壁紙的鏈接.

例如: http://www.netbian.com/desk/9805.htm

對應的1080p網址:http://www.netbian.com/desk/9805-1920x1080.htm

代碼:

 1     i = 0  # index of the current picture in title_son
 2     # Show progress: print the title of the current category
 3     print big_title[j]
 4     for d_item in res_link:
 5         # Build the link of the 1920x1080 page; the one link pointing to another site is skipped
 6         if d_item == 'http://www.mmmwu.com/':
 7             pass
 8         else:
 9             new_link = 'http://www.netbian.com/' + d_item[:-4] + '-1920x1080.htm'  # [:-4] drops the trailing ".htm"
10             print new_link

(ps:由於‘美女’分類中的第一個標題鏈接到了其他網站,為了簡單一點,所以我直接跳過了)

5.進入1080p壁紙鏈接,下載壁紙.

 1 request_real = urllib2.Request(new_link,headers = headers)  # NOTE(review): the listing dropped this line's indentation; in the full source it sits at the same level as the next line
 2             response_real = urllib2.urlopen(request_real)
 3             pat_real = re.compile('<img src="(.*?)" alt=".*?"/></td></tr>')
 4             
 5             link_real = re.search(pat_real,response_real.read())
 6             # Skip VIP wallpapers (link_real is None when the page has no matching <img> tag)
 7             if link_real:
 8                 fina_link = link_real.group(1)
 9                 # Create the download folder of the current category
10                 path_final = 'd:\\彼岸壁紙\\' + big_title[j] + '\\'
11                 if not os.path.isdir(path_final):
12                     os.makedirs(path_final)
13                 path_pic = path_final + title_son[i] + '.jpg'
14                 f = open(path_pic,'wb')
15                 data = urllib.urlopen(fina_link)
16                 f.write(data.read())
17                 f.close()
18                 if not data:  # NOTE(review): data is the object returned by urlopen, so this test looks always false - confirm
19                     print "Download Failed."
20             i += 1
21     print 'One menu download OK.'
22     j += 1

6.下載完成.

二、所有的源代碼。

 1 #coding=gbk
 2 #目標:下載各目錄的壁紙(大圖)
 3 __author__ = 'CQC'
 4 import urllib2
 5 import urllib
 6 import re
 7 import os
 8 
 9 #創建壁紙下載文件夾
10 path = 'd:\\彼岸壁紙'
11 if not os.path.isdir(path):
12     os.makedirs(path)
13 #目錄
14 big_title = []
15 
16 #首頁打開
17 url = 'http://www.netbian.com/' 
18 headers = {'User-agent' : 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}
19 request = urllib2.Request(url,headers = headers)
20 response = urllib2.urlopen(request)
21 
22 #首頁目錄源代碼獲取
23 pat_menu = re.compile('<ul class="menu">(.*?)</a></div>',re.S)
24 code_menu = re.search(pat_menu,response.read())
25 
26 #目錄標題
27 pat_menu_title = re.compile('<a href=".*?" title="(.*?)">',re.S)
28 menu_title = re.findall(pat_menu_title,code_menu.group(1))
29 for a_item in menu_title:
30     big_title.append(a_item)
31     print a_item
32     
33 #目錄鏈接
34 pat_menu_link = re.compile('<a href="(.*?)" title=".*?">',re.S)
35 menu_link = re.findall(pat_menu_link,code_menu.group(1))
36 
37 #進入目錄
38 j = 0
39 for b_item in menu_link:
40     url_menu = 'http://www.netbian.com/' + b_item
41     request_son = urllib2.Request(url_menu,headers = headers)
42     response_son = urllib2.urlopen(request_son)
43     #獲得每個目錄的圖片標題,鏈接
44     
45     #獲得子目錄標題
46     title_son = []
47     pat_title_son = re.compile('<img src=".*?" data-src=".*?" alt="(.*?)"/>',re.S)
48     res_title = re.findall(pat_title_son,response_son.read())
49     for c_item in res_title:
50         title_son.append(c_item)
51 
52     #篩選出子目錄代碼
53     pat_code_son = re.compile('<ul>(.*?)</ul>',re.S)
54     middle_pattern = urllib2.Request(url_menu,headers = headers)
55     middle_response = urllib2.urlopen(middle_pattern)
56     res_code_son = re.search(pat_code_son,middle_response.read())
57     
58     #獲得子目錄鏈接,合成大圖網頁鏈接
59     pat_link_son = re.compile('<li><a href="(.*?)" target="_blank"><img',re.S)
60     res_link = re.findall(pat_link_son,res_code_son.group(1))
61     i = 0
62     #顯示進度
63     print big_title[j]
64     for d_item in res_link:
65         #獲得大圖下載鏈接
66         if d_item == 'http://www.mmmwu.com/':
67             pass
68         else:
69             new_link = 'http://www.netbian.com/' + d_item[:-4] + '-1920x1080.htm'
70             print new_link
71             request_real = urllib2.Request(new_link,headers = headers)
72             response_real = urllib2.urlopen(request_real)
73             pat_real = re.compile('<img src="(.*?)" alt=".*?"/></td></tr>')
74             
75             link_real = re.search(pat_real,response_real.read())
76             #跳過vip壁紙
77             if link_real:
78                 fina_link = link_real.group(1)
79                 #創建下載目錄
80                 path_final = 'd:\\彼岸壁紙\\' + big_title[j] + '\\'
81                 if not os.path.isdir(path_final):
82                     os.makedirs(path_final)
83                 path_pic = path_final + title_son[i] + '.jpg'
84                 f = open(path_pic,'wb')
85                 data = urllib.urlopen(fina_link)
86                 f.write(data.read())
87                 f.close()
88                 if not data:
89                     print "Download Failed."
90             i += 1
91     print 'One menu download OK.'
92     j += 1

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM