After a few days of studying and experimenting, I've picked up some small insights into Python crawlers. They turn out to share a lot in common: fetch a batch of links, read the page source, extract what you need, then repeat. Once the routine becomes familiar, it's natural to summarize those common patterns and write a helper class to avoid the repetitive work.
1. Accessing a site # the simplest way to fetch a page's source
import urllib2
response = urllib2.urlopen("http://www.xx.com")
print response.read()
2. Masquerading as a browser (User-Agent, Referer, etc.) # to avoid being blocked by the server, it's best to pretend to be a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Referer': 'http://www.xx.com/xx',
    'Accept': 'application/javascript, */*;q=0.8'
}
req = urllib2.Request(url="http://www.xx.com", data=None, headers=headers)
response = urllib2.urlopen(req)
3. Encoding POST data
import urllib, urllib2
values = {
    'username': 'xxx',
    'password': 'xxx',
    'key': 'xxx'
}
postdata = urllib.urlencode(values)
req = urllib2.Request(url, data=postdata)
response = urllib2.urlopen(req)
4. Cookies
import urllib2, cookielib
cookie_jar = cookielib.CookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cookie_jar)
opener = urllib2.build_opener(cookie_handler)
urllib2.install_opener(opener)
response = urllib2.urlopen(url)
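Once the opener is installed, the jar fills up automatically as requests go through it. A minimal sketch of inspecting it (assuming the cookie_jar variable from above):

for cookie in cookie_jar:  # a CookieJar is iterable
    print cookie.name, '=', cookie.value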
5. Proxy servers # after hitting the same URL too many times, you may find your IP banned or your requests rate-limited
import urllib2
proxy_handler = urllib2.ProxyHandler({"http": '42.121.6.80:8080'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)
response = urllib2.urlopen(url)
Q: What if I want to use cookies and a proxy together?
A: urllib2.build_opener accepts any number of handler arguments, e.g. BaseHandler, ProxyHandler, HTTPHandler, FileHandler, FTPHandler, CacheFTPHandler, and so on.
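A minimal sketch of chaining both handlers (the proxy address is just the placeholder from above):

import urllib2, cookielib
cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
proxy_handler = urllib2.ProxyHandler({"http": '42.121.6.80:8080'})
opener = urllib2.build_opener(cookie_handler, proxy_handler)  # pass any number of handlers
urllib2.install_opener(opener)
response = urllib2.urlopen(url)  # goes through the proxy and carries cookies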
6. gzip # gzip compression is widely supported nowadays; fetching the compressed page by default greatly improves crawl efficiency and reduces bandwidth load
import urllib2, zlib
req = urllib2.Request(url)
req.add_header('Accept-encoding', 'gzip')
response = urllib2.urlopen(req, timeout=120)
html = response.read()
gzipped = response.headers.get('Content-Encoding')
if gzipped:
    html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
7. Miscellaneous
Setting the thread stack size: the stack size significantly affects Python's memory footprint when running many threads. It can be set like this:
from threading import stack_size
stack_size(32768 * 16)
Setting a timeout
import socket
socket.setdefaulttimeout(10)  # connections time out after 10 seconds
Retrying on failure
def get(self, req, retries=3):
    try:
        response = self.opener.open(req)
        data = response.read()
    except Exception, what:
        print what, req
        if retries > 0:
            return self.get(req, retries - 1)
        else:
            print 'GET Failed', req
            return ''
    return data
Based on all of the above, we can write our own easily configurable helper class to take care of the repetitive work:

# -*- coding: utf-8 -*-
import cookielib, urllib, urllib2, socket
import zlib, StringIO

class HttpClient:
    __cookie = cookielib.CookieJar()
    # proxy settings; enable when needed (later: switch between multiple proxies)
    #__proxy_handler = urllib2.ProxyHandler({"http" : '42.121.6.80:8080'})
    __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie))  #, __proxy_handler)
    __req.addheaders = [
        ('Accept', 'application/javascript, */*;q=0.8'),
        ('User-Agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)')
    ]
    urllib2.install_opener(__req)

    def Get(self, url, refer=None, retries=3):
        try:
            req = urllib2.Request(url)
            req.add_header('Accept-encoding', 'gzip')
            if not (refer is None):
                req.add_header('Referer', refer)
            response = urllib2.urlopen(req, timeout=120)
            html = response.read()
            gzipped = response.headers.get('Content-Encoding')
            if gzipped:
                html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            return html
        except Exception, what:
            print what
            if retries > 0:
                return self.Get(url, refer, retries - 1)
            else:
                print "Get Failed", url
                return ''
        #except urllib2.HTTPError, e:
        #    return e.read()
        #except socket.timeout, e:
        #    return ''
        #except socket.error, e:
        #    return ''

    def Post(self, url, data, refer=None):
        try:
            req = urllib2.Request(url, urllib.urlencode(data))
            #req = urllib2.Request(url, data)
            if not (refer is None):
                req.add_header('Referer', refer)
            return urllib2.urlopen(req, timeout=120).read()
        except urllib2.HTTPError, e:
            return e.read()
        except socket.timeout, e:
            return ''
        except socket.error, e:
            return ''

    def Download(self, url, file):
        output = open(file, 'wb')
        output.write(urllib2.urlopen(url).read())
        output.close()

    def getCookie(self, key):
        for c in self.__cookie:
            if c.name == key:
                return c.value
        return ''

    def setCookie(self, key, val, domain):
        ck = cookielib.Cookie(version=0, name=key, value=val, port=None, port_specified=False, domain=domain, domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        self.__cookie.set_cookie(ck)
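A quick usage sketch of the class (the URLs and the cookie name are placeholders):

hc = HttpClient()
html = hc.Get('http://www.xx.com/', refer='http://www.xx.com/')
result = hc.Post('http://www.xx.com/login', {'username': 'xxx', 'password': 'xxx'})
hc.Download('http://www.xx.com/logo.png', 'logo.png')
print hc.getCookie('session_id')  # hypothetical cookie name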
As for multithreading, this snippet I found online will do; it handles concurrent workers too...

from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the total number of concurrent threads
# JOBS is the number of tasks
q = Queue()
NUM = 2
JOBS = 10

# the actual handler, responsible for processing a single task
def do_somthing_using(arguments):
    print arguments

# the worker loop: keep pulling tasks off the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_somthing_using(arguments)
        sleep(1)
        q.task_done()

# fork NUM threads to wait on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# enqueue the JOBS tasks
for i in range(JOBS):
    q.put(i)

# wait for all JOBS to finish
q.join()
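To tie the pieces together, here is a hedged sketch that feeds URLs through the same kind of queue into the HttpClient above (the thread count and URLs are made up):

from threading import Thread
from Queue import Queue

q = Queue()
hc = HttpClient()  # the helper class defined earlier

def crawl():
    while True:
        url = q.get()
        html = hc.Get(url)
        print url, len(html)
        q.task_done()

for i in range(4):  # 4 worker threads, an arbitrary choice
    t = Thread(target=crawl)
    t.setDaemon(True)
    t.start()

for n in range(10):
    q.put('http://www.xx.com/page/%d' % n)  # placeholder URLs
q.join()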
That wraps up crawlers for now. Deeper topics like crawler frameworks and HTML parsing libraries will have to wait while I mull over what comes next: pygame or django!
The crawler demo's repository (I'm just learning to play with git): http://git.oschina.net/tabei/Python_spider