#!/usr/bin/python
#-*- coding:utf-8 -*-
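'''
This script generates page hits for a list of URLs. Besides that main
feature it demonstrates three techniques:
1. Pick a proxy IP at random and visit the target site through it, so that
   the caller's own IP is not blocked.
2. After visiting a page, sleep for a random few seconds before the next
   visit, so that layer 4-7 filtering devices in front of the site are less
   likely to intercept the traffic.
3. Change the HTTP User-Agent header, because some sites and layer 4-7
   devices inspect it.
'''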
import urllib2,re,time,urllib,random,user_agents
# URL of the proxy-list page that getProxyIP downloads and scrapes.
# The percent-encoded query values are UTF-8 Chinese text; they appear to
# filter by carrier (adr), response time within 1 second (sleep) and uptime of
# 48 hours or more (cunhuo) -- NOTE(review): confirm against the site itself.
PROXYIPURL = 'http://www.goodips.com/?ip=&port=&dengji=&adr=%E7%94%B5%E4%BF%A1&checktime=&sleep=1%E7%A7%92%E5%86%85&cunhuo=48%E5%B0%8F%E6%97%B6%E4%BB%A5%E4%B8%8A&px='
class getProxyIP:
    """Scrape the proxy-list page and format the proxies for urllib2.

    The page at ``PROXYIPURL`` is downloaded, ``(ip, port)`` pairs are
    extracted with a regular expression, and each pair is turned into a
    ``{'http': 'http://ip:port'}`` mapping, the shape that
    ``urllib2.ProxyHandler`` accepts.
    """

    # An IPv4-looking token, the remainder of that line, then on the next
    # line an element whose text is a 1-5 digit port (``>port<``).
    # Octet ranges are not validated (e.g. 999.1.1.1 would match).
    _IP_PORT_RE = re.compile(
        r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).+\n.+>(\d{1,5})<')

    def getProxyHtml(self):
        """Download the proxy-list page and return its raw HTML."""
        page = urllib.urlopen(PROXYIPURL)
        try:
            return page.read()
        finally:
            # Always release the connection, even if read() raises.
            page.close()

    def ipPortRe(self):
        """Return the ``(ip, port)`` string tuples found in the page.

        The list is empty when the page contains no match.
        """
        return self._IP_PORT_RE.findall(self.getProxyHtml())

    def proxyIP(self):
        """Return the scraped proxies as ``{'http': 'http://ip:port'}`` dicts.

        Example result::

            [{'http': 'http://221.238.28.158:8081'},
             {'http': 'http://183.62.62.188:9999'}]
        """
        return [{'http': 'http://%s:%s' % (ip, port)}
                for ip, port in self.ipPortRe()]
def getHtml(url):
    """Request ``url`` once through a random proxy with a random User-Agent.

    A fresh proxy list is scraped on every call via ``getProxyIP``. One
    entry is chosen at random and installed as the process-wide urllib2
    opener, then the page is fetched with a User-Agent picked at random from
    ``user_agents.user_agents``. The response body is read and discarded.

    Args:
        url: address of the page to request.

    Returns:
        The proxy mapping that was used, e.g.
        ``{'http': 'http://221.238.28.158:8081'}``.

    Raises:
        IndexError: if the scraped proxy list is empty (``random.choice``).
        urllib2.URLError: if the request fails; ``urllib2.HTTPError`` is a
            subclass of it.
    """
    proxy_list = getProxyIP().proxyIP()
    proxy_ip = random.choice(proxy_list)  # pick one proxy at random
    print(proxy_ip)

    proxy_support = urllib2.ProxyHandler(proxy_ip)
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    # Installed globally, so later urllib2.urlopen() calls use this proxy too.
    urllib2.install_opener(opener)

    request = urllib2.Request(url)
    user_agent = random.choice(user_agents.user_agents)  # random User-Agent
    request.add_header('User-Agent', user_agent)
    print(user_agent)

    response = urllib2.urlopen(request)
    try:
        # The body is downloaded in full but its content is not needed.
        response.read()
    finally:
        # Close the connection even if the download fails part-way.
        response.close()

    # Printed a second time only after the request has succeeded.
    print(proxy_ip)
    return proxy_ip
# Pages requested on every pass of the main loop.
# NOTE(review): the host names were redacted in the original text with stray
# quote characters that made these literals a syntax error; the quotes are
# removed here, but the hosts are still placeholders and must be replaced.
URLS = [
    'http://www.xxxxw.net/study.asp?vip=',
    'http://www.xxxxxx.com/?fromuid=16',
]

# Running tallies: successful requests, failed requests, total attempts.
count_True, count_False, count = 0, 0, 0

# Runs until the process is interrupted.
while True:
    for url in URLS:
        count += 1
        try:
            getHtml(url)
        except Exception:
            # Best effort: any failure (dead proxy, HTTP error, empty proxy
            # list) only counts as an error. Catching Exception instead of
            # using a bare except lets Ctrl-C and SystemExit stop the loop.
            count_False += 1
        else:
            count_True += 1
        # Pause a random 1-3 seconds between requests.
        time.sleep(random.uniform(1, 3))
        print('%d Eroors,%d ok,總數 %d' % (count_False, count_True, count))
# ---------------------------------------------------------------------------
# NOTE(review): everything below appears to be a second module,
# user_agents.py (the script imports `user_agents` and reads
# `user_agents.user_agents`). It was pasted here with a line-number gutter,
# which has been stripped; it should live in its own file.
# ---------------------------------------------------------------------------
#!/usr/bin/python
#-*- coding:utf-8 -*-
'''
Created on 2013-7-14

@author: Administrator
'''

# Pool of User-Agent header values; callers pick one at random per request.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]