工作中需要判断某个文本中的URL是否能正常访问,并且随机获取其中N行能正常访问的URL数据,我的思路是:读取文本每一行数据,用urlopen访问,将返回状态码为200的URL保存到一个列表,获得列表长度,使用random产生一个随机值作为列表下标,获取该行数据。具体实现如下:
"""Check which URLs in a text file respond with HTTP 200 and sample N good ones.

Reads one URL per line, probes each with an HTTP GET, collects reachable
URLs in ``good_list`` and failures in ``bad_list``, then writes a random
sample of the good URLs to an output file.
"""
import random
import urllib.error
import urllib.request

# Module-level so the functions also work when imported,
# not only when run as a script under the __main__ guard.
good_list = []
bad_list = []


def get_responses(url):
    """Probe *url* with an HTTP GET and file it into good_list or bad_list.

    Returns 1 when the server answers with status 200, 0 otherwise
    (request failure or any non-200 status).
    """
    global good_list
    global bad_list
    # Bug fix: the original assigned http_url only when the prefix was
    # missing, so a URL already starting with "http:" raised NameError.
    if not url.startswith("http:"):
        http_url = "http://" + url
    else:
        http_url = url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',}
    try:
        request = urllib.request.Request(http_url, headers=headers)
        # Context manager closes the connection (the original leaked it);
        # timeout keeps one dead host from stalling the whole run.
        with urllib.request.urlopen(request, timeout=10) as resp:
            retcode = resp.getcode()
        print(url)
    except urllib.error.URLError as e:
        print(e)
        bad_list.append(url)
        return 0

    if retcode == 200:
        good_list.append(url)
        return 1
    bad_list.append(url)
    return 0


def readFile(path=r'C:\Users\888\Desktop\urls.txt'):
    """Read URLs (one per line) from *path* and classify each via get_responses.

    *path* is now a parameter (defaulting to the original hard-coded
    location) so the function is reusable and testable.
    """
    try:
        # Bug fix: the original printed the IOError message but then fell
        # through and crashed with NameError on the undefined file object;
        # the with-statement also guarantees the file is closed.
        with open(path, 'r') as urllist:
            for item in urllist:
                get_responses(item.strip('\n'))
    except IOError:
        print("file does not exist.\n")
        return
    print("Total URLs: %d, Good URLs:%d, Bad URLs: %d." % ((len(good_list) + len(bad_list)), len(good_list), len(bad_list)))


def writeFile(linenum, path=r'C:\Users\888\Desktop\goodurl.txt'):
    """Write up to *linenum* distinct, randomly chosen good URLs to *path*.

    Bug fix: the original randint-plus-Set loop never terminated when
    linenum exceeded the number of distinct good URLs (and crashed on an
    empty good_list).  random.sample draws distinct entries directly, and
    the requested count is capped at len(good_list).
    """
    count = min(int(linenum), len(good_list))
    result = random.sample(good_list, count)

    try:
        with open(path, 'w+') as goodurl:
            for item in result:
                goodurl.write(item + '\n')
    except IOError:
        print("file does not exist.\n")
        return

    print("The mission is done, Please check the goodurl.txt file")


if __name__ == "__main__":
    readFile()
    writeFile(150)