#!/usr/bin/python
# -*- coding: utf-8 -*-
# A simple scraping crawler
# 1. Scrapes Yahoo! Answers; adapt the parseData function and it can scrape any site
# 2. Requires sqlite3 or pysqlite
# 3. Can run on DreamHost.com shared hosting
# 4. The User-Agent can be changed to impersonate a search-engine spider
# 5. A pause interval can be configured to throttle the crawl speed
# 6. Yahoo bans the crawler's IP for several hours, so this scraper is of limited use
# Author: Lukin<mylukin@gmail.com>
# Date : 2008-09-25

# Modules the crawler needs
import re, sys, time
import httplib, os.path as osp
from urlparse import urlparse

# Use SQLite; fall back to pysqlite2 for compatibility with DreamHost.com hosting
try:
    import sqlite3 as sqlite
except ImportError:
    from pysqlite2 import dbapi2 as sqlite

# Crawl throttle: pause between requests, in seconds
sleep = 0
# Database path
dbname = './database.db'
# Request headers; the User-Agent impersonates Googlebot
headers = {
    "Accept": "*/*",
    "Referer": "http://answers.yahoo.com/",
    "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
}
# Connect to the server
dl = httplib.HTTPConnection('answers.yahoo.com')
# Connect to the database
conn = sqlite.connect(osp.abspath(dbname))

# Create the database schema. Every statement uses IF NOT EXISTS, so this is
# safe to call on an existing database (sqlite.connect above already creates
# the file, so a file-existence check here would always skip table creation).
def createDatabase():
    global conn
    c = conn.cursor()
    # URL queue table
    c.execute('''CREATE TABLE IF NOT EXISTS [collect]([cid] INTEGER PRIMARY KEY,[curl] TEXT,[state] INTEGER DEFAULT '0',UNIQUE([curl]));''')
    c.execute('''CREATE INDEX IF NOT EXISTS [collect_idx_state] ON [collect]([state]);''')
    # Category table
    c.execute('''CREATE TABLE IF NOT EXISTS [sorts]([sortid] INTEGER PRIMARY KEY,[sortname] TEXT,[sortpath] TEXT,[sortfoot] INTEGER DEFAULT '0',[sortnum] INTEGER DEFAULT '0',UNIQUE([sortpath]));''')
    c.execute('''CREATE INDEX IF NOT EXISTS [sorts_idx_sortname] ON [sorts]([sortname]);''')
    c.execute('''CREATE INDEX IF NOT EXISTS [sorts_idx_sortfoot] ON [sorts]([sortfoot]);''')
    # Article table
    c.execute('''CREATE TABLE IF NOT EXISTS [article]([aid] INTEGER PRIMARY KEY,[sortid] INTEGER DEFAULT '0',[hits] INTEGER DEFAULT '0',[title] TEXT,[path] TEXT,[question] TEXT,[banswer] TEXT,[oanswer] TEXT,UNIQUE([path]));''')
    c.execute('''CREATE INDEX IF NOT EXISTS [article_idx_sortid] ON [article]([sortid]);''')
    # Commit the transaction
    conn.commit()
    c.close()

# Fetch one URL, retrying up to two more times on non-200 responses
def collect(url="http://answers.yahoo.com/"):
    global dl, headers
    R = 0
    print "GET:", url
    urls = urlparse(url)
    path = urls[2]
    if urls[4] != '':
        path += '?' + urls[4]
    for attempt in range(3):
        dl.request(method="GET", url=path, headers=headers)
        rs = dl.getresponse()
        # Always read the body, or httplib refuses to reuse the connection
        html = rs.read()
        if rs.status == 200:
            R = parseData(html, url)
            break
        elif attempt < 2:
            print "3 seconds, try again ..."
            time.sleep(3)
        else:
            print "Continue to collect ..."
            R = 3
    # Record the crawl state for this URL
    updateOneUrl(url, R)
    # Return the result
    return R
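
# State codes kept in collect.[state], as assigned by collect()/parseData():
#   0 = queued, not fetched yet
#   1 = fetched and an article was saved
#   2 = fetched and parsed, but no article stored (index pages, no best answer)
#   3 = fetch failed three times (stays eligible for retry, see getOneUrl)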

# Parse a fetched page: queue its links, then extract article data
def parseData(html, url):
    global conn
    R = 2
    c = conn.cursor()
    # Normalize the HTML: strip whitespace, absolutize every link
    page = formatURL(clearBlank(html), url)
    # Extract all links (double- or single-quoted href)
    urls = re.findall(r'''(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href='([^']+)'[^>]*?>)''', page, re.I)
    i = 0
    # Loop over every link
    for regs in urls:
        # Pick the URL from whichever quote style matched
        sUrl = en2chr((regs[1] or regs[3]).strip())
        # If the URL matches the crawl rules, queue it in the database
        if re.search('http(.*?)/(dir|question)/index(.*?)', sUrl, re.I) != None:
            if re.search('http(.*?)/dir/index(.*?)', sUrl, re.I) != None:
                if sUrl.find('link=list') == -1 and sUrl.find('link=over') == -1:
                    sUrl += '&link=over'
                else:
                    sUrl = sUrl.replace('link=list', 'link=over')
            if sUrl[-11:] == 'link=mailto':
                continue
            try:
                c.execute('INSERT INTO [collect]([curl])VALUES(?);', (sUrl,))
                i = i + 1
            except sqlite.IntegrityError:
                pass
    if i > 0:
        print "Message: %d new URLs queued." % (i,)
    # Extract article data from question pages
    if re.search('http(.*)/question/index(.*)', url, re.I) != None:
        sortfoot = 0
        # Auto-create categories and their parent relations from the breadcrumb
        guide = sect(page, '<ol id="yan-breadcrumbs">', '</ol>', '(<li>(.*?)Home(.*?)</li>)')
        aGuide = re.findall('<a[^>]*href="[^"]*"[^>]*>(.*?)</a>', guide or '', re.I)
        sortname = ""
        for sortname in aGuide:
            sortname = sortname.strip()
            sortpath = en2path(sortname)
            # Check whether the category already exists
            c.execute('SELECT [sortid],[sortname] FROM [sorts] WHERE [sortpath]=? LIMIT 0,1;', (sortpath,))
            row = c.fetchone()
            # Category missing: add it, parented to the previous crumb
            if row == None:
                c.execute('INSERT INTO [sorts]([sortname],[sortpath],[sortfoot])VALUES(?,?,?);', (sortname, sortpath, sortfoot))
                sortfoot = c.lastrowid
            else:
                sortfoot = row[0]
        # Title
        title = sect(page, '<h1 class="subject">', '</h1>')
        # Best answer
        BestAnswer = sect(page, '(<h2><span>Best Answer</span>(.*?)</h2>(.*?)<div class="content">)', '(</div>)')
        # Skip the page when there is no best answer (or no title)
        if BestAnswer != None and title != None:
            # Article path (slug)
            path = en2path(sortname + '-' + title.strip())
            # Question body, plus any additional details
            adddata = sect(page, '<div class="additional-details">', '</div>')
            content = sect(page, '(<h1 class="subject">(.*?)<div class="content">)', '(</div>)')
            if adddata != None and content != None:
                content += '<br/>' + adddata
            # Other answers
            OtherAnswer = ''
            for regs in re.findall('<div class="qa-container">(.+?)<div class="utils-container">', page):
                if regs.find('<h2>') == -1 and regs.find('</h2>') == -1:
                    a1 = sect(regs, '<div class="content">', '</div>')
                    a2 = sect(regs, '<div class="reference">', '</div>')
                    if a1 != None:
                        OtherAnswer += '<div class="oAnswer">' + a1
                        if a2 != None:
                            OtherAnswer += '<div class="reference">' + a2 + '</div>'
                        OtherAnswer += '</div>'
            # Store the article if extraction succeeded
            if content != None:
                try:
                    c.execute('INSERT INTO [article]([sortid],[title],[path],[question],[banswer],[oanswer])VALUES(?,?,?,?,?,?);', (sortfoot, title, path, content, BestAnswer, OtherAnswer))
                    print "Message:%s.html" % (path,)
                    R = 1
                except sqlite.IntegrityError:
                    pass
    # Commit writes
    conn.commit()
    c.close()
    return R

# Fetch one pending URL from the queue (states 0 and 3 are eligible)
def getOneUrl():
    global conn
    c = conn.cursor()
    c.execute('SELECT [curl] FROM [collect] WHERE [state] IN(0,3) LIMIT 0,1;')
    row = c.fetchone()
    c.close()
    if row == None:
        return ""
    return row[0].encode('utf-8')

# Update the state of one queue record
def updateOneUrl(url, state):
    global conn
    c = conn.cursor()
    c.execute('UPDATE [collect] SET [state]=? WHERE [curl]=?;', (state, url))
    conn.commit()
    c.close()
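
# A minimal sketch (defined here but never called) of one pass over the queue;
# the main loop at the bottom of this file performs the same round-trip:
def _queueDemo():
    url = getOneUrl()           # '' when no row has state 0 or 3
    if url != "":
        state = collect(url)    # collect() already calls updateOneUrl(url, state)
        print "Message: %s -> state %d" % (url, state)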

# Collapse redundant whitespace in HTML
def clearBlank(html):
    if len(html) == 0:
        return ''
    html = re.sub('\r|\n|\t', '', html)
    while html.find('  ') != -1:
        html = html.replace('  ', ' ')
    return html

# Absolutize every link URL in the HTML
def formatURL(html, url):
    urls = re.findall(r'''(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href='([^']+)'[^>]*?>)''', html, re.I)
    for regs in urls:
        # The tag is in group 0 (double-quoted href) or group 2 (single-quoted)
        tag = regs[0] or regs[2]
        html = html.replace(tag, matchURL(tag, url))
    return html

# Absolutize a single URL inside a tag, relative to the page URL
def matchURL(tag, url):
    urls = re.findall(r'''(.*)(src|href)=(.+?)( |/>|>).*|(.*)url\(([^\)]+)\)''', tag, re.I)
    if not urls:
        return tag
    if urls[0][5] == '':
        urlQuote = urls[0][2]
    else:
        urlQuote = urls[0][5]
    if len(urlQuote) > 0:
        cUrl = re.sub('''['"]''', '', urlQuote)
    else:
        return tag
    urls = urlparse(url)
    scheme = urls[0]
    if scheme != '':
        scheme += '://'
    host = scheme + urls[1]
    if len(host) == 0:
        return tag
    path = osp.dirname(urls[2])
    if path == '/':
        path = ''
    if cUrl.find("#") != -1:
        cUrl = cUrl[:cUrl.find("#")]
    # Classify the URL
    if re.search(r'''^(http|https|ftp):(//|\\\\)(([\w/\\\+\-~`@:%])+\.)+([\w/\\\.\=\?\+\-~`@':!%#]|(&amp;)|&)+''', cUrl, re.I) != None:
        # Already absolute (http/https/ftp): leave the tag as-is
        return tag
    elif cUrl[:1] == '/':
        # Root-relative path
        cUrl = host + cUrl
    elif cUrl[:3] == '../':
        # Parent-relative path: strip '../' segments while walking up the path
        while cUrl[:3] == '../':
            cUrl = cUrl[3:]
            if len(path) > 0:
                path = osp.dirname(path)
        cUrl = host + path + '/' + cUrl
    elif cUrl[:2] == './':
        cUrl = host + path + cUrl[1:]
    elif cUrl.lower()[:7] == 'mailto:' or cUrl.lower()[:11] == 'javascript:':
        return tag
    else:
        cUrl = host + path + '/' + cUrl
    return tag.replace(urlQuote, '"' + cUrl + '"')

# Slice a fragment out of HTML, between a start and an end marker
def sect(html, start, end, cls=''):
    if len(html) == 0:
        return
    # Regex mode: both markers wrapped in parentheses
    if start[:1] == '(' and start[-1:] == ')' and end[:1] == '(' and end[-1:] == ')':
        reHTML = re.search(start + '(.*?)' + end, html, re.I)
        if reHTML == None:
            return
        reHTML = reHTML.group()
        intStart = re.search(start, reHTML, re.I).end()
        intEnd = re.search(end, reHTML, re.I).start()
        R = reHTML[intStart:intEnd]
    # Plain-string mode
    else:
        # Position of the start marker
        intStart = html.lower().find(start.lower())
        # Start marker not found: return None
        if intStart == -1:
            return
        # Position of the end marker, searching after the start marker
        intEnd = html[intStart+len(start):].lower().find(end.lower())
        # End marker not found: return None
        if intEnd == -1:
            return
        # Both markers found: slice between them
        R = html[intStart+len(start):intStart+intEnd+len(start)]
    # Clean the content
    if cls != '':
        R = clear(R, cls)
    # Return the slice
    return R

# Strip content: regexs holds one pattern per line; parenthesized lines are
# treated as regular expressions, everything else as literal strings
def clear(html, regexs):
    if regexs == '':
        return html
    for regex in regexs.split('\n'):
        regex = regex.strip()
        if regex != '':
            if regex[:1] == '(' and regex[-1:] == ')':
                # Flags must be compiled in: re.sub's 4th positional
                # argument is a replacement count, not a flags mask
                html = re.compile(regex, re.I | re.S).sub('', html)
            else:
                html = html.replace(regex, '')
    return html

# Convert a string into a URL-safe path slug
def en2path(enStr):
    return re.compile(r'[\W]+', re.I | re.U).sub('-', en2chr(enStr)).strip('-')

# Decode HTML entities back to plain characters
def en2chr(enStr):
    return enStr.replace('&amp;', '&')

# ------------------------------------- Main program -------------------------------------------
# Create the database first
createDatabase()
# Start crawling from the homepage, then drain the queue
loops = 0
while True:
    if loops > 0:
        url = getOneUrl()
        if url == "":
            loops = 0
        else:
            loops = collect(url)
    else:
        loops = collect()
    # Throttle
    time.sleep(sleep)
    if loops == 0:
        break
# Close the HTTP connection
dl.close()
# Exit
sys.exit()
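
# Worked examples for the helpers above, traced by hand against the code
# (the sample markup and URLs are made up for illustration):
#
#   sect('<b>x</b>', '<b>', '</b>')                          -> 'x'
#   sect('<div id="a">x</div>', '(<div[^>]*>)', '(</div>)')  -> 'x'
#   en2path('Cars & Transportation')                         -> 'Cars-Transportation'
#   matchURL('<a href="../faq.html">',
#            'http://answers.yahoo.com/dir/index/')
#       -> '<a href="http://answers.yahoo.com/dir/faq.html">'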