Implementing a web crawler in Python to download Tianya forum posts


I recently discovered that Tianya (天涯論壇) is a pretty entertaining forum, full of all kinds of odd threads that are perfect for killing time. The one genuinely annoying thing is its pagination: reading the original poster's content straight through is painful. In a 999-page thread, ninety percent of the posts are idle replies from other users, and sometimes you have to flip through dozens of pages to find a single post by the original poster. So, with nothing better to do, I decided to write a simple crawler that downloads all of the original poster's content in a given thread in one go. Enough rambling; let's get to the point.

Page URLs take the form http://bbs.tianya.cn/post-no05-355576-1.shtml, where the 1 before .shtml means this is the first page of the thread. From the first page we can parse out the maximum page number, then visit and parse every page in turn to collect everything the original poster wrote.
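
As a concrete illustration, here is a minimal sketch of that substitution; the helper name make_page_url is my own invention (the real code below does the same re.sub inline inside the page thread):

import re

# Hypothetical helper, not part of the original code: swap the page number
# in a Tianya post url; download_html_page.run() below does this inline.
def make_page_url(first_page_url, page_number):
    return re.sub(r'-(\d+?)\.shtml', '-%d.shtml' % page_number, first_page_url)

# make_page_url('http://bbs.tianya.cn/post-no05-355576-1.shtml', 3)
# -> 'http://bbs.tianya.cn/post-no05-355576-3.shtml'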

The page source is simple: each content block lives in a div with class atl-content; a commented-out host-ico div inside that block tells you whether it was posted by the original poster; and the body text sits in a div with class bbs-content, which also carries the image links if the post contains pictures. The implementation locates the poster's posts and extracts their content using exactly these two markers.


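To make those two markers concrete, here is a small self-contained sketch; the HTML fragment is hand-written to mirror the structure just described (an assumption on my part — real Tianya pages carry far more attributes), and the two regular expressions are the same ones used in threads.py below:

# -*- coding: utf-8 -*-
import re

# Hand-written stand-in for a Tianya content block (an assumption, not real markup)
sample = '''<div class="atl-content">
  <!-- <div class="host-ico">louzhu</div> -->
  <div class="bbs-content">
    post text<img original="http://img3.laibafile.cn/xxx.jpg" /><br>
  </div>
</div>'''

# Same pattern as parse_page below: group 1 holds the poster marker, group 2 the body
items = re.findall(r'<div class="atl-content">(?P<islouzhu>.+?)'
                   r'<div class="bbs-content.*?">(?P<content>.+?)</div>',
                   sample, re.S)
for head, body in items:
    # Same check as is_louzhu below; prints True for the sample block
    print re.search(r'<!-- <div class="host-ico">(.*?)</div> -->', head, re.S) is not None
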
To make the crawl more efficient, the implementation uses Python's threading module. Below is the threads.py module, which defines the thread that downloads and parses pages, the thread that downloads images, and a thread pool:

import threading
import urllib2
import Queue
import re

thread_lock = threading.RLock()

# Download a page. Tianya serves pages fine even without any request headers, so none are set.
def download_page(html_url):
    try:
        req = urllib2.Request(html_url)
        response = urllib2.urlopen(req)
        page = response.read()
        return page
    except Exception:
        print 'download %s failed' % html_url
        return None

# Download an image. Much like the function above, except that request headers are added:
# testing showed Tianya will not return the real image for links requested without headers like these.
def download_image(image_url, referer):
    try:
        req = urllib2.Request(image_url)
        req.add_header('Host', 'img3.laibafile.cn')
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0')
        req.add_header('Accept', 'image/png,image/*;q=0.8,*/*;q=0.5')
        req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
        req.add_header('Referer', referer)
        req.add_header('Origin', 'http://bbs.tianya.cn')
        req.add_header('Connection', 'keep-alive')
        response = urllib2.urlopen(req)
        image = response.read()
        return image
    except Exception:
        print 'download %s failed' % image_url
        return None

# Thread class that downloads and parses pages
class download_html_page(threading.Thread):
    #name: the thread's name
    #page_range: queue of the page numbers the user asked for
    #page_contents: the original poster's content, filled in after parsing
    #img_urls: links to the images the poster attached, collected during parsing
    #html_url: the url of the input page
    #first_page: the already-downloaded first page, kept around so it is not fetched twice
    def __init__(self, name, page_range, page_contents, img_urls, html_url, first_page):
        threading.Thread.__init__(self)
        self.name = name
        self.page_range = page_range
        self.page_contents = page_contents
        self.img_urls = img_urls

        self.html_url = html_url
        self.first_page = first_page
    
    # Check whether a content block was written by the original poster (the "louzhu")
    def is_louzhu(self, s):
        result = re.search(r'<!-- <div class="host-ico">(.*?)</div> -->', s, re.S)
        return (result is not None)

    # Collect the urls of the poster's images on a page
    def get_img_url(self, s, page_url):
        # Filter out blocks where the poster is replying to other users' comments (I never read the comments)
        is_louzhu_answer = re.search(r'-{15,}<br>', s, re.S)
        if is_louzhu_answer is None:
            imgurl = re.findall(r'<img.*?original="(?P<imgurl>.*?)".*?/><br>', s, flags = re.S)

            url_path = []
            for one_url in imgurl:
                self.img_urls.put(one_url + '|' + page_url)
                path = re.search(r'\w+\.jpg', one_url).group(0)
                url_path.append('img/' + path)

            segments = re.split(r'<img .*?/><br>', s.strip())
            content = segments[0].strip()
            for i in range(len(url_path)):
                content += '\n<img src = "' + url_path[i] + '" />\n<br>'
                content += segments[i+1].strip()
            return content

    # Parse one page and return the poster's content on it
    def parse_page(self, html_page, page_url):
        html_page = html_page.decode('utf-8')
        Items = re.findall(r'<div class="atl-content">(?P<islouzhu>.+?)<div class="bbs-content.*?">(?P<content>.+?)</div>', html_page, re.S)
        page_content = ''

        for item in Items:
            if self.is_louzhu(item[0]):
                one_div = self.get_img_url(item[1], page_url)
                if one_div is not None:
                    page_content += one_div
        return page_content

    def run(self):
        while self.page_range.qsize() > 0:
            page_number = self.page_range.get()
            page_url = re.sub(r'-(\d+?)\.shtml', '-' + str(page_number) + '.shtml', self.html_url)

            page_content = ''
            print 'thread %s is downloading %s' % (self.name, page_url)
            if page_url == self.html_url:
                page_content = self.parse_page(self.first_page, page_url)
            else:
                page = download_page(page_url)
                if page is not None:
                    page_content = self.parse_page(page, page_url)
            # Store the parsed content under its page number; the lock keeps
            # concurrent page threads from clobbering each other's writes.
            thread_lock.acquire()
            self.page_contents[page_number] = page_content
            thread_lock.release()
        self.img_urls.put('finished')

# Image-download thread
class fetch_img(threading.Thread):
    def __init__(self, name, img_urls, download_img):
        threading.Thread.__init__(self)
        self.name = name
        self.img_urls = img_urls
        self.download_img = download_img

    def run(self):
        while True:
            message = self.img_urls.get().split('|')
            img_url = message[0]
            if img_url == 'finished':
                self.img_urls.put('finished')
                break
            else:
                thread_lock.acquire()
                if img_url in self.download_img:
                    thread_lock.release()
                    continue
                else:
                    thread_lock.release()
                    print 'fetching image %s' % img_url
                    referer = message[1]
                    image = download_image(img_url, referer)

                    # download_image returns None on failure; only write data we actually got
                    if image is not None:
                        image_name = re.search(r'\w+\.jpg', img_url).group(0)
                        with open(r'img\%s' % image_name, 'wb') as img:
                            img.write(image)
                    thread_lock.acquire()
                    self.download_img.add(img_url)
                    thread_lock.release()

# A small thread pool that owns both kinds of worker threads
class thread_pool:
    def __init__(self, page_range, page_contents, html_url, first_page):
        self.page_range = page_range
        self.page_contents = page_contents
        self.img_urls = Queue.Queue()
        self.html_url = html_url
        self.first_page = first_page
        self.download_img = set()
        
        self.page_thread_pool = []
        self.image_thread_pool = []
        
    def build_thread(self, page, image):
        for i in range(page):
            t = download_html_page('page thread%d' % i, self.page_range, self.page_contents,
                                    self.img_urls, self.html_url, self.first_page)
            self.page_thread_pool.append(t)
        for i in range(image):
            t = fetch_img('image thread%d' % i, self.img_urls, self.download_img)
            self.image_thread_pool.append(t)
        
    def all_start(self):
        for t in self.page_thread_pool:
            t.start()
        for t in self.image_thread_pool:
            t.start()
    
    def all_join(self):
        for t in self.page_thread_pool:
            t.join()
        for t in self.image_thread_pool:
            t.join()

Below is the code for the main thread:

# -*- coding: utf-8 -*-  
import re
import Queue
import threads

if __name__ == '__main__':
    html_url = raw_input('enter the url: ')
    html_page = threads.download_page(html_url)

    max_page = 0
    title = ''
    if html_page is not None:
        search_title = re.search(r'<span class="s_title"><span style="\S+?">(?P<title>.+?)</span></span>', html_page, re.S)
        title = search_title.groupdict()['title']

        search_page = re.findall(r'<a href="/post-\S+?-\d+?-(?P<page>\d+?)\.shtml">(?P=page)</a>', html_page, re.S)
        for page_number in search_page:
            page_number = int(page_number)
            if page_number > max_page:
                max_page = page_number
                
    print 'title:%s' % title
    print 'max page number: %s' % max_page
    
    start_page = 0
    while start_page < 1 or start_page > max_page:
        start_page = int(raw_input('input the start page number:'))

    end_page = 0
    while end_page < start_page or end_page > max_page:
        end_page = int(raw_input('input the end page number:'))
        
    page_range = Queue.Queue()
    for i in range(start_page, end_page + 1):
        page_range.put(i)

    page_contents = {}
    thread_pool = threads.thread_pool(page_range, page_contents, html_url, html_page)
    thread_pool.build_thread(1, 1)
    thread_pool.all_start()
    thread_pool.all_join()
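
The listing above joins all the threads but never writes page_contents out; a minimal sketch of a final dump step, appended to the end of the __main__ block, might look like the following (the file name result.html and the wrapper markup are my own choices, not from the original):

    # Hypothetical final step, not in the original listing: dump the collected
    # contents (a dict keyed by page number) into a single HTML file.
    with open('result.html', 'w') as out:
        out.write('<html><head><meta charset="utf-8"></head><body>\n')
        out.write('<h1>%s</h1>\n' % title)
        for page_number in sorted(page_contents):
            content = page_contents[page_number]
            if isinstance(content, unicode):
                content = content.encode('utf-8')
            out.write(content)
        out.write('</body></html>\n')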
        


To run it, you have to create an img folder by hand in the same directory as the Python files to hold the downloaded images; being lazy, I never bothered to create the folder from Python (a small sketch below shows how you could). If the downloaded result displays as garbled text, set the page's encoding to Unicode.
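
If you would rather have the script create that folder itself, a short guard near the top of the main script would do it (my addition, not in the original code):

import os

# Create the image output directory instead of asking the user to make img/ by hand
if not os.path.exists('img'):
    os.makedirs('img')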

