2.python抓取html中表格中的內容


要獲取網頁中的table內容,並返回json字符串

http://bbs.ngacn.cc/read.php?tid=12241285

直接上代碼

TabelScratch.py
#!/usr/bin/env python
# -*-coding:utf-8-*-

import json
import re
import urllib2
from HTMLParser import HTMLParser


def hello():
    """Print the string ``hello`` to standard output."""
    # The call form prints the same text on Python 2 and Python 3, whereas
    # the bare ``print 'hello'`` statement is a syntax error on Python 3.
    print('hello')


class TitleParser(HTMLParser):
    """Collect BBCode table cells found inside ``<p>`` elements as JSON.

    Text chunks that appear inside a handled tag and contain ``[td]`` are
    treated as one table cell each (only the first ``[td]...[/td]`` pair of
    a chunk is read).  The first ``HEADER_CELLS`` such chunks are skipped
    (presumably the table's header row -- confirm against the page), and
    every following group of three cells becomes one object with the keys
    ``rank``, ``name`` and ``rate``.

    Attributes:
        data: JSON array, as a string, of every row completed so far.  It
            is always well-formed: ``'[]'`` before the first row, never a
            truncated object and never more than one closing bracket.
        handledtags: tag names whose text content is inspected.
        processing: the handled tag currently open, or ``None``.
        count: number of leading ``[td]`` chunks skipped so far.
        item: number of cells collected for the row in progress (0-2).
        isFirst: ``True`` until the first data cell has been recorded.
    """

    # Leading ``[td]`` chunks that are ignored before rows are collected.
    HEADER_CELLS = 3
    # Object keys, in the order the cells arrive.
    FIELDS = ('rank', 'name', 'rate')
    # First BBCode cell of a chunk; matching the brackets directly avoids
    # rewriting the chunk into pseudo-HTML before searching it.
    _CELL_RE = re.compile(r'\[td\](.*?)\[/td\]')

    def __init__(self):
        self.isFirst = True
        self.count = 0
        self.item = 0
        self.data = '[]'
        self.handledtags = ['p', '[td]']
        self.processing = None
        self._cells = []  # values of the row currently being assembled
        self._rows = []   # completed rows, each already serialised
        # Explicit base call (rather than super()) keeps this working with
        # the Python 2 HTMLParser, which is an old-style class.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start inspecting text when one of ``handledtags`` opens."""
        if tag in self.handledtags:
            self.processing = tag

    def handle_data(self, data):
        """Record the table cell carried by ``data``, if there is one.

        Chunks outside a handled tag, chunks without ``[td]`` and chunks
        whose cell is not closed by ``[/td]`` are ignored.
        """
        if not self.processing or '[td]' not in data:
            return

        if self.count != self.HEADER_CELLS:
            self.count += 1
            return

        match = self._CELL_RE.search(data)
        if match is None:
            # Incomplete cell: skip it instead of raising IndexError and
            # leave the row position untouched.
            return

        # Square brackets inside a cell have always been emitted as angle
        # brackets; keep that so existing consumers see the same values.
        value = match.group(1).replace('[', '<').replace(']', '>')

        self._cells.append(value)
        self.item = len(self._cells)
        self.isFirst = False

        if self.item == len(self.FIELDS):
            self._append_row(self._cells)
            self._cells = []
            self.item = 0

    def handle_endtag(self, tag):
        """Stop inspecting text when the handled tag closes.

        Returns ``self.data`` when ``tag`` closed the handled element and
        ``None`` otherwise; HTMLParser itself ignores the return value.
        """
        if tag == self.processing:
            self.processing = None
            return self.data

    def _append_row(self, cells):
        """Serialise one finished row and refresh ``self.data``."""
        # json.dumps escapes quotes, backslashes and control characters;
        # ensure_ascii=False leaves non-ASCII text as it was received.
        pairs = ['"%s":%s' % (key, json.dumps(value, ensure_ascii=False))
                 for key, value in zip(self.FIELDS, cells)]
        self._rows.append('{' + ','.join(pairs) + '}')
        self.data = '[' + ','.join(self._rows) + ']'


if __name__ == '__main__':
    # Manual run: download the forum thread, re-encode it from GBK to
    # UTF-8 and push it through the parser.
    raw_page = urllib2.urlopen('http://bbs.ngacn.cc/read.php?tid=12241285').read()
    content = raw_page.decode('gbk').encode('utf8')
    parser = TitleParser()
    parser.feed(content)

為了讓其他應用也能使用這些數據,將其封裝成API接口

PythonDemo.py
from flask import Flask
import TabelScratch
import urllib2

# Flask application object; the route below is registered on it via
# @app.route and it is started with app.run() when run as a script.
app = Flask(__name__)


@app.route('/lscs/rank')
def hello_world():
    """Fetch the forum thread and return its ranking table as JSON.

    The page is downloaded on every request, decoded as GBK, re-encoded as
    UTF-8 and fed to ``TabelScratch.TitleParser``.

    Returns:
        A ``(body, status, headers)`` tuple: the parser's JSON string,
        status 200 and a JSON content type, so clients do not receive the
        payload labelled with Flask's default HTML content type.
    """
    response = urllib2.urlopen('http://bbs.ngacn.cc/read.php?tid=12241285')
    try:
        raw_page = response.read()
    finally:
        # Release the upstream connection even if the read fails.
        response.close()

    content = raw_page.decode('gbk').encode('utf8')
    parser = TabelScratch.TitleParser()
    parser.feed(content)
    # Flush any text still buffered inside the parser.
    parser.close()
    return parser.data, 200, {'Content-Type': 'application/json; charset=utf-8'}


if __name__ == '__main__':
    # Start the Flask server on port 5000 when executed as a script.
    # NOTE: the host string is a placeholder and must be replaced with the
    # server's real IP address before running.
    app.run(port=5000, host="服務器的IP")

在Linux下啟動後台服務

將文件PythonDemo.py和TabelScratch.py放到/home/ops/pyProj/目錄下

nohup python -u /home/ops/pyProj/PythonDemo.py > ./out.log &

最後結果

瀏覽器中輸入:http://服務器IP:5000/lscs/rank


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM