2.python抓取html中表格中的内容


要获取网页中的table内容,并返回json字符串

http://bbs.ngacn.cc/read.php?tid=12241285

直接上代码

TabelScratch.py
#!/usr/bin/env python
# -*-coding:utf-8-*-

import json
import re
import urllib2
from HTMLParser import HTMLParser


def hello():
    """Print the literal text ``hello`` followed by a newline."""
    # Parenthesised form prints the same on Python 2 and is valid on Python 3.
    print('hello')


class TitleParser(HTMLParser):
    """Collect rank/name/rate rows from a BBCode table found inside ``<p>``.

    Text is only inspected while a ``<p>`` element is open.  Every text
    chunk containing ``[td]`` is treated as one table cell: the first three
    such chunks are skipped (header row), and every following group of three
    cells becomes one ``{"rank": ..., "name": ..., "rate": ...}`` object.

    Attributes:
        data: JSON array text of all completed rows.  It is ``'[]'`` until
            the first row is complete and is valid JSON at every moment.
        count: number of header cells skipped so far (stops at 3).
        item: number of cells collected for the row currently being built.
        isFirst: ``True`` until the first row has been emitted.
        handledtags: tag names whose text content is inspected.
        processing: the handled tag currently open, or ``None``.
    """

    # One complete BBCode cell; the group captures the cell content.
    _CELL_RE = re.compile(r'\[td\](.*?)\[/td\]')
    # JSON key given to the 1st, 2nd and 3rd cell of every row.
    _FIELDS = ('rank', 'name', 'rate')
    # Number of leading cell chunks that belong to the header row.
    _HEADER_CELLS = 3

    def __init__(self):
        self.isFirst = True
        self.count = 0
        self.item = 0
        self.data = '[]'
        # '[td]' can never be reported as an HTML tag; the entry is kept only
        # so the attribute's value stays what existing callers may expect.
        self.handledtags = ['p', '[td]']
        self.processing = None
        self._row = []    # cell values of the row being built
        self._rows = []   # already serialized JSON objects
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start inspecting text when one of ``handledtags`` opens."""
        if tag in self.handledtags:
            self.processing = tag

    def handle_data(self, data):
        """Consume one text chunk and record the table cell it contains."""
        if not self.processing or '[td]' not in data:
            return

        if self.count != self._HEADER_CELLS:
            # Still inside the header row: count the cell and ignore it.
            self.count += 1
            return

        match = self._CELL_RE.search(data)
        if match is None:
            # '[td]' without a matching '[/td]' in this chunk: skip it rather
            # than abort the whole parse with an IndexError.
            return

        # Nested BBCode markers are rewritten to angle brackets, exactly as
        # the previous bracket-translating implementation emitted them.
        value = match.group(1).replace('[', '<').replace(']', '>')
        self._row.append(value)
        self.item = len(self._row)

        if self.item == len(self._FIELDS):
            self._emit_row()

    def handle_endtag(self, tag):
        """Stop inspecting text when the currently handled tag closes.

        Returns ``self.data`` when the handled tag was closed, else ``None``
        (HTMLParser itself ignores the return value).
        """
        if tag == self.processing:
            self.processing = None
            return self.data

    def _emit_row(self):
        """Serialize the finished row and refresh ``self.data``."""
        # json.dumps escapes quotes, backslashes and control characters, so
        # arbitrary cell text can no longer produce invalid JSON.
        # ensure_ascii=False keeps non-ASCII text unescaped.
        pairs = [
            json.dumps(field) + ':' + json.dumps(value, ensure_ascii=False)
            for field, value in zip(self._FIELDS, self._row)
        ]
        self._rows.append('{' + ','.join(pairs) + '}')
        self._row = []
        self.item = 0
        self.isFirst = False
        # Rebuilt per row so ``data`` is always a complete JSON array no
        # matter how many <p> elements the page opens or closes.
        self.data = '[' + ','.join(self._rows) + ']'

            # def handle_entityref(self, name):
            #     if entitydefs.has_key(name):
            #         self.handle_data(entitydefs[name])
            #     else:
            #         self.handle_data('&' + name + ';')
            #
            # def handle_charref(self, name):
            #     try:
            #         charnum = int(name)
            #     except ValueError:
            #         return
            #     if charnum < 1 or charnum > 255:
            #         return
            #     self.handle_data(chr(charnum))


if __name__ == '__main__':
    # Manual run: download the thread page, parse it and print the JSON.
    response = urllib2.urlopen('http://bbs.ngacn.cc/read.php?tid=12241285')
    try:
        # The page bytes are decoded as GBK and re-encoded as UTF-8.
        content = response.read().decode('gbk').encode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    parser = TitleParser()
    parser.feed(content)
    # close() makes HTMLParser process any text it is still buffering.
    parser.close()
    # Without this the script did all the work and showed nothing.
    print(parser.data)

为了让其他应用也能使用这些数据,将其封装成 API 接口

PythonDemo.py
from flask import Flask
import TabelScratch
import urllib2

app = Flask(__name__)


@app.route('/lscs/rank')
def hello_world():
    """Fetch the forum thread, parse its table and return it as JSON.

    Returns:
        A response whose body is ``parser.data`` (the JSON text built by
        ``TabelScratch.TitleParser``) with the ``application/json`` MIME type.

    Raises:
        Whatever ``urllib2.urlopen`` or the GBK decoding raises; errors are
        not handled here.
    """
    response = urllib2.urlopen('http://bbs.ngacn.cc/read.php?tid=12241285')
    try:
        # The page bytes are decoded as GBK and re-encoded as UTF-8.
        content = response.read().decode('gbk').encode('utf8')
    finally:
        # Release the connection on every request, including failed ones.
        response.close()

    parser = TabelScratch.TitleParser()
    parser.feed(content)
    # close() makes HTMLParser process any text it is still buffering.
    parser.close()

    # Declare the payload as JSON instead of Flask's default text/html.
    return app.response_class(parser.data, mimetype='application/json')


if __name__ == '__main__':
    # Start Flask's built-in server on port 5000.
    # NOTE: the host value is the tutorial's placeholder text and has to be
    # replaced with the real address of the server before running.
    run_options = {'host': "服务器的IP", 'port': 5000}
    app.run(**run_options)

在Linux下启动后台服务

将文件PythonDemo.py和TabelScratch.py放到/home/ops/pyProj/目录下

nohup python -u /home/ops/pyProj/PythonDemo.py > ./out.log &

最后结果

浏览器中输入:http://服务器IP:5000/lscs/rank


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM