要获取网页中的table内容,并返回json字符串
http://bbs.ngacn.cc/read.php?tid=12241285
直接上代码
TabelScratch.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape a BBCode table out of a forum page and expose it as a JSON string.

The table cells appear in the page as BBCode text (``[td]...[/td]``) inside
HTML ``<p>`` elements.  ``TitleParser`` skips the three header cells, groups
every following three cells into a ``rank`` / ``name`` / ``rate`` record and
publishes the records as a JSON array string in ``TitleParser.data``.

Written to run on both Python 2.7 and Python 3.
"""
import json
import re
from collections import OrderedDict
from contextlib import closing

try:  # Python 2
    from HTMLParser import HTMLParser
    from urllib2 import urlopen
except ImportError:  # Python 3
    from html.parser import HTMLParser
    from urllib.request import urlopen

PAGE_URL = 'http://bbs.ngacn.cc/read.php?tid=12241285'
PAGE_ENCODING = 'gbk'
HEADER_CELLS = 3                       # leading [td] cells that form the header row
FIELDS = ('rank', 'name', 'rate')      # JSON keys, in cell order
CELL_RE = re.compile(r'\[td\](.*?)\[/td\]')


def hello():
    """Print ``hello``."""
    print('hello')


class TitleParser(HTMLParser):
    """Collect ``[td]`` cells found inside handled tags into JSON records.

    Attributes:
        data: JSON array string of all complete records parsed so far.  It is
            refreshed whenever a handled tag closes and after every call to
            ``feed`` or ``close``, so it is always well-formed JSON.
        rows: the same records as a list of ordered dicts.
    """

    def __init__(self):
        self.isFirst = True    # True until the first data cell has been stored
        self.count = 0         # header cells skipped so far
        self.item = 0          # number of cells collected for the current record
        self.rows = []         # completed records
        self._cells = []       # cells of the record currently being assembled
        self.data = '[]'
        # NOTE(review): '[td]' is BBCode inside text, not an HTML tag name, so
        # presumably only 'p' ever matches; the list is kept unchanged.
        self.handledtags = ['p', '[td]']
        self.processing = None
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a handled tag opens."""
        if tag in self.handledtags:
            self.processing = tag

    def handle_data(self, data):
        """Store the first ``[td]...[/td]`` cell of *data* in the current record.

        Text outside a handled tag, text without ``[td]`` and the first
        ``HEADER_CELLS`` cells are ignored.  A chunk that contains ``[td]``
        but no closing ``[/td]`` is skipped instead of raising ``IndexError``.
        """
        if not self.processing or '[td]' not in data:
            return
        if self.count != HEADER_CELLS:
            self.count += 1
            return

        # Match the BBCode directly: this leaves any brackets inside the cell
        # text untouched and does not depend on trailing "[/tr]" markers.
        match = CELL_RE.search(data)
        if match is None:
            return

        self.isFirst = False
        self._cells.append(match.group(1))
        self.item = len(self._cells)
        if self.item == len(FIELDS):
            self.rows.append(OrderedDict(zip(FIELDS, self._cells)))
            self._cells = []
            self.item = 0

    def handle_endtag(self, tag):
        """Stop collecting when the handled tag closes and refresh ``data``.

        Returns the JSON string when the handled tag closed, otherwise None.
        HTMLParser ignores the return value; it is kept for direct callers.
        """
        if tag != self.processing:
            return None
        self.processing = None
        self._refresh()
        return self.data

    def feed(self, data):
        """Parse *data* and bring ``self.data`` up to date."""
        HTMLParser.feed(self, data)
        self._refresh()

    def close(self):
        """Flush buffered input and bring ``self.data`` up to date."""
        HTMLParser.close(self)
        self._refresh()

    def _refresh(self):
        """Serialize the completed records into ``self.data``."""
        # json.dumps escapes quotes/backslashes in cell text; compact
        # separators and ensure_ascii=False keep the original output layout.
        self.data = json.dumps(self.rows, ensure_ascii=False, separators=(',', ':'))


def fetch_page(url=PAGE_URL, encoding=PAGE_ENCODING):
    """Download *url* and return its body decoded from *encoding* as text."""
    with closing(urlopen(url)) as response:
        return response.read().decode(encoding)


if __name__ == '__main__':
    parser = TitleParser()
    parser.feed(fetch_page())
    parser.close()
为了能够将数据提供给其他应用使用,下面把它做成API接口
PythonDemo.py
# -*- coding: utf-8 -*-
"""Flask service that serves the scraped forum table as JSON at /lscs/rank."""
from contextlib import closing

try:  # Python 2
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

from flask import Flask, Response

import TabelScratch

PAGE_URL = 'http://bbs.ngacn.cc/read.php?tid=12241285'

app = Flask(__name__)


@app.route('/lscs/rank')
def hello_world():
    """Fetch the forum page, parse its table and return the JSON string.

    The page is downloaded on every request, decoded from GBK and fed to
    ``TabelScratch.TitleParser``; the parser's ``data`` string is sent back
    with an ``application/json`` content type.
    """
    # closing() releases the HTTP connection even if read() raises.
    with closing(urlopen(PAGE_URL)) as response:
        content = response.read().decode('gbk')
    parser = TabelScratch.TitleParser()
    parser.feed(content)
    return Response(parser.data, mimetype='application/json')


if __name__ == '__main__':
    app.run(
        host="服务器的IP",  # placeholder: replace with the server's IP address
        port=5000
    )
在Linux下启动后台服务
将文件PythonDemo.py和TabelScratch.py放到/home/ops/pyProj/目录下
# 2>&1 sends stderr (tracebacks, server log) to out.log as well
nohup python -u /home/ops/pyProj/PythonDemo.py > ./out.log 2>&1 &
最后结果
浏览器中输入:http://服务器IP:5000/lscs/rank