最近想做一個小web應用,就是把豆瓣讀書和亞馬遜等寫有書評的網站上關於某本書的打分記錄下來,這樣自己買書的時候當作參考。
這篇日誌只是以豆瓣網為例,只討論簡單的功能。
向服務器發送查詢請求
這很好處理,找到網站的搜索框,然後填入相關信息,提交後查看url即可。
這裡以豆瓣為例,當我在http://book.douban.com頁面的搜索框中輸入 現代操作系統 後得到下面的url:
http://book.douban.com/subject_search?search_text=%E7%8E%B0%E4%BB%A3%E6%93%8D%E4%BD%9C%E7%B3%BB%E7%BB%9F&cat=1001
這樣就知道如何向服務器提交查詢請求了,注意search_text後面的一串字符只是書名經過URL編碼(UTF-8百分號編碼)後的結果。
利用urllib2和urllib庫發送請求和獲取HTML頁面
詳見下面代碼:
# Title to look up on Douban Books.
book_name = '現代操作系統'
# Base URL of the search endpoint; the query string is appended below.
douban_book = 'http://book.douban.com/subject_search?'
# Reuse book_name so the title is written in one place only; cat=1001 is
# copied from the URL produced by the site's own search box.
search = [('search_text', book_name), ('cat', '1001')]
# urlencode percent-encodes the values and joins the pairs with '&'.
getbook = douban_book + urllib.urlencode(search)
# Download the result page as a raw string.
content = urllib2.urlopen(getbook).read()
利用SGMLParser庫解析HTML文本
- 第一步,利用瀏覽器自帶的查看頁面信息的工具,查看頁面布局。

- 根據布局,思考解析的方法。這一步很重要,決定了第三步的效率
- 編寫代碼。基本上就是重寫SGMLParser子類的方法。
詳細代碼
代碼寫得很亂,一些語法還不是很熟悉。我是以寫代碼來學習Python的,什麼不懂就查什麼。
# -*- coding: utf-8 -*-
import urllib2
import urllib
from sgmllib import SGMLParser
class BookInfo(SGMLParser):
def reset(self):
SGMLParser.reset(self)
# 標記對應的標簽
self.is_subject = 0
self.is_subject_info = 0
self.is_subject_h2 = 0
self.is_subject_pub = 0
self.is_subject_star = 0
self.temp = {} # 一個字典,保存暫時的信息
self.info = [] # 一個列表,保存所有的信息
# li標簽開始出現
def start_li(self,attrs):
if 'subject-item' in [v for k, v in attrs if k == 'class']:
self.is_subject = 1
# li標簽結束
def end_li(self):
self.is_subject = 0
def start_h2(self,attrs):
if self.is_subject == 1 and '' in [v for k,v in attrs if k == 'class']:
self.is_subject_h2 = 1
def end_h2(self):
self.is_subject_h2 = 0
def start_div(self,attrs):
attr = ''
for k,v in attrs:
if k == 'class':
attr = v
break
if attr == 'info' and self.is_subject == 1:
self.is_subject_info = 1
elif attr == 'pub' and self.is_subject_info == 1:
self.is_subject_pub = 1
elif attr == 'star clearfix' and self.is_subject_info == 1:
self.is_subject_star = 1
else:
pass
def end_div(self):
if self.is_subject_star == 0:
if self.is_subject_pub == 0:
self.is_subject_info = 0
self.info.append(self.temp)
self.temp = {}
else:
self.is_subject_pub = 0
else:
self.is_subject_star = 0
def handle_data(self,data):
if self.is_subject_h2:
string = data.strip()
if len(string):
if 'name' in self.temp:
self.temp['name'] = self.temp['name'] + string
else:
self.temp['name'] = string
#print string
elif self.is_subject_pub:
string = data.strip()
if len(string):
if 'pub' in self.temp:
self.temp['pub'] = self.temp['pub']+string
else:
self.temp['pub'] = string
elif self.is_subject_star:
string = data.strip()
if len(string):
if 'star' in self.temp:
self.temp['star'] = self.temp['star'] + string
else:
self.temp['star'] = string
#print string
else:
pass
book_name = '現代操作系統'
douban_book = 'http://book.douban.com/subject_search?'
search = [('search_text','現代操作系統'),('cat','1001')]
getbook = douban_book + urllib.urlencode(search)
print getbook
content = urllib2.urlopen(getbook).read()
fobj = open('book.txt','w')
fileobj = open('books.txt','w')
book = BookInfo()
book.feed(content)
for books in book.info:
for item in books:
print '*************************************************'
print '書名:%s' % books['name']
if 'pub' in books:
print '出版信息:%s' % books['pub']
if 'star' in books:
print '評價:%s' % books['star']
break
fobj.write(content)
fobj.close()
fileobj.close()
輸出結果

這只是開頭的第一步,以後的日子裡還要不斷地學習和實踐。。。
Bug修復和改進
上面的代碼其實還是有問題的,只是沒有被發現。當標記第一個 div 標簽時的確沒有問題,但是當出現第二個 div 標簽時,如果第二個是第一個的子元素,那麼當處理第二個(子)標簽的 /div 閉合標簽的時候就會出錯。
一個小小的改進。這個程序嚴格要求輸入的是正確的書名,這樣處理的結果才是正確的。如果不是完全正確的書名,我的代碼量就成幾何倍數增加了。在豆瓣讀書中,評價數小於特定的數目時,是沒有評分的(代表這個版次的書一般是很久的,上個世紀的書了),那麼就沒有參考價值了。
下面是修改后的代碼:
# -*- coding: utf-8 -*-
import urllib2
import urllib
from sgmllib import SGMLParser
class BookInfo(SGMLParser):
def reset(self):
SGMLParser.reset(self)
# 標記對應的標簽
self.is_subject = 0
self.is_subject_info = 0
self.is_subject_h2 = 0
self.is_subject_pub = 0
self.is_subject_star = 0
self.is_subject_rating = 0
self.temp = {} # 一個字典,保存暫時的信息
self.info = [] # 一個列表,保存所有的信息
# li標簽開始出現
def start_li(self,attrs):
if 'subject-item' in [v for k, v in attrs if k == 'class']:
self.is_subject = 1
# li標簽結束
def end_li(self):
self.is_subject = 0
def start_h2(self,attrs):
if self.is_subject == 1 and '' in [v for k,v in attrs if k == 'class']:
self.is_subject_h2 = 1
def end_h2(self):
self.is_subject_h2 = 0
def start_div(self,attrs):
attr = ''
for k,v in attrs:
if k == 'class':
attr = v
break
if attr == 'info' and self.is_subject == 1:
self.is_subject_info = 1
elif attr == 'pub' and self.is_subject_info == 1:
self.is_subject_pub = 1
elif attr == 'star clearfix' and self.is_subject_info == 1:
self.is_subject_star = 1
else:
pass
def end_div(self):
if self.is_subject_info:
if self.is_subject_pub:
if self.is_subject_star:
self.is_subject_star = 0
self.is_subject_rating = 0
else:
self.is_subject_pub = 0
elif self.is_subject_star:
self.is_subject_star = 0
self.is_subject_rating = 0
if len(self.temp) == 3:
self.info.append(self.temp)
self.temp = {}
else:
self.is_subject_info = 0
def start_span(self,attrs):
if self.is_subject_star and 'allstar45' in [v for k,v in attrs if k == 'class']:
print [v for k,v in attrs if k == 'class']
self.is_subject_rating = 1
def handle_data(self,data):
if self.is_subject_h2:
string = data.strip()
if len(string):
if 'name' in self.temp:
self.temp['name'] = self.temp['name'] + string
else:
self.temp['name'] = string
if string != book_name:
self.temp = {}
#print string
elif self.is_subject_pub:
string = data.strip()
if len(string):
if 'pub' in self.temp:
self.temp['pub'] = self.temp['pub']+string
else:
self.temp['pub'] = string
elif self.is_subject_star:
string = data.strip()
if len(string) and self.is_subject_rating:
if 'star' in self.temp:
self.temp['star'] = self.temp['star'] + string
else:
self.temp['star'] = string
print string
else:
pass
book_name = '現代操作系統'
douban_book = 'http://book.douban.com/subject_search?'
search = [('search_text','現代操作系統'),('cat','1001')]
getbook = douban_book + urllib.urlencode(search)
print getbook
content = urllib2.urlopen(getbook).read()
fobj = open('book.txt','w')
fileobj = open('books.txt','w')
book = BookInfo()
book.feed(content)
for books in book.info:
for item in books:
print '*************************************************'
print '書名:%s' % books['name']
if 'pub' in books:
print '出版信息:%s' % books['pub']
if 'star' in books:
print '評價:%s' % books['star']
break
fobj.write(content)
fobj.close()
fileobj.close()
下面是輸出結果:

以後程序的修改就是將這本書的所有版本的評價綜合起來,再加上亞馬遜的評價,就可以了。
-end-
