一個逐頁抓取網站小說的爬蟲


需求:

抓取某些網站上的小說,按頁抓取

每頁都有 next 按鈕,獲取這個 next 按鈕的 href,然後就可以逐頁抓取

解析網頁使用 BeautifulSoup

 

from bs4 import BeautifulSoup
import urllib2
import time

import sys


# Example page URL: http://www.vc.com/htm/2016/12/24/t02/367246.html
# Scheme and host that html_process prepends to the href of the next-page link.
host_name = 'http://www.vc.com'

def html_process(html_file,url):
	'''
	Append one page's title and text to the novel file and return the next URL.

	html_file: HTML of the page; handed unchanged to BeautifulSoup.
	url: address the page came from; written to the output as a marker line.

	Returns host_name concatenated with the href of the anchor inside the
	<li class="next"> element.
	Prints 'next link is None' and exits with status 0 when the page has no
	such link (the page is still written to the output file first).
	'''
	# Local import: io is not among the file-level imports.
	import io

	soup = BeautifulSoup(html_file,"html.parser")

	#####################################################
	#get title: the part of <title> before the first '-'
	# soup.title, or its .string, is None when the page has no plain-text
	# title; use an empty title rather than raising AttributeError.
	title_text = soup.title.string if soup.title is not None else None
	title_ret = title_text.split('-')[0].strip() if title_text else u''

	#####################################################
	#get content
	content_div = soup.find("div",id='view2')
	if content_div is None:
		print('content div is missing: ' + url)
		content = u''
	else:
		content = content_div.get_text()

	#####################################################
	# Everything is extracted before the file is opened, and the record is
	# written in one call inside `with`, so the handle is always closed and a
	# parsing failure cannot leave a half-written record behind.
	record = u''.join([
		u'######################################',
		u'\r\n', url, u'\r\n',
		u'\r\n@# ', title_ret, u'\r\n',
		content, u'\r\n',
	])
	text = '/dev/shm/novel.txt'
	# Explicit UTF-8 so non-ASCII text is written correctly regardless of
	# the interpreter's default encoding.
	with io.open(text,'a',encoding='utf-8') as out:
		out.write(record)

	#####################################################
	#get next href
	# find() returns None when nothing matches; indexing find_all()[0] raised
	# IndexError instead, which made the None check below it unreachable.
	link = soup.find("li",class_ = "next")
	anchor = link.a if link is not None else None
	href = anchor.get('href') if anchor is not None else None
	if not href:
		print('next link is None')
		sys.exit(0)

	return host_name + href


def html_get(url, retries=3):
	'''
	Download url with a browser-like User-Agent and return the response body.

	url: address to fetch.
	retries: number of extra attempts made after a timeout (default 3).

	Returns the result of urlopen(...).read() for the page.
	Prints a message and exits with status 1 on a non-timeout URLError, or
	when every attempt timed out.
	'''
	# socket is not among the file-level imports; without it the
	# `except socket.timeout` clause raised NameError instead of retrying.
	import socket

	user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"
	headers = {'User-Agent':user_agent}
	req = urllib2.Request(url,headers = headers)

	# A bounded loop replaces the unbounded recursive retry, which could run
	# into the recursion limit against a host that never answers.
	for _ in range(retries + 1):
		try:
			return urllib2.urlopen(req,timeout=20).read()
		except socket.timeout:
			# Timed out while reading the response body: fall through and retry.
			pass
		except urllib2.URLError as e:
			# urllib2 wraps connection-phase socket errors in URLError, so a
			# connect timeout shows up as e.reason; anything else is fatal.
			if not isinstance(getattr(e,'reason',None),socket.timeout):
				print("error while loading " + url)
				sys.exit(1)
		print("timeout while loading " + url)

	# Every attempt timed out.
	print("error while loading " + url)
	sys.exit(1)

def test(url, delay=5):
	'''
	Crawl pages starting at url, following each page's next link.

	url: first page to fetch; the loop ends once the current URL is None.
	delay: seconds to sleep after each processed page (default 5).

	Each page is fetched with html_get and handed to html_process, whose
	return value becomes the next URL. Exits with status 1 if html_get
	returns None for a page.
	'''
	while url is not None:
		html_file = html_get(url)
		if html_file is None:
			print 'ERROR OF READING ',url
			sys.exit(1)
		url = html_process(html_file,url)
		# Pause between consecutive page fetches.
		time.sleep(delay)

if __name__ == '__main__':
	# First page of the crawl.
	start_url = "http://www.vc.com/htm/2013/11/2/t02/316551.html"

	# Switch the process-wide default string encoding to UTF-8 before any
	# page text is written; sys is reloaded first, then the setter is called.
	reload(sys)
	sys.setdefaultencoding( "utf-8" )

	test(start_url)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM