<Crawler> A round-up of crawlers for common websites


001. Baidu Tieba

# The shebang below tells the OS to invoke the python3 interpreter under /usr/bin when the script is executed.
#!/usr/bin/python3
# -*- coding: utf-8 -*-


"""
Request URL analysis	https://tieba.baidu.com/f?kw=魔獸世界&ie=utf-8&pn=50
Request method analysis	GET
Request parameter analysis	pn changes by 50 per page; the other parameters stay fixed
Request header analysis	only a User-Agent needs to be added
"""

# Implementation outline
# 1. Build the crawler as a class (object-oriented)
# 2. The four-step crawling workflow
# 2.1 Build the list of URLs
# 2.2 Send requests and get responses
# 2.3 Extract data from the responses
# 2.4 Save the data

import requests


class TieBa_Spier():

	def __init__(self, max_page, kw):
		# Initialization
		self.max_page = max_page  # maximum page number
		self.kw = kw  # tieba (forum board) name
		self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
		self.headers = {
			"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
		}

	def get_url_list(self):
		"""Build the list of URLs"""
		# pn advances by 50 per page; build one URL per page
		return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_page * 50, 50)]

	def get_content(self, url):
		"""Send the request and return the response content"""
		response = requests.get(
			url=url,
			headers=self.headers
		)
		# print(response.text)
		return response.content

	def save_items(self, content, idx):
		"""Save the response content as an HTML file"""
		with open('{}.html'.format(idx), 'wb') as f:
			f.write(content)
		return None

	def run(self):
		"""Run the crawler"""
		# Build the url list
		url_list = self.get_url_list()
		for url in url_list:
			# Send the request and get the response
			content = self.get_content(url)

			# Save the data; files are named by the URL's index + 1
			items = self.save_items(content, url_list.index(url) + 1)

			# For testing
			# print(items)


if __name__ == '__main__':
	# max page count, tieba name
	spider = TieBa_Spier(2, "神無月")
	spider.run()

002. JD product comments

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import re
import pandas as pd

"""
Request URL analysis	https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1
Request method analysis	GET
Request parameter analysis	page increases by 1 per page; the other parameters stay fixed
Request header analysis	no User-Agent needed
"""


# Implementation outline
# 1. Build the crawler as a class (object-oriented)
# 2. The four-step crawling workflow
# 2.1 Build the list of URLs
# 2.2 Send requests and get responses
# 2.3 Extract data from the responses
# 2.4 Save the data


class JD_Spier():

	def __init__(self, max_page):
		# Initialization
		self.max_page = max_page  # maximum page number
		self.base_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"

	def get_url_list(self):
		"""Build the list of URLs"""
		# page advances by 1 per page; build one URL per page
		return [self.base_url.format(page) for page in range(0, self.max_page, 1)]

	def get_content(self, url):
		"""Send the request and return the response text"""
		response = requests.get(url=url)
		# print(response.text)
		return response.text

	def save_items(self, content):
		"""Extract the comment texts from the response and append them to a file"""
		with open('comment_iphone11.txt', 'a', encoding='utf-8') as f:
			pat = '"content":"(.*?)","'
			res = re.findall(pat, content)
			for index, i in enumerate(res):
				i = i.replace('\\n', '')
				# print(i)
				f.write(str(index) + ':' + i)
				f.write('\n')
			f.write('\n')
		return None

	def run(self):
		"""Run the crawler"""
		# Build the url list
		url_list = self.get_url_list()
		for index, url in enumerate(url_list):
			# Send the request and get the response
			try:
				print('Crawling page %s...' % index)
				content = self.get_content(url)
				# Save the data
				self.save_items(content)
			except Exception:
				print('Problem crawling page ' + str(index))
				continue


if __name__ == '__main__':
	# maximum page number
	spider = JD_Spier(99)
	spider.run()
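
Since the response is JSONP (the callback= parameter wraps a JSON object), an alternative to the regex is to strip the wrapper and parse the payload with the json module. A minimal sketch, not part of the original script; it assumes the payload keeps its comments under a comments key with content fields, as the regex above suggests:

import json

import requests


def parse_comments(url):
	"""Fetch one comment page and pull the comment texts out via json instead of a regex (sketch)."""
	text = requests.get(url).text
	# Strip the JSONP wrapper: keep everything between the first '(' and the last ')'
	body = text[text.find('(') + 1:text.rfind(')')]
	data = json.loads(body)
	# Assumption: the comment list sits under 'comments', each entry with a 'content' field
	return [c.get('content', '').replace('\n', ' ') for c in data.get('comments', [])]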

As a bonus, build a word cloud from the comments

from os import path
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud

# Text to segment
with open('comment_iphone11.txt', 'r', encoding='utf-8') as f:
    text = f.read()

cut_text = ' '.join(jieba.lcut(text))
print(cut_text)
# Word-cloud mask shape (scipy.misc.imread was removed from newer SciPy; matplotlib's imread works instead)
color_mask = plt.imread("201910051325286.jpg")
cloud = WordCloud(
    # The font must be set; keeping the .ttf next to the script is easiest
    font_path='FZMWFont.ttf',
    background_color='white',
    mask=color_mask,
    max_words=200,
    max_font_size=5000
)
word_cloud = cloud.generate(cut_text)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()

Result image (omitted)

003. Douban Movie Top 250 (three parsing methods)

# Goal: crawl the movie information from Douban's Top 250 ranking
# Fields: title, release date, leading actors, rating, director, one-line review
# Try each of the parsing methods covered so far: (1) regular expressions, (2) BeautifulSoup, (3) XPath
import requests
import re  # regular expressions
import json
from bs4 import BeautifulSoup  # BS
from lxml import etree  # xpath
# process pool
from multiprocessing import Pool
import multiprocessing



def get_one_page(url):
	headers = {
		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
	}
	response = requests.get(url, headers=headers)
	if response.status_code == 200:
		return response.text
	return None


def zhengze_parse(html):
	pattern = re.compile(
		'<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)".*?property="v:average">(.*?)</span>.*?<span>(.*?)</span>.*?'
		+ 'class="inq">(.*?)</span>', re.S)
	items = re.findall(pattern, html)
	# print(items)
	# Movie #125 has no one-line review, so this single large regex fails to match it at all. The fix is simple
	# (pull the quote out with a separate pattern); it is left unchanged here, but a sketch appears after the full listing.
	for item in items:
		yield {
			'index': item[0],
			'title': item[1],
			'image': item[2],
			'score': item[3],
			'people': item[4].strip()[:-2],
			'Evaluation': item[5]
		}


def soup_parse(html):
	soup = BeautifulSoup(html, 'lxml')
	for data in soup.find_all('div', class_='item'):
		index = data.em.text
		image = data.img['src']
		title = data.img['alt']
		people = data.find_all('span')[-2].text[:-2]
		score = data.find('span', class_='rating_num').text
		# Movie #125 has no one-line review; use an empty string instead
		if data.find('span', class_='inq'):
			Evaluation = data.find('span', class_='inq').text
		else:
			Evaluation = ''
		yield {
			'index': index,
			'image': image,
			'title': title,
			'people': people,
			'score': score,
			'Evaluation': Evaluation,
		}


def xpath_parse(html):
	html = etree.HTML(html)
	for data in html.xpath('//ol[@class="grid_view"]/li'):
		index = data.xpath('.//em/text()')[0]
		image = data.xpath('.//a/img/@src')[0]
		title = data.xpath('.//a/img/@alt')[0]
		people = data.xpath('.//div[@class="star"]/span[4]/text()')[0][:-2]
		score = data.xpath('.//div[@class="star"]/span[2]/text()')[0]
		# Movie #125 has no one-line review; use an empty string instead
		if data.xpath('.//p[@class="quote"]/span/text()'):
			Evaluation = data.xpath('.//p[@class="quote"]/span/text()')[0]
		else:
			Evaluation = ''
		yield {
			'index': index,
			'image': image,
			'title': title,
			'people': people,
			'score': score,
			'Evaluation': Evaluation,
		}


def write_to_file(content, flag):
	with open('豆瓣電影TOP250(' + str(flag) + ').txt', 'a', encoding='utf-8') as f:
		f.write(json.dumps(content, ensure_ascii=False) + '\n')


def search(Num):
	url = 'https://movie.douban.com/top250?start=' + str(Num)
	html = get_one_page(url)
	for item in zhengze_parse(html):
		write_to_file(item, '正則表達式')
	for item in soup_parse(html):
		write_to_file(item, 'BS4')
	for item in xpath_parse(html):
		write_to_file(item, 'xpath')
	page = Num // 25 + 1
	print("Crawling page " + str(page))


def main():
	pool = Pool()
	pool.map(search, [i * 25 for i in range(10)])
	# # Page numbers without a process pool:
	# for i in range(0, 10):
	# 	Num = i * 25
	# 	search(Num)
	print("Crawling finished")


if __name__ == '__main__':
	# After packaging into an exe, multiprocessing errors out on Windows without this line
	multiprocessing.freeze_support()
	# Entry point
	main() 
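
As noted inside zhengze_parse, movie #125 has no one-line review, so the single large regex skips it. A minimal sketch of the fix mentioned there, not part of the original script: split the page into per-movie blocks (using the same div class the BeautifulSoup parser relies on) and match the quote separately.

import re

ITEM_PATTERN = re.compile(
	'<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)".*?'
	'property="v:average">(.*?)</span>.*?<span>(.*?)</span>', re.S)
QUOTE_PATTERN = re.compile('class="inq">(.*?)</span>', re.S)


def zhengze_parse_safe(html):
	"""Regex parser that tolerates a missing one-line review."""
	# Split into per-movie blocks first, then match the quote on its own
	for block in html.split('<div class="item">')[1:]:
		item = ITEM_PATTERN.search(block)
		if not item:
			continue
		quote = QUOTE_PATTERN.search(block)
		yield {
			'index': item.group(1),
			'title': item.group(2),
			'image': item.group(3),
			'score': item.group(4),
			'people': item.group(5).strip()[:-2],
			'Evaluation': quote.group(1) if quote else '',
		}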

Packaging into a standalone exe

pyinstaller -F 豆瓣電影排行.py

Run result (screenshot omitted)

004. Toutiao street-photo image search

# URL building
from urllib.parse import urlencode
# HTTP requests
import requests
# file-system operations
import os
# md5: hash of the image bytes, used as a non-repeating file name
from hashlib import md5
# process pool
from multiprocessing.pool import Pool
# delays
import time

base_url = 'https://www.toutiao.com/api/search/content/?'

headers = {
	'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
	'X-Requested-With': 'XMLHttpRequest',
}


def get_page(offset):
	# https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis
	# Build params from the captured request; only offset changes
	params = {
		'aid': '24',
		'app_name': 'web_search',
		'offset': offset,
		'format': 'json',
		'keyword': '街拍',
		'autoload': 'true',
		'count': '20',
		'en_qc': '1',
		'cur_tab': '1',
		'from': 'search_tab',
		'pd': 'synthesis',
	}
	url = base_url + urlencode(params)
	# Return the JSON payload
	try:
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			return response.json()
	except requests.ConnectionError as e:
		print('Error', e.args)


def get_images(json):
	if json:
		items = json.get('data')
		for item in items:
			# title
			title = item.get('title')
			# image list
			images = item.get('image_list')
			for image in images:
				# yield one dict per image: image URL + title
				yield {
					'image': image.get('url'),
					'title': title,
				}


def save_image(item):
	# Create the download folder if it does not exist
	dirs = 'F:\\domo'
	if not os.path.exists(dirs):
		os.mkdir("F:\\domo")
	# Change the current working directory
	os.chdir('F:\\domo')
	# Create a sub-folder named after the item's title if it does not exist yet
	if not os.path.exists(item.get('title')):
		os.mkdir(item.get('title'))
	try:
		# Request the image URL
		response = requests.get(item.get('image'))
		if response.status_code == 200:
			# Build the file name from the MD5 of the image bytes
			file_path = '{0}\\{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
			# Write the image in binary mode if it has not been saved yet
			if not os.path.exists(file_path):
				with open(file_path, 'wb') as f:
					f.write(response.content)
			else:
				print("This file was already downloaded:", file_path)
	except Exception:
		print("Image download failed")


GROUP_START = 1
GROUP_END = 20


def main(offset):
	json = get_page(offset)
	for item in get_images(json):
		print(item)
		save_image(item)


if __name__ == '__main__':
	pool = Pool()
	# Build the offset list: 20 to 400 (20 pages)
	groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
	# Run main across multiple processes
	pool.map(main, groups)
	# Close the pool
	pool.close()
	# Wait for the remaining workers to finish
	pool.join()

After about 10 pages the API stops returning data; a User-Agent pool is needed (a minimal sketch follows).
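
One way to add a User-Agent pool is to pick a random UA for each request. A minimal sketch; the pool below mixes the two UA strings already used in this post with one extra illustrative string, and a real pool would be much larger:

import random

import requests

USER_AGENTS = [
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
	# Illustrative example; swap in whatever UA strings you collect
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
]


def get_with_random_ua(url, base_headers):
	"""Send the request with a User-Agent picked at random from the pool."""
	headers = dict(base_headers)
	headers['User-Agent'] = random.choice(USER_AGENTS)
	return requests.get(url, headers=headers)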

Summary:

  1. Basic os-module operations:

    os.chdir(path) ---------------------- change the current working directory to path

    os.path.exists(name) ---------------- whether the file exists in the current directory; returns True or False

    os.mkdir() -------------------------- create a directory

  2. Naming files by the MD5 of their bytes is an effective way to avoid saving duplicates (see the sketch after this list).

  3. A process pool greatly reduces the crawl time.
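
For point 2, a minimal standalone sketch of the MD5-based naming used in save_image above; the directory path mirrors the one in the listing:

import os
from hashlib import md5


def save_unique(content, directory='F:\\domo'):
	"""Name the file after the MD5 of its bytes so the same image is never written twice."""
	os.makedirs(directory, exist_ok=True)
	file_path = os.path.join(directory, md5(content).hexdigest() + '.jpg')
	if os.path.exists(file_path):
		return None  # already downloaded
	with open(file_path, 'wb') as f:
		f.write(content)
	return file_path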

005. Weibo

# URL building
from urllib.parse import urlencode
# strip HTML tags
from pyquery import PyQuery as pq
# HTTP requests
import requests
# MongoDB connection
from pymongo import MongoClient
# Crawling too fast triggers a 418 around page 36, so add a small delay
import time

# Connect to MongoDB
client = MongoClient()
# Select the database
db = client['weibo']
# Select the collection
collection = db['weibo_domo2']

max_page = 100


# Save to MongoDB
def save_to_mongo(result):
	# insert_one replaces the old insert(), which newer pymongo no longer provides
	if collection.insert_one(result):
		print("saved to mongo")


# https://m.weibo.cn/api/container/getIndex?containerid=1076032830678474&page=2
# Look for the Ajax request carrying X-Requested-With: XMLHttpRequest
# Base URL; the query string is appended with urlencode
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# https://m.weibo.cn/api/container/getIndex?type=uid&value=1005052830678474&containerid=1005051005052830678474
headers = {
	'host': 'm.weibo.cn',
	# Open the mobile site, find the request, then parse it
	# 'Referer': 'https://m.weibo.cn/p/1005052830678474',
	'Referer': 'https://m.weibo.cn/u/2202323951',
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
	'X-Requested-With': 'XMLHttpRequest',
}


def get_page(page):
	params = {
		'type': 'uid',
		'value': '2202323951',
		# 'containerid': '1076032830678474',
		'containerid': '1076032202323951',
		'page': page,
	}
	url = base_url + urlencode(params)
	print(url)
	try:
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			# response = json.dump(response.text)
			return response.json(), page
	except requests.ConnectionError as e:
		print('Error', e.args)


def parse_page(json, page: int):
	if json:
		# Only the cards under data are needed
		items = json.get('data').get('cards')
		# enumerate gives us the card index
		for index, item in enumerate(items):
			# On page 1 the card at index 1 has no mblog (it is the follow list, not a post),
			# so iterating over it blindly would raise an error; skip it
			if index == 1 and page == 1:
				continue
			else:
				item = item.get('mblog')
				weibo = {}
				# Weibo post ID
				# e.g. "id":"4349509976406880",
				weibo['ID'] = item.get('id')
				# Post text; pyquery strips the HTML tags
				weibo['text'] = pq(item.get('text')).text()
				# Client/phone used to post
				weibo['phone'] = item.get('source')
				# Post time
				weibo['time'] = item.get('edit_at')
				# Number of likes
				weibo['attitudes'] = item.get('attitudes_count')
				# Number of comments
				weibo['comments'] = item.get('comments_count')
				# Number of reposts
				weibo['reposts'] = item.get('reposts_count')
				yield weibo


if __name__ == '__main__':
	for page in range(1, max_page + 1):
		json = get_page(page)
		# get_page returns (json, page); unpack both into parse_page
		results = parse_page(*json)
		time.sleep(3)
		for result in results:
			print(result)
			save_to_mongo(result) 

Summary:

  1. Without a delay, a 418 shows up around pages 36-38 (418 I'm a teapot: the server refuses to brew coffee in a teapot). A retry sketch follows this list.

  2. The Ajax data can contain cards in the middle that are not what you want; e.g. on Weibo page 1, index 1 is the follow list, not a post.

  3. Fetching the Ajax data through the mobile site is much easier than through the PC site.

  4. mongod must be started with a dbpath (where the data is stored); the number of inserted documents can then be queried:

    e.g. mongod --dbpath="F:\MongoDB\Server\3.4\data"

    e.g. db.weibo_domo2.find().count()

  5. In the end all of 朱子奇's posts were crawled: 959 in total.
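
For point 1, instead of a fixed sleep, one option is to back off and retry whenever the server answers 418. A minimal sketch, not part of the original script:

import time

import requests


def get_json_with_backoff(url, headers=None, retries=3, base_delay=3):
	"""Fetch a URL and back off progressively when the server answers 418/429."""
	for attempt in range(retries):
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			return response.json()
		if response.status_code in (418, 429):
			# Wait longer after each refusal before trying again
			time.sleep(base_delay * (attempt + 1))
			continue
		response.raise_for_status()
	return None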

006. Maoyan Movies Top 100

https://www.cnblogs.com/shuimohei/p/10400814.html

007. Baidu Baike

https://www.cnblogs.com/shuimohei/p/10339891.html

008. Douyu live streams

'''
The Ajax requests carry many encrypted parameters that cannot be reproduced directly, so Selenium is used instead.
'''
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import unittest
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time


class douyu(unittest.TestCase):
	# The set-up method must be named setUp()
	def setUp(self):
		# self.driver = webdriver.Chrome()
		self.driver = webdriver.PhantomJS()
		self.num = 0
		self.count = 0

	# Test methods must start with 'test'
	def testDouyu(self):
		self.driver.get("https://www.douyu.com/directory/all")

		while True:
			soup = bs(self.driver.page_source, "lxml")
			# Room names, returned as a list
			names = soup.find_all("h3", {"class": "DyListCover-intro"})
			# Room popularity ("heat"), returned as a list
			numbers = soup.find_all("span", {"class": "DyListCover-hot"})
			print(names, numbers)
			for name, number in zip(names, numbers):
				self.num += 1
				result = (u"Heat: -" + number.get_text().strip() + u"-\tRoom: " + name.get_text().strip()
						  + u"-\tCount: " + str(self.num))
				print(result)
				with open('123.txt', 'a', encoding='utf-8') as f:
					f.write(result + '\n')

			# self.count += int(number.get_text().strip())

			# Break out of the loop once the "next page" button appears as disabled in the page source
			if self.driver.page_source.find("dy-Pagination-disabled dy-Pagination-next") != -1:
				break

			# The network can be slow, so add a delay; an explicit wait until the button is clickable would also work (see the sketch after this listing)
			time.sleep(1)
			# Keep clicking 'next page'
			self.driver.find_element_by_class_name("dy-Pagination-next").click()
			time.sleep(1)

	# Runs after the test finishes
	def tearDown(self):
		# Quit the PhantomJS browser
		print("Total number of rooms on the site: " + str(self.num))
		print("Total site heat: " + str(self.count))
		self.driver.quit()


if __name__ == "__main__":
	# Launch the test module
	unittest.main()

Selenium is still on the slow side, and the added delays make it even slower.
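
As the comment in the listing suggests, the fixed time.sleep() calls can be replaced with an explicit wait until the 'next page' button is clickable. A minimal sketch; the class name is the one used above and may change if Douyu updates its front end:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.douyu.com/directory/all")

# Wait up to 10 seconds for the 'next page' button to become clickable, then click it
next_button = WebDriverWait(driver, 10).until(
	EC.element_to_be_clickable((By.CLASS_NAME, "dy-Pagination-next"))
)
next_button.click()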

009. Sunshine Hotline Q&A platform (wz.sun0769.com)

1. Create the project

scrapy startproject dongguan

2. Create the spider

scrapy genspider -t crawl sun  wz.sun0769.com

3.items.py  

import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    data = scrapy.Field()
    num = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()

4.sun.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
	name = 'sun'
	allowed_domains = ['wz.sun0769.com']
	start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
	rules = (
		# Pagination links
		Rule(LinkExtractor(allow=r'page=\d+'), follow=True),
		# Detail-page links
		Rule(LinkExtractor(allow=r'id=\d+'), callback='parse_item', follow=False),
	)

	def parse_item(self, response):
		print(response.url)
		print(response)
		item = DongguanItem()
		item['title'] = response.xpath('//p[@class="focus-details"]/text()').extract_first()
		item['data'] = response.xpath('//span[@class="fl"]/text()').extract()[0][4:]
		item['num'] = response.xpath('//span[@class="fl"]/text()').extract()[2][3:]
		# normalize-space strips \r, \t and \n inside the XPath expression
		item['content'] = response.xpath('normalize-space(//div[@class="details-box"]/pre/text())').extract_first()
		item['url'] = response.url

		yield item 

5.pipelines.py

import json


class DongguanPipeline(object):

	def __init__(self):
		self.filename = open('dongguan.txt', 'wb')

	def process_item(self, item, spider):
		text = json.dumps(dict(item), ensure_ascii=False) + '\n'
		self.filename.write(text.encode('utf-8'))
		return item

	def close_spider(self, spider):
		self.filename.close()

6.settings.py

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
   'dongguan.pipelines.DongguanPipeline': 300,
}


# Log file name and level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

7. Run the spider

scrapy crawl sun

8. Run result (screenshot omitted)

010. Sina news full-site crawler by category

1. Create the project

scrapy startproject sina

 

2. Create the spider

scrapy genspider xinlang sina.com.cn

  

3.items.py 

# -*- coding: utf-8 -*-

import scrapy
import sys, importlib

importlib.reload(sys)


class SinaItem(scrapy.Item):
	# Level 1: top-level category title and URL
	parentTitle = scrapy.Field()
	parentUrls = scrapy.Field()

	# Level 2: sub-category title and URL
	subTitle = scrapy.Field()
	subUrls = scrapy.Field()

	# Local storage: path of the sub-category directory
	subFilename = scrapy.Field()

	# Level 3: article links under the sub-category
	sonUrls = scrapy.Field()

	# Scraped data: article headline and body
	head = scrapy.Field()
	content = scrapy.Field()

  

4. xinlang.py -- there are too many article layouts; the parsing is not exhaustive

# -*- coding: utf-8 -*-
import scrapy
# for creating directories
import os
from sina.items import SinaItem


class XinlangSpider(scrapy.Spider):
	name = 'xinlang'
	allowed_domains = ['sina.com.cn']
	start_urls = ['http://news.sina.com.cn/guide/']

	def parse(self, response):
		items = []
		# Use XPath to get all top-level category URLs and titles (19 of them)
		parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
		parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

		# Get all sub-category URLs and titles (299 of them)
		subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
		subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

		# Iterate over all top-level categories
		for i in range(0, len(parentTitle)):
			# Path and name of this category's directory
			parentFilename = "./Data/" + parentTitle[i]

			# Create the directory if it does not exist
			if (not os.path.exists(parentFilename)):
				os.makedirs(parentFilename)

			# Iterate over all sub-categories
			for j in range(0, len(subUrls)):
				item = SinaItem()

				# Store the parent category's title and URL
				item['parentTitle'] = parentTitle[i]
				item['parentUrls'] = parentUrls[i]

				# Check whether the sub-category URL starts with its parent URL; returns True if so (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
				if_belong = subUrls[j].startswith(item['parentUrls'])

				# If it belongs to this category, store it under the category directory
				if (if_belong):
					subFilename = parentFilename + '/' + subTitle[j]
					# Create the directory if it does not exist
					if (not os.path.exists(subFilename)):
						os.makedirs(subFilename)

					# Store the sub-category url, title and filename fields
					item['subUrls'] = subUrls[j]
					item['subTitle'] = subTitle[j]
					item['subFilename'] = subFilename

					items.append(item)

			# Send a Request for each sub-category URL; the Response, together with the meta data, goes to the second_parse callback
			for item in items:
				yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

	# Recurse into each returned sub-category URL
	def second_parse(self, response):
		# Retrieve the meta data attached to this Response
		meta_1 = response.meta['meta_1']

		# Collect every link on the sub-category page (all <a> hrefs)
		sonUrls = response.xpath('//a/@href').extract()

		items = []
		for i in range(0, len(sonUrls)):
			# Keep only links that start with the parent URL and end with .shtml, i.e. actual news articles
			if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

			# If it belongs to this category, copy the fields into a single item so they travel together
			if (if_belong):
				item = SinaItem()
				item['parentTitle'] = meta_1['parentTitle']
				item['parentUrls'] = meta_1['parentUrls']
				item['subUrls'] = meta_1['subUrls']
				item['subTitle'] = meta_1['subTitle']
				item['subFilename'] = meta_1['subFilename']
				item['sonUrls'] = sonUrls[i]
				items.append(item)

		# Send a Request for each article URL; the Response, together with the meta data, goes to the detail_parse callback
		for item in items:
			yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

	# Parse out the article headline and body
	def detail_parse(self, response):
		item = response.meta['meta_2']
		content = ""
		head = response.xpath('//h1[@class="main-title"]/text()').extract()
		content_list = response.xpath('//div[@class="article"]/p/text()').extract()
		# If this article layout did not match
		if len(content_list) < 1:
			# Try the news-centre layout, e.g. http://news.sina.com.cn/w/2004-12-20/11314575163s.shtml
			head = response.xpath('//th[@class="f24"]//h1/text()').extract()
			content_list = response.xpath('//td[@class="l17"]/font/p/text()').extract()
		if len(content_list) < 1:
			# http://news.sina.com.cn/c/2012-09-21/092225223127.shtml
			head = response.xpath('//div[@class="blk_content"]/h1/text()').extract()
			content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
		if len(content_list) < 1:
			# http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
			head = response.xpath('//h1[@id="artibodyTitle"]/text()').extract()
			content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
		if len(content_list) < 1:
			# http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
			head = response.xpath('//h1[@class="main-title"]/text()').extract()
			content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
		if len(content_list) < 1:
			# http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
			head = response.xpath('//h1[@id="artibodyTitle"]/font/text()').extract()
			content_list = response.xpath('//div[@id="artibody"]//span/text()').extract()

		if len(head) < 1:
			# Anything that slipped through the net
			head = ['error']
			content_list = [response.url]
		# Concatenate the text of the <p> tags
		for content_one in content_list:
			content += content_one

		item['head'] = head
		item['content'] = content

		yield item

  

5.pipelines.py

import json
from scrapy import signals


class SinaPipeline(object):

	def process_item(self, item, spider):
		sonUrls = item['sonUrls']

		# The file name is the middle part of the article URL with / replaced by _, saved as .txt
		filename = sonUrls[7:-6].replace('/', '_')
		filename += ".txt"

		fp = open(item['subFilename'] + '/' + filename, 'w', encoding='utf-8')
		fp.write(item['content'])
		fp.close()

		return item

 

6. settings.py 

BOT_NAME = 'sina'

SPIDER_MODULES = ['sina.spiders']
NEWSPIDER_MODULE = 'sina.spiders'


ROBOTSTXT_OBEY = False



DOWNLOAD_DELAY = 0.5

ITEM_PIPELINES = {
   'sina.pipelines.SinaPipeline': 300,
}



# Log file name and level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

  

7.main.py

Create a main.py in the project root for debugging:

from scrapy import cmdline

cmdline.execute('scrapy crawl xinlang'.split())

  

8. Run the program

Just run main.py.

 

9. Results

It crawls part of the news; the parsing is not complete.

Successful requests: 4416

Maximum depth: 2

 

  

 

