#!/usr/bin/python3
# The shebang above tells the OS to invoke the python3 interpreter under /usr/bin when this script is executed directly
# -*- coding: utf-8 -*-
"""
Request URL analysis:    https://tieba.baidu.com/f?kw=魔獸世界&ie=utf-8&pn=50
Request method analysis: GET
Request params analysis: pn increases by 50 per page; the other parameters stay fixed
Request header analysis: only a User-Agent needs to be added
"""
# Implementation outline
# 1. Build the crawler as a class (object-oriented)
# 2. The four steps of the crawl:
#    2.1 Build the URL list
#    2.2 Send requests and get responses
#    2.3 Extract data from the responses
#    2.4 Save the data
import requests


class TieBa_Spier():
    def __init__(self, max_page, kw):
        # Initialisation
        self.max_page = max_page  # maximum page number
        self.kw = kw              # name of the Tieba forum
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }

    def get_url_list(self):
        """Build the URL list"""
        # pn advances by 50 per page, so build one URL per page
        return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_page * 50, 50)]

    def get_content(self, url):
        """Send the request and return the response body"""
        response = requests.get(
            url=url,
            headers=self.headers
        )
        # print(response.text)
        return response.content

    def save_items(self, content, idx):
        """Save the response content"""
        with open('{}.html'.format(idx), 'wb') as f:
            f.write(content)
        return None

    def run(self):
        """Run the crawler"""
        # Build the url_list
        url_list = self.get_url_list()
        for idx, url in enumerate(url_list, start=1):
            # Send the request and get the response
            content = self.get_content(url)
            # Save the data, naming each file after its page index (1-based)
            items = self.save_items(content, idx)
            # test
            # print(items)


if __name__ == '__main__':
    # maximum page number, forum name
    spider = TieBa_Spier(2, "神無月")
    spider.run()
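A side note on the URL analysis above: instead of formatting the query string by hand, requests can build (and percent-encode) it from a dict via its params argument, which also takes care of the Chinese keyword. A minimal sketch; the helper name fetch_tieba_page is mine, not part of the script above:

import requests

def fetch_tieba_page(kw, pn):
    # requests percent-encodes the Chinese keyword automatically
    params = {"kw": kw, "ie": "utf-8", "pn": pn}
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get("https://tieba.baidu.com/f", params=params, headers=headers)
    return response.content

# usage: fetch_tieba_page("魔獸世界", 50) requests the second page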
002. JD.com product comments
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import re
import pandas as pd  # imported but unused in this script
"""
Request URL analysis:    https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1
Request method analysis: GET
Request params analysis: page increases by 1 per page; the other parameters stay fixed
Request header analysis: no User-Agent required
"""
# Implementation outline
# 1. Build the crawler as a class (object-oriented)
# 2. The four steps of the crawl:
#    2.1 Build the URL list
#    2.2 Send requests and get responses
#    2.3 Extract data from the responses
#    2.4 Save the data


class JD_Spier():
    def __init__(self, max_page):
        # Initialisation
        self.max_page = max_page  # maximum page number
        self.base_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv4962&productId=5089225&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"

    def get_url_list(self):
        """Build the URL list"""
        # page advances by 1 per page, so build one URL per page
        return [self.base_url.format(page) for page in range(0, self.max_page, 1)]

    def get_content(self, url):
        """Send the request and return the response text"""
        response = requests.get(url=url)
        # print(response.text)
        return response.text

    def save_items(self, content):
        """Extract data from the response content and save it"""
        with open('comment_iphone11.txt', 'a', encoding='utf-8') as f:
            pat = '"content":"(.*?)","'
            res = re.findall(pat, content)
            for index, i in enumerate(res):
                i = i.replace('\\n', '')
                # print(i)
                f.write(str(index) + ':' + i)
                f.write('\n')
            f.write('\n')
        return None

    def run(self):
        """Run the crawler"""
        # Build the url_list
        url_list = self.get_url_list()
        for index, url in enumerate(url_list):
            try:
                # Send the request and get the response
                print('Crawling page %s ...' % index)
                content = self.get_content(url)
                # Save the data
                self.save_items(content)
            except Exception:
                print('Problem while crawling page ' + str(index))
                continue


if __name__ == '__main__':
    # maximum page number
    spider = JD_Spier(99)
    spider.run()
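Since the JD response is JSONP (the JSON payload is wrapped in the fetchJSON_comment98vv4962(...) callback named in the URL), an alternative to the regex above is to strip the wrapper and parse real JSON. A minimal sketch under that assumption; the helper name parse_jsonp_comments is mine, and the 'comments'/'content' field names are inferred from the regex pattern used above:

import json
import re

def parse_jsonp_comments(text):
    # Strip the "fetchJSON_...(" prefix and the trailing ");" to get plain JSON
    body = re.sub(r'^[^(]*\(', '', text.strip()).rstrip(');')
    data = json.loads(body)
    # Each comment object is assumed to carry a "content" field, like the regex extracted above
    return [c.get('content', '').replace('\n', ' ') for c in data.get('comments', [])]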
As a bonus, build a word cloud from the comments
from os import path
from scipy.misc import imread  # removed in newer SciPy (>= 1.2); imageio.imread is a common replacement
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud

# Text to segment
f = open('comment_iphone11.txt', 'r', encoding='utf-8')
text = f.read()
cut_text = ' '.join(jieba.lcut(text))
print(cut_text)
# Shape mask for the word cloud
color_mask = imread("201910051325286.jpg")
cloud = WordCloud(
    # The font file must be set; keeping it next to the script is easiest
    font_path='FZMWFont.ttf',
    background_color='white',
    mask=color_mask,
    max_words=200,
    max_font_size=5000
)
word_cloud = cloud.generate(cut_text)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
Result screenshot

003. Douban Movie Top 250 (three parsing approaches)
# Goal: crawl the movie info from the Douban Top 250 chart
# Fields extracted: rank, title, poster image, rating, number of raters, one-line quote
# Parse the pages with each of the methods covered so far: 1) regular expressions, 2) BeautifulSoup, 3) XPath
import requests
import re                      # regular expressions
import json
from bs4 import BeautifulSoup  # BS4
from lxml import etree         # XPath
# Process pool
from multiprocessing import Pool
import multiprocessing


def get_one_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None


def zhengze_parse(html):
    pattern = re.compile(
        '<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)".*?property="v:average">(.*?)</span>.*?<span>(.*?)</span>.*?'
        + 'class="inq">(.*?)</span>', re.S)
    items = re.findall(pattern, html)
    # print(items)
    # Film #125 has no one-line quote, so the regex simply never matches it.
    # The fix is easy (extract the quote separately); I leave it as-is here.
    for item in items:
        yield {
            'index': item[0],
            'title': item[1],
            'image': item[2],
            'score': item[3],
            'people': item[4].strip()[:-2],
            'Evaluation': item[5]
        }


def soup_parse(html):
    soup = BeautifulSoup(html, 'lxml')
    for data in soup.find_all('div', class_='item'):
        index = data.em.text
        image = data.img['src']
        title = data.img['alt']
        people = data.find_all('span')[-2].text[:-2]
        score = data.find('span', class_='rating_num').text
        # Film #125 has no one-line quote; use an empty string instead
        if data.find('span', class_='inq'):
            Evaluation = data.find('span', class_='inq').text
        else:
            Evaluation = ''
        yield {
            'index': index,
            'image': image,
            'title': title,
            'people': people,
            'score': score,
            'Evaluation': Evaluation,
        }


def xpath_parse(html):
    html = etree.HTML(html)
    for data in html.xpath('//ol[@class="grid_view"]/li'):
        index = data.xpath('.//em/text()')[0]
        image = data.xpath('.//a/img/@src')[0]
        title = data.xpath('.//a/img/@alt')[0]
        people = data.xpath('.//div[@class="star"]/span[4]/text()')[0][:-2]
        score = data.xpath('.//div[@class="star"]/span[2]/text()')[0]
        # Film #125 has no one-line quote; use an empty string instead
        if data.xpath('.//p[@class="quote"]/span/text()'):
            Evaluation = data.xpath('.//p[@class="quote"]/span/text()')[0]
        else:
            Evaluation = ''
        yield {
            'index': index,
            'image': image,
            'title': title,
            'people': people,
            'score': score,
            'Evaluation': Evaluation,
        }


def write_to_file(content, flag):
    with open('豆瓣電影TOP250(' + str(flag) + ').txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def search(Num):
    url = 'https://movie.douban.com/top250?start=' + str(Num)
    html = get_one_page(url)
    for item in zhengze_parse(html):
        write_to_file(item, '正則表達式')
    for item in soup_parse(html):
        write_to_file(item, 'BS4')
    for item in xpath_parse(html):
        write_to_file(item, 'xpath')
    page = Num // 25 + 1
    print("Crawling page " + str(page))


def main():
    pool = Pool()
    pool.map(search, [i * 25 for i in range(10)])
    # # Without the process pool: just loop over the page offsets
    # for i in range(0, 10):
    #     Num = i * 25
    #     search(Num)
    print("Crawl finished")


if __name__ == '__main__':
    # Needed on Windows after packaging to exe, otherwise multiprocessing errors out
    multiprocessing.freeze_support()
    # Entry point
    main()
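As noted in zhengze_parse, a film without a one-line quote (e.g. #125, the last item of its page) never matches the pattern. A minimal sketch of the "extract the quote separately" fix mentioned there: split the page into per-film blocks first, then treat the quote as optional. The block-splitting regex assumes each film sits in its own <div class="item">…</li> block, which is what the BS4 and XPath parsers above already rely on; the names ITEM_PAT, FIELDS_PAT, QUOTE_PAT and zhengze_parse_v2 are mine:

import re

ITEM_PAT = re.compile('<div class="item">.*?</li>', re.S)  # one block per film
FIELDS_PAT = re.compile('<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)".*?'
                        'property="v:average">(.*?)</span>.*?<span>(.*?)</span>', re.S)
QUOTE_PAT = re.compile('class="inq">(.*?)</span>', re.S)

def zhengze_parse_v2(html):
    # Split the page into per-film blocks first, so a missing quote just means "no match in this block"
    for block in ITEM_PAT.findall(html):
        fields = FIELDS_PAT.search(block)
        if not fields:
            continue
        quote = QUOTE_PAT.search(block)
        yield {
            'index': fields.group(1),
            'title': fields.group(2),
            'image': fields.group(3),
            'score': fields.group(4),
            'people': fields.group(5).strip()[:-2],
            'Evaluation': quote.group(1) if quote else '',
        }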
Package it into a standalone .exe:
pyinstaller -F 豆瓣電影排行.py
Run result screenshot

004. Toutiao (街拍 street-snap photos)
# URL building
from urllib.parse import urlencode
# HTTP requests
import requests
# File operations
import os
# md5: hash the image bytes so file names never repeat
from hashlib import md5
# Process pool
from multiprocessing.pool import Pool
# Delays
import time

base_url = 'https://www.toutiao.com/api/search/content/?'
headers = {
    'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


def get_page(offset):
    # Example: https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis
    # Build params from that link; only offset changes between pages
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = base_url + urlencode(params)
    # Return the response as JSON
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def get_images(json):
    if json:
        items = json.get('data')
        for item in items:
            # Title
            title = item.get('title')
            # Image list
            images = item.get('image_list')
            for image in images:
                # Yield one dict per image: image URL + title
                yield {
                    'image': image.get('url'),
                    'title': title,
                }


def save_image(item):
    # Create the target folder if it does not exist yet
    dirs = 'F:\\domo'
    if not os.path.exists(dirs):
        os.mkdir("F:\\domo")
    # Change the current working directory
    os.chdir('F:\\domo')
    # Create a sub-folder named after the item's title if it does not exist yet
    if not os.path.exists(item.get('title')):
        os.mkdir(item.get('title'))
    try:
        # Request the image URL
        response = requests.get(item.get('image'))
        if response.status_code == 200:
            # Build the file name from the MD5 of the image bytes
            file_path = '{0}\\{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
            # Write the image in binary mode if it has not been saved yet
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("Already downloaded this file:", file_path)
    except Exception:
        print("Image download failed")


GROUP_START = 1
GROUP_END = 20


def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()
    # Build the offset list: 20-400 (20 pages)
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    # Run main() across the offsets with multiple processes
    pool.map(main, groups)
    # Close the pool
    pool.close()
    # Wait for the remaining processes to finish
    pool.join()
After about 10 pages the API stops returning data; a User-Agent pool is needed (see the sketch after the summary).
Summary:
1. Basic os-module operations:
   os.chdir('path')        -- change the current working directory to path
   os.path.exists('name')  -- whether the file/directory exists under the current directory; returns True or False
   os.mkdir()              -- create a directory
2. Naming files by their MD5 hash is an effective way to avoid saving duplicates
3. A process pool greatly reduces the total crawl time
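A minimal sketch of the User-Agent pool mentioned above, assuming a hand-maintained list of UA strings; the names UA_POOL and get_with_random_ua are mine, not part of the original script:

import random
import requests

UA_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
]

def get_with_random_ua(url, **kwargs):
    # Pick a different User-Agent for every request so the traffic looks less like a single client
    headers = kwargs.pop('headers', {}).copy()
    headers['User-Agent'] = random.choice(UA_POOL)
    return requests.get(url, headers=headers, **kwargs)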
005. Weibo
# URL building
from urllib.parse import urlencode
# Strip HTML tags from the post text
from pyquery import PyQuery as pq
# HTTP requests
import requests
# MongoDB connection
from pymongo import MongoClient
# Crawling too fast triggers 418 at around page 36, so add a delay
import time

# Connect
client = MongoClient()
# Select the database
db = client['weibo']
# Select the collection
collection = db['weibo_domo2']
max_page = 100


# Store into MongoDB
def save_to_mongo(result):
    # Note: collection.insert() was removed in PyMongo 4; insert_one() is the modern equivalent
    if collection.insert(result):
        print("saved to mongo")


# https://m.weibo.cn/api/container/getIndex?containerid=1076032830678474&page=2
# Look for the Ajax request carrying X-Requested-With: XMLHttpRequest
# Base url; the query string is appended with urlencode
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# https://m.weibo.cn/api/container/getIndex?type=uid&value=1005052830678474&containerid=1005051005052830678474
headers = {
    'host': 'm.weibo.cn',
    # Open the profile on the mobile site, grab the link, then parse it
    # 'Referer': 'https://m.weibo.cn/p/1005052830678474',
    'Referer': 'https://m.weibo.cn/u/2202323951',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


def get_page(page):
    params = {
        'type': 'uid',
        'value': '2202323951',
        # 'containerid': '1076032830678474',
        'containerid': '1076032202323951',
        'page': page,
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response = json.dump(response.text)
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json, page: int):
    if json:
        # Only the cards under data are needed
        items = json.get('data').get('cards')
        # index is the card's position within the page
        for index, item in enumerate(items):
            # On page 1 the card at index == 1 has no mblog (it is the "following" card and is useless here),
            # so looping over it blindly raises an error -- skip it
            if index == 1 and page == 1:
                continue
            else:
                item = item.get('mblog')
                weibo = {}
                # Weibo ID, e.g. "id":"4349509976406880"
                weibo['ID'] = item.get('id')
                # Post text; pyquery strips the HTML tags
                weibo['text'] = pq(item.get('text')).text()
                # Phone model used to post
                weibo['phone'] = item.get('source')
                # Post time
                weibo['time'] = item.get('edit_at')
                # Number of likes (attitudes)
                weibo['attitudes'] = item.get('attitudes_count')
                # Number of comments
                weibo['comments'] = item.get('comments_count')
                # Number of reposts
                weibo['reposts'] = item.get('reposts_count')
                yield weibo


if __name__ == '__main__':
    for page in range(1, max_page + 1):
        json = get_page(page)
        # *json unpacks the (json, page) tuple returned by get_page
        results = parse_page(*json)
        time.sleep(3)
        for result in results:
            print(result)
            save_to_mongo(result)
Summary:
1. Without a delay, pages 36-38 start returning 418 (418 I'm a teapot: the server refuses to brew coffee in a teapot). A back-off-and-retry sketch follows this summary.
2. The Ajax data may contain cards you do not want in the middle; e.g. on Weibo page 1, index 1 is the "following" list, not a post.
3. Fetching the Ajax data through the mobile site is much easier than through the PC site.
4. mongod must be started with a dbpath (where the data is stored); the number of inserted documents can then be checked from the shell.
   e.g. mongod --dbpath="F:\MongoDB\Server\3.4\data"
   e.g. db.weibo_domo2.find().count()
5. In the end all of 朱子奇's posts were crawled: 959 in total.
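A minimal back-off-and-retry sketch for the 418 responses mentioned in point 1; the helper name get_page_with_retry and the delay values are my own choices, not from the original script:

import time
import requests

def get_page_with_retry(url, headers, max_retries=3, base_delay=5):
    # Retry with an increasing pause whenever the server answers 418
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 418:
            wait = base_delay * (attempt + 1)
            print('Got 418, sleeping %s seconds before retrying...' % wait)
            time.sleep(wait)
            continue
        if response.status_code == 200:
            return response.json()
        return None
    return None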
006. Maoyan Movies Top 100
https://www.cnblogs.com/shuimohei/p/10400814.html
007. Baidu Baike
https://www.cnblogs.com/shuimohei/p/10339891.html
008. Douyu live streams
'''
The Ajax requests carry many encrypted parameters that cannot be reproduced directly, so we fall back to Selenium
'''
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import unittest
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time


class douyu(unittest.TestCase):
    # The set-up method must be called setUp()
    def setUp(self):
        # self.driver = webdriver.Chrome()
        # Note: PhantomJS support was dropped in newer Selenium releases; headless Chrome is the usual replacement
        self.driver = webdriver.PhantomJS()
        self.num = 0
        self.count = 0

    # Test methods must start with "test"
    def testDouyu(self):
        self.driver.get("https://www.douyu.com/directory/all")
        while True:
            soup = bs(self.driver.page_source, "lxml")
            # Room names, as a list
            names = soup.find_all("h3", {"class": "DyListCover-intro"})
            # Room popularity ("hot") values, as a list
            numbers = soup.find_all("span", {"class": "DyListCover-hot"})
            print(names, numbers)
            for name, number in zip(names, numbers):
                self.num += 1
                result = (u"Popularity: -" + number.get_text().strip() +
                          u"-\tRoom name: " + name.get_text().strip() +
                          u"-\tRooms counted: " + str(self.num))
                print(result)
                with open('123.txt', 'a', encoding='utf-8') as f:
                    f.write(result)
                # self.count += int(number.get_text().strip())
            # If the "next page" button is rendered as disabled in the page source, stop
            if self.driver.page_source.find("dy-Pagination-disabled dy-Pagination-next") != -1:
                break
            # The network can be slow; add a delay (or better, wait until the button is clickable)
            time.sleep(1)
            # Keep clicking "next page"
            self.driver.find_element_by_class_name("dy-Pagination-next").click()
            time.sleep(1)

    # Runs after the test finishes
    def tearDown(self):
        # Quit the PhantomJS browser
        print("Rooms currently streaming on the site: " + str(self.num))
        print("Total site popularity: " + str(self.count))
        self.driver.quit()


if __name__ == "__main__":
    # Start the unittest runner
    unittest.main()
Selenium is still on the slow side, and the added delays make it even slower; see the explicit-wait sketch below.
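A minimal sketch of the "wait until the button is clickable" idea from the comment in testDouyu, using Selenium's explicit waits instead of fixed time.sleep() calls (written against the same Selenium 3 style API this script uses; the 10-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(driver, timeout=10):
    # Block until the "next page" button is clickable (or raise TimeoutException), then click it
    next_btn = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "dy-Pagination-next"))
    )
    next_btn.click()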
009. Sunshine Hotline petition platform (陽光熱線問政平台)
1. Create the project
scrapy startproject dongguan
2. Create the spider
scrapy genspider -t crawl sun wz.sun0769.com
3. items.py
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    data = scrapy.Field()
    num = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
4. sun.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
    rules = (
        # Pagination links
        Rule(LinkExtractor(allow=r'page=\d+'), follow=True),
        # Detail-page links
        Rule(LinkExtractor(allow=r'id=\d+'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response.url)
        print(response)
        item = DongguanItem()
        item['title'] = response.xpath('//p[@class="focus-details"]/text()').extract_first()
        item['data'] = response.xpath('//span[@class="fl"]/text()').extract()[0][4:]
        item['num'] = response.xpath('//span[@class="fl"]/text()').extract()[2][3:]
        # normalize-space() strips \r\t\n inside the XPath itself
        item['content'] = response.xpath('normalize-space(//div[@class="details-box"]/pre/text())').extract_first()
        item['url'] = response.url
        yield item
5. pipelines.py
import json


class DongguanPipeline(object):
    def __init__(self):
        self.filename = open('dongguan.txt', 'wb')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.filename.close()
6. settings.py
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}
# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
7. Run the spider
scrapy crawl sun
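An aside: since the pipeline above only dumps each item as one JSON line, Scrapy's built-in feed export can produce similar output without a custom pipeline. The -o flag writes items to the given file, and the .jl extension selects JSON-lines format:
scrapy crawl sun -o dongguan.jl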
8. Run result screenshot

010. Sina news: whole-site crawl by category
1. Create the project
scrapy startproject sina
2. Create the spider
scrapy genspider xinlang sina.com.cn
3. items.py
# -*- coding: utf-8 -*-
import scrapy
import sys, importlib
importlib.reload(sys)


class SinaItem(scrapy.Item):
    # Level 1: title and url of each top-level category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()
    # Level 2: title and url of each sub-category
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()
    # Local storage: directory path for the sub-category
    subFilename = scrapy.Field()
    # Level 3: article links under each sub-category
    sonUrls = scrapy.Field()
    # Scraped data: article headline and body
    head = scrapy.Field()
    content = scrapy.Field()
4. xinlang.py -- there are too many different article layouts; the parsing below is not exhaustive
# -*- coding: utf-8 -*-
import scrapy
# For creating directories
import os
from sina.items import SinaItem


class XinlangSpider(scrapy.Spider):
    name = 'xinlang'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://news.sina.com.cn/guide/']

    def parse(self, response):
        items = []
        # Use XPath to collect the URLs and titles of all top-level categories (19 of them)
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
        # Collect the URLs and titles of all sub-categories (299 of them)
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()
        # Loop over the top-level categories
        for i in range(0, len(parentTitle)):
            # Directory path and name for this top-level category
            parentFilename = "./Data/" + parentTitle[i]
            # Create the directory if it does not exist
            if (not os.path.exists(parentFilename)):
                os.makedirs(parentFilename)
            # Loop over the sub-categories
            for j in range(0, len(subUrls)):
                item = SinaItem()
                # Store the top-level title and url
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]
                # Check whether the sub-category url starts with its top-level url,
                # e.g. sports.sina.com.cn and sports.sina.com.cn/nba
                if_belong = subUrls[j].startswith(item['parentUrls'])
                # If it belongs to this top-level category, store it under that category's directory
                if (if_belong):
                    subFilename = parentFilename + '/' + subTitle[j]
                    # Create the directory if it does not exist
                    if (not os.path.exists(subFilename)):
                        os.makedirs(subFilename)
                    # Store the sub-category url, title and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename
                    items.append(item)
        # Send a Request for each sub-category url; the Response, together with its meta data,
        # is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # Recursively request the sub-category urls that come back
    def second_parse(self, response):
        # Pull the meta data out of this Response
        meta_1 = response.meta['meta_1']
        # Collect every link inside the sub-category page (hrefs of <a> tags only)
        sonUrls = response.xpath('//a/@href').extract()
        items = []
        for i in range(0, len(sonUrls)):
            # Check that the link starts with the top-level url and ends with .shtml,
            # which means it is an actual news article
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])
            # If it belongs to this top-level category, copy the fields into one item for passing along
            if (if_belong):
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)
        # Send a Request for each article url; the Response, together with its meta data,
        # is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # Parse the article page: extract headline and body
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@class="main-title"]/text()').extract()
        content_list = response.xpath('//div[@class="article"]/p/text()').extract()
        # Fall back through older article layouts when nothing matched
        if len(content_list) < 1:
            # News-centre layout, e.g. http://news.sina.com.cn/w/2004-12-20/11314575163s.shtml
            head = response.xpath('//th[@class="f24"]//h1/text()').extract()
            content_list = response.xpath('//td[@class="l17"]/font/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2012-09-21/092225223127.shtml
            head = response.xpath('//div[@class="blk_content"]/h1/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
            head = response.xpath('//h1[@id="artibodyTitle"]/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
            head = response.xpath('//h1[@class="main-title"]/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()
        if len(content_list) < 1:
            # http://news.sina.com.cn/c/2014-09-24/145630907684.shtml
            head = response.xpath('//h1[@id="artibodyTitle"]/font/text()').extract()
            content_list = response.xpath('//div[@id="artibody"]//span/text()').extract()
        if len(head) < 1:
            # Anything that still slipped through the net
            head = ['error']
            content_list = [response.url]
        # Join the <p> text fragments into one body string
        for content_one in content_list:
            content += content_one
        item['head'] = head
        item['content'] = content
        yield item
5. pipelines.py
import json
from scrapy import signals


class SinaPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']
        # File name = middle part of the article url, with / replaced by _, saved as .txt
        filename = sonUrls[7:-6].replace('/', '_')
        filename += ".txt"
        fp = open(item['subFilename'] + '/' + filename, 'w', encoding='utf-8')
        fp.write(item['content'])
        fp.close()
        return item
6. settings.py
BOT_NAME = 'sina'
SPIDER_MODULES = ['sina.spiders']
NEWSPIDER_MODULE = 'sina.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
ITEM_PIPELINES = {
    'sina.pipelines.SinaPipeline': 300,
}
# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
7. main.py
Create a main.py in the project root for debugging:
from scrapy import cmdline
cmdline.execute('scrapy crawl xinlang'.split())
8. Run the program
Just run main.py
9. Result
It crawls part of the news; the parsing is not complete yet.
Successful requests: 4416
Maximum depth: 2
