Scrapy爬取多層網頁結構數據

本文轉載自查看原文 2020-04-28 10:25 694 爬蟲

Scrapy爬取多層網頁結構數據:
- items.py
- SunmoiveSpider.py
- pipelines.py

Scrapy爬取多層網頁結構數據:

items.py

from scrapy import Field,Item
class SunmoiveItem(Item):
    """One scraped record: a movie download link plus the category it came from.

    The fields are filled in progressively by the spider's three callbacks
    (home page, category listing, movie page).
    """

    # Category level: set while parsing the home page.
    cate_url = Field()
    cate_name = Field()

    # Listing level: URL of one paginated listing page of the category.
    cate_url_list = Field()

    # Movie level: detail page URL, title and one download link.
    moive_url = Field()
    moive_name = Field()
    moive_source = Field()

SunmoiveSpider.py:

import sys
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, and
# reloading the module brings it back so it can be called below.
reload(sys)
# Adds the parent of the current working directory to the import path;
# presumably so that `from items import SunmoiveItem` below resolves when the
# spider is started from the spiders/ directory -- verify against the layout.
sys.path.append('..')
# Switches the implicit str <-> unicode codec from ASCII to UTF-8 so that
# mixing byte strings and unicode objects does not raise on non-ASCII text.
sys.setdefaultencoding('utf-8')
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from items import SunmoiveItem
from bs4 import BeautifulSoup as bs
import requests
import re

# Browser-like request headers sent with the direct `requests.get` calls made
# by the spider callbacks (Scrapy's own requests use USER_AGENT from settings).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection': 'keep-alive'
}

class SunmoiveSpider(CrawlSpider):
    """Crawl ygdy8.com in three levels: home page, category listings, movies.

    Data gathered at one level is handed to the next through ``Request.meta``
    so that every yielded item carries the category fields as well as the
    movie fields.

    NOTE(review): no ``rules`` are defined, so the overridden ``parse`` is
    what drives the crawl. Scrapy's documentation warns that ``CrawlSpider``
    uses ``parse`` internally for rule handling; a plain ``Spider`` base
    class would express this more safely.
    NOTE(review): the callbacks download pages a second time with blocking
    ``requests.get`` calls although Scrapy has already fetched them; this
    stalls Scrapy's event loop and doubles the traffic. Kept as-is because
    replacing it changes how the page encoding is handled.
    """

    name = 'sunmoivespider'
    start_urls = ['http://www.ygdy8.com']
    allowed_domains = ["ygdy8.com"]

    def parse(self, response):
        """Parse the home page and request every category page.

        Yields one ``Request`` per navigation entry. The partially filled
        item (``cate_url``, ``cate_name``) travels in ``meta['item_1']``.
        """
        selector = Selector(response)
        # Only the first 11 <li> entries of the navigation list are used.
        for info in selector.xpath('//div[@class="contain"]/ul/li[position()<12]'):
            hrefs = info.xpath('a/@href').extract()
            names = info.xpath('a/text()').extract()
            if not hrefs or not names:
                # Entry without a link or a label: nothing to follow.
                continue
            # A fresh item per category; it behaves like a dict.
            item = SunmoiveItem()
            # urljoin copes with relative and absolute hrefs and with a
            # response URL that does or does not end in "/".
            item['cate_url'] = response.urljoin(hrefs[0])
            item['cate_name'] = names[0].encode('utf8')
            yield Request(url=item['cate_url'], meta={'item_1': item},
                          callback=self.parse_item)

    def parse_item(self, response):
        """Parse a category page and request each of its listing pages.

        Reads the total page count and the ``list_<id>_`` file-name prefix
        from the pager, builds every ``list_<id>_<n>.html`` URL and yields a
        ``Request`` for each one that answers with HTTP 200. The item passed
        on in ``meta['item_2']`` holds ``cate_url``, ``cate_name`` and
        ``cate_url_list``.
        """
        # Category data collected by parse().
        item_1 = response.meta['item_1']
        res = requests.get(response.url, headers=headers)
        res.encoding = 'gb2312'
        html = res.text
        # The pager text is matched on the decoded (unicode) page. GB2312 only
        # contains Simplified characters, so the pattern has to use the
        # Simplified spelling: U+5171 <pages> U+9875 "/" ... U+6761 U+8BB0
        # U+5F55 ("<n> pages in total / <m> records").
        page_counts = re.findall(u'\u5171(.*?)\u9875/.*?\u6761\u8bb0\u5f55', html)
        # File-name prefix of the listing pages, e.g. "list_7_".
        list_prefixes = re.findall(r"<option value='(list_.*?_)", html)
        if not page_counts or not list_prefixes:
            self.logger.debug('no pager found on %s', response.url)
            return
        # e.g. http://www.ygdy8.com/html/tv/hytv/index.html
        #   -> http://www.ygdy8.com/html/tv/hytv/list_7_
        detail_url = response.url.rsplit('/', 1)[0] + '/' + str(list_prefixes[0])
        for page in range(1, int(page_counts[0]) + 1):
            cate_url_list = detail_url + str(page) + '.html'
            if requests.get(cate_url_list, headers=headers).status_code != 200:
                continue
            # New item per listing page, seeded with the category fields.
            item = SunmoiveItem()
            item['cate_url'] = item_1['cate_url']
            item['cate_name'] = item_1['cate_name']
            item['cate_url_list'] = cate_url_list
            yield Request(url=cate_url_list, meta={'item_2': item},
                          callback=self.parse_detail)

    def parse_detail(self, response):
        """Parse one listing page and yield an item per movie download link.

        Every link inside the first ``.co_content8 ul`` element is treated as
        a movie; its detail page is fetched and one item is yielded for each
        download anchor found under ``#Zoom``.
        """
        # Category and listing data collected by the two previous callbacks.
        base_item = response.meta['item_2']
        res = requests.get(response.url)
        res.encoding = 'gb2312'
        listings = bs(res.text, 'html.parser').select('.co_content8 ul')
        if not listings:
            self.logger.debug('no movie list found on %s', response.url)
            return
        links = listings[0].select('a')
        self.logger.debug('%s: %d movie links', response.url, len(links))
        for link in links:
            href = link.get('href')
            if not href:
                continue
            # Encode exactly once; the stored value is a UTF-8 byte string.
            moive_name = link.text.encode('utf-8')
            # Strip a leading "/" so a root-relative href does not produce
            # "http://www.ygdy8.com//...".
            moive_url = "http://www.ygdy8.com/" + href.lstrip('/')
            detail = requests.get(moive_url)
            detail.encoding = 'gb2312'
            detail_soup = bs(detail.text, 'html.parser')
            for source in detail_soup.select('#Zoom span tbody tr td a'):
                source_href = source.get('href')
                if not source_href:
                    continue
                # Copy before filling in: yielding the same object repeatedly
                # would let later iterations overwrite earlier results.
                item = base_item.copy()
                item['moive_source'] = source_href
                item['moive_url'] = moive_url
                item['moive_name'] = moive_name
                self.logger.debug('%s %s %s', link.text, moive_url, source_href)
                yield item

pipelines.py:

（先在數據庫建表）:
  import MySQLdb
def dbHandle():
    """Open and return a new MySQL connection.

    The host, user and password literals are placeholders to be replaced
    with real credentials. No database is selected here; callers issue
    their own ``USE`` statement. ``use_unicode=False`` asks the driver to
    return text as byte strings rather than unicode objects.
    """
    connect_kwargs = dict(
        host="數據庫ip",
        user="數據庫登陸用戶",
        passwd="密碼",
        charset="utf8",
        use_unicode=False,
    )
    return MySQLdb.connect(**connect_kwargs)

class SunmoivePipeline(object):
    """Item pipeline that stores every scraped record in MySQL."""

    def process_item(self, item, spider):
        """Insert ``item`` as one row of the ``sunmoive`` table in ``local_db``.

        The row is written best-effort: a failing insert is reported on
        stdout and rolled back, and the item is still passed on to later
        pipelines.

        Args:
            item: the scraped item; its six fields are read by key.
            spider: the spider that produced the item (unused).

        Returns:
            The unmodified ``item``.
        """
        sql = "INSERT INTO sunmoive VALUES(%s,%s,%s,%s,%s,%s)"
        dbObject = dbHandle()
        try:
            cursor = dbObject.cursor()
            cursor.execute("USE local_db")
            try:
                # Column order must match the table definition.
                row = (
                    item['cate_name'],
                    item['cate_url'],
                    item['cate_url_list'],
                    item['moive_url'],
                    item['moive_name'],
                    item['moive_source'],
                )
                cursor.execute(sql, row)
                dbObject.commit()
            except Exception as e:
                # Exception rather than BaseException, so Ctrl-C and
                # SystemExit still stop the crawl.
                print("錯誤在這里>>>>", e, "<<<<<<錯誤在這里")
                dbObject.rollback()
        finally:
            # A connection is opened per item, so it must be released here,
            # otherwise every processed item leaks one connection.
            dbObject.close()
        return item
    
    
<4>settings.py
#coding:utf-8
# Scrapy project settings for the "sunmoive" crawler.

# Project name and the package that holds (and receives newly generated) spiders.
BOT_NAME = "sunmoive"
NEWSPIDER_MODULE = "sunmoive.spiders"
SPIDER_MODULES = ["sunmoive.spiders"]

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# User-Agent header sent with the requests Scrapy itself issues.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

# Enabled item pipelines mapped to their order value.
ITEM_PIPELINES = {
    "sunmoive.pipelines.SunmoivePipeline": 300,
}


<5>main.py（在 PyCharm 里啟動爬蟲，和 cmd 命令功能一樣）
#coding:utf-8
# Launcher script: runs the same command as typing
# "scrapy crawl sunmoivespider" in a shell, so the spider can be started
# (and debugged) from an IDE.
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "sunmoivespider"])

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Scrapy爬取多層級網頁內容的方式 Scrapy 實現爬取多頁數據 + 多層url數據爬取 Scrapy 實現爬取多頁數據多層url數據爬取 meta傳遞鏈接為相對路徑 scrapy基礎之數據爬取 scrapy圖片數據爬取用scrapy爬取京東的數據 Scrapy全站數據爬取 python3下scrapy爬蟲(第八卷:循環爬取網頁多頁數據）如何提升scrapy爬取數據的效率 scrapy爬取的數據異步存儲至MySQL