Python Scrapy Tutorial (Part 1): Scraping data with XPath and saving it to JSON, CSV, and MySQL


1. Install the package

pip install scrapy

2. In a terminal, cd to the directory where you want the project and create it with: scrapy startproject <project name>

The spider name must not be the same as the project name, and the spiders directory must not contain two spiders with the same name.

scrapy startproject maoyan

cd maoyan

scrapy genspider maoyan_spider maoyan.com

After creation, the directory layout looks roughly like this:

|-ProjectName                # project folder
  |-ProjectName              # project package
    |-items.py               # defines the data structures (Items)
    |-middlewares.py         # middlewares
    |-pipelines.py           # item pipelines (data processing / storage)
    |-settings.py            # global configuration
    |-spiders
      |-__init__.py
      |-maoyan_spider.py     # the spider file
  |-scrapy.cfg               # basic project configuration

3. Configure settings.py as follows:

# Project name
BOT_NAME = 'maoyan'

SPIDER_MODULES = ['maoyan.spiders']
NEWSPIDER_MODULE = 'maoyan.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'

# ROBOTSTXT_OBEY: whether to respect robots.txt; the default is True and needs to be set to False here
ROBOTSTXT_OBEY = False

# CONCURRENT_REQUESTS: maximum number of concurrent requests Scrapy will perform
#CONCURRENT_REQUESTS = 32

# Download delay in seconds; throttles how frequently pages are fetched
DOWNLOAD_DELAY = 3

# DEFAULT_REQUEST_HEADERS: default request headers. The USER_AGENT above is also sent as a header;
# adjust these to suit the site you are scraping.
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# ITEM_PIPELINES: item pipelines. 300 is the priority; the lower the number, the earlier the pipeline runs.
ITEM_PIPELINES = {
   'maoyan.pipelines.MaoyanPipeline': 300,
}
# For example, if pipelines.py defines two pipelines (one that processes the scraped data, one that saves to
# the database), adjust their priorities so the database write happens in the order you want.

# Export encoding; without this, JSON output will come out garbled
FEED_EXPORT_ENCODING = 'utf-8'
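To make the priority remark concrete, here is a minimal sketch (the SaveToDBPipeline name is invented for illustration and does not exist in this project); the pipeline with the lower number receives each item first:

# settings.py (illustrative only)
ITEM_PIPELINES = {
    'maoyan.pipelines.SaveToDBPipeline': 300,  # hypothetical pipeline that writes items to the database, runs first
    'maoyan.pipelines.MaoyanPipeline': 400,    # runs second
}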
A USER_AGENT string can be copied from the request headers shown in your browser's developer tools:

4. Define the fields to scrape in items.py

 

import scrapy

class MaoyanItem(scrapy.Item):
    move_name=scrapy.Field()
    peaple_name=scrapy.Field()
    move_time = scrapy.Field()
    describe= scrapy.Field()

5. Create a maoyan_spider.py file in the spiders directory (genspider above already generated it).

6. Write the spider code in maoyan_spider.py.

Copy the XPath of a whole row (the block containing the film name, cast and release date), then copy the XPath of the next row and compare the two; the pattern is easy to spot. Loop over the rows and then narrow down to each field inside the row. For the fine-grained part, copying the XPaths of two different film names reveals the difference in the same way.

These are two row-level XPaths: //*[@id="app"]/div/div/div[1]/dl/dd[1]/div/div/div[1] and //*[@id="app"]/div/div/div[1]/dl/dd[2]/div/div/div[1]. Only the dd[] index varies, so everything before it is fixed; replacing dd[] with // selects every row on the page: move_list = //*[@id='app']/div/div/div[1]/dl//div/div/div[1]

The film name sits at //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[1]/a and the release date at //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[3]. Inside the for loop, the shared prefix (the row already selected by move_list) is written as ".", so the relative XPaths become ".//p[1]/a" and ".//p[3]".

Of course, you can also read the page's HTML and write the XPaths by hand; there is no single correct way. They can be verified interactively as shown below.
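A quick sketch of checking the XPaths in Scrapy's interactive shell before writing the spider (Maoyan may reject requests that lack the USER_AGENT configured above, so results can vary):

scrapy shell "https://www.maoyan.com/board/4"
>>> move_list = response.xpath("//*[@id='app']/div/div/div[1]/dl//div/div/div[1]")
>>> len(move_list)                                     # should equal the number of films on the page
>>> move_list[0].xpath(".//p[1]/a").extract_first()    # raw <a> element of the first film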

aaa=d.xpath(".//p[1]/a").extract_first() # this extracts the first matching value

Once the data is extracted, the rest is string processing.

This example does not crawl further pages. For most sites, pagination works in one of two common ways: the URL carries a variable such as 10 or 20 that changes per page, or a "next page" button fires a request whose parameters you can inspect in the browser's network tab; a sketch follows below.
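For reference, a minimal sketch of the usual pattern, added at the end of parse(); the link text used in the selector is only an assumption for illustration, so check the real page first:

# hypothetical pagination: follow a "next page" link if the page exposes one
next_href = response.xpath("//a[text()='下一页']/@href").extract_first()
if next_href:
    yield response.follow(next_href, callback=self.parse)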

 

import re

import scrapy
from maoyan.items import MaoyanItem

class MaoyanSpiderSpider(scrapy.Spider):
    name = 'maoyan_spider'  # spider name (used by scrapy crawl)
    allowed_domains = ['maoyan.com']  # allowed domains
    # entry URL
    start_urls = ['https://www.maoyan.com/board/4?timeStamp=1638539026134&channelId=40011&index=1&signKey=a675982b76014e4a8b7f3beb5afe7441&sVersion=1&webdriver=false']

    def parse(self, response):
        # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div
        move_list = response.xpath("//*[@id='app']/div/div/div[1]/dl//div/div/div[1]")
        for d in move_list:
            maoyan_item = MaoyanItem()  # item object that holds the scraped fields
            # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[1]/a ----------name
            # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[3] ------------time
            # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[2] ------------peaple
            n_list = []
            p_list = []
            # film name
            aaa = d.xpath(".//p[1]/a").extract_first().split('"')  # split the raw <a> tag on double quotes
            for aa in aaa:
                n_list.append(aa)
            maoyan_item['move_name'] = n_list[3]  # the 4th piece is the title attribute, i.e. the film name
            # cast
            bbb = d.xpath(".//p[2]").extract_first().split('\n')
            for bb in bbb:
                p_list.append(bb)
            maoyan_item['peaple_name'] = p_list[1].replace('主演:', '').strip()  # strip the "主演:" ("starring:") prefix
            # release date
            move_time1 = d.xpath(".//p[3]").extract()
            for t in move_time1:
                ccc = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", t).group(0)
                maoyan_item['move_time'] = ccc
            print(maoyan_item)
            yield maoyan_item  # hand the item to the engine, which passes it to the pipelines

7. Manually create a main.py to run the spider from your IDE (you can also just use the command line):

from scrapy import cmdline
cmdline.execute('scrapy crawl maoyan_spider'.split())

8. Run main.py

9. Storage options: JSON, CSV, MySQL

1) Save to JSON (run from the project directory and mind the output path)

scrapy crawl maoyan_spider -o test.json

 

2) Save to CSV (run from the project directory and mind the output path)

scrapy crawl maoyan_spider -o test.csv
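Instead of passing -o each time, Scrapy 2.1+ can also configure the exports once through the FEEDS setting in settings.py; a brief sketch (the file paths are just examples):

# settings.py (optional alternative to the -o flag, Scrapy 2.1+)
FEEDS = {
    'output/test.json': {'format': 'json', 'encoding': 'utf8'},
    'output/test.csv': {'format': 'csv'},
}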

 

3) Save to a MySQL database

Create the database table and its columns yourself beforehand; the pipeline below simply inserts rows into it. A possible one-off helper is sketched right after this paragraph.
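If you have not created the table yet, one possible way is a small script with pymysql; the database and column names match the settings and item fields used in this tutorial, but the column types are assumptions you can adjust:

# create_table.py -- one-off helper (column types are assumptions)
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='1234', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS maoyan_sql DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS maoyan_sql.move (
        id INT PRIMARY KEY AUTO_INCREMENT,
        move_name VARCHAR(255),
        peaple_name VARCHAR(255),
        move_time VARCHAR(32)
    ) DEFAULT CHARACTER SET utf8
""")
conn.commit()
conn.close()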

#settings.py
# mongo_host='192.168.x.xxx'
# mongo_port=27017
# mongo_db_name='maoyan'
# mongo_db_collection='maoyan_movie'

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'maoyan_sql'
MYSQL_USER = 'root'
MYSQL_PASSWD = '1234'

 

#pipelines.py
from itemadapter import ItemAdapter
import pymysql
# pull the database connection values from settings.py
from maoyan import settings

def dbHandle():
    # helper used by the commented-out alternative pipeline further down
    conn = pymysql.connect(
        host = "localhost",
        user = "root",
        passwd = "1234",
        charset = "utf8mb4",
        use_unicode = False
    )
    return conn
class MaoyanPipeline:
    def __init__(self):
        # connect to MySQL using the values defined in settings.py
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        # all inserts/updates/deletes/queries go through this cursor
        self.cursor = self.connect.cursor()
    def process_item(self, item, spider):
        try:
            # insert one row per item
            self.cursor.execute(
                """insert into move(move_name,peaple_name,move_time) values (%s, %s, %s)""",
                (item['move_name'],
                 item['peaple_name'],
                 item['move_time']))
            # commit the statement
            self.connect.commit()
        except BaseException as e:
            # print an error log when the insert fails
            print("error:------------", e, "-----------------")
        return item
# MongoDB version, for reference
    # def __init__(self):
    #     host = mongo_host
    #     port = mongo_port
    #     dbname = mongo_db_name
    #     sheetname = mongo_db_collection
    #     client = pymongo.MongoClient(host=host, port=port)
    #     mydb = client[dbname]
    #     self.post = mydb[sheetname]  # collection used for reads/writes
    # def process_item(self, item, spider):
    #     data = dict(item)  # convert the item to a dict, then insert it
    #     self.post.insert(data)
    #     return item

# Alternative pipeline using the dbHandle() helper above, also for reference
# class HellospiderPipeline(object):
#     def process_item(self, item, spider):
#         dbObject = dbHandle()
#         cursor = dbObject.cursor()
#         cursor.execute("USE maoyan_sql")
#         # insert into the database
#         sql = "INSERT INTO move(move_name,peaple_name,move_time) VALUES(%s,%s,%s)"
#         try:
#             cursor.execute(sql,
#                            (item['move_name'], item['peaple_name'], item['move_time']))
#             cursor.connection.commit()
#         except BaseException as e:
#             print("error here >>>>>>>>>>>>>", e, "<<<<<<<<<<<<<")
#             dbObject.rollback()
#         return item
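One thing the pipeline above never does is release the MySQL connection when the crawl ends. Scrapy pipelines support a close_spider hook for this; a small sketch (not part of the original code) that could be added to MaoyanPipeline:

    # optional addition to MaoyanPipeline: release the MySQL resources when the spider closes
    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()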

 

 

 

Querying the table in the database shows the inserted rows:

 

To wrap up, the following snippets are for reference only:

#middlewares.py
# Define an IP proxy middleware
import base64

class my_proxy(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http-xxx.com:port'           # proxy address and port
        proxy_name_pass = b'username:password'                # proxy credentials
        encode_pass_name = base64.b64encode(proxy_name_pass)  # base64-encode the credentials
        request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode()

# After defining a middleware, you must enable it in the settings file:

#settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
    'maoyan.middlewares.my_proxy': 543,
}

#middlewares.py
# Define a random User-Agent middleware
import random

class my_useragent(object):
    def process_request(self, request, spider):
        USER_AGENT_LIST = [...]  # fill in a list of User-Agent strings (easy to find online)
        agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = agent

#settings.py  (the two priorities must not be identical)
DOWNLOADER_MIDDLEWARES = {
    'maoyan.middlewares.my_proxy': 543,
    'maoyan.middlewares.my_useragent': 544,
}

