# -*- coding: utf-8 -*-
# Scrapy settings for GitHub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project. It is used to build the default User-Agent
# and for logging, and is filled in automatically when the project is
# created with the `startproject` command.
BOT_NAME = "GitHub"

# List of modules where Scrapy looks for spiders (default: [xxx.spiders]).
SPIDER_MODULES = [
    "GitHub.spiders",
]

# Module in which the `genspider` command creates new spiders
# (default: 'xxx.spiders').
NEWSPIDER_MODULE = "GitHub.spiders"
# Logging configuration (minimum level to log).
# Scrapy uses the standard Python logging level names, from most to least
# severe: CRITICAL, ERROR, WARNING, INFO, DEBUG.
# (Names such as OFF, FATAL, WARN or ALL come from other logging frameworks.)
LOG_LEVEL = "INFO"
# If you do not want log output printed on the console, store it in a file.
LOG_FILE = "GitHub.log"
# User-Agent to use when crawling (a commonly customized setting).
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): this value imitates a desktop Chrome browser rather than
# identifying the crawler -- confirm that this is intended.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/72.0.3626.119 Safari/537.36"
)

# Whether to obey robots.txt rules (a commonly changed setting; this
# project turns it off).
ROBOTSTXT_OBEY = False
# Scrapy downloader設置最大並發數(默認是16個,可以自己設置更多。但是要注意電腦的性能)
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
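# Download delay: the time (in seconds) the downloader waits between consecutive requests to the same website.
# It applies to each request, not once per batch of concurrent requests. Use it to throttle the crawl and reduce load on the server; fractional values such as 0.25 are supported.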
#DOWNLOAD_DELAY = 3
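# The two settings below cap concurrency per target (per domain or per IP). Only one of them is in effect at a time, and the download delay follows whichever one applies.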
# The download delay setting will honor only one of:
# 對單個網站進行並發請求的最大值。
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# 對單個IP進行並發請求的最大值。如果非0,則忽略 CONCURRENT_REQUESTS_PER_DOMAIN 設定,使用該設定。 也就是說,並發限制將針對IP,而不是網站。該設定也影響 DOWNLOAD_DELAY: 如果 CONCURRENT_REQUESTS_PER_IP 非0,下載延遲應用在IP而不是網站上
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default); uncomment the line below to turn cookie handling off.
#COOKIES_ENABLED = False
# 禁用telnet控制台(默認啟用)
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# 覆蓋默認的請求頭
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# 爬蟲中間件
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'GitHub.middlewares.MyCustomSpiderMiddleware': 543,
#}
# 下載中間件
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'GitHub.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# 啟用或者禁用擴展
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# 管道
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'GitHub.pipelines.SomePipeline': 300,
#}
# 啟用和配置AutoThrottle擴展(默認情況下禁用)
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# 初始下載延遲
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# 在高延遲情況下設置的最大下載延遲
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server (the concurrency level AutoThrottle aims for).
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# 啟用顯示收到的每個響應的限制狀態:
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# 啟用和配置HTTP緩存(默認情況下禁用)
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'