1. start_urls -- internal handling of the start URLs (the list is turned into a generator of requests)
    import scrapy
    from scrapy.http import Request

    class QSpider(scrapy.Spider):
        name = 'q'
        allowed_domains = ['chouti.com']
        start_urls = ['http://chouti.com/']

        # Option 1: yield each request (generator)
        def start_requests(self):
            for url in self.start_urls:
                yield Request(url=url)

        # Option 2: return a list of requests (Scrapy converts it to an iterator)
        # def start_requests(self):
        #     req_list = []
        #     for url in self.start_urls:
        #         req_list.append(Request(url=url))
        #     return req_list
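For comparison, a minimal sketch of what scrapy.Spider.start_requests does by default (simplified; details vary across Scrapy versions). The notable detail is dont_filter=True, so start URLs bypass the duplicate filter:

    # simplified sketch of the built-in scrapy.Spider.start_requests
    def start_requests(self):
        for url in self.start_urls:
            # start URLs are never deduplicated
            yield Request(url, dont_filter=True)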
2. Depth and priority
Depth:
    - the initial requests start at depth 0
    - each time a new request is yielded from a response, its depth = the parent request's depth + 1
    - settings: DEPTH_LIMIT caps how deep the crawl may go
Priority:
    - the request's download priority is adjusted: priority -= depth * DEPTH_PRIORITY
    - settings: DEPTH_PRIORITY
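Both the depth bookkeeping and the priority adjustment are done by Scrapy's DepthMiddleware (a spider middleware). A minimal sketch adapted from scrapy/spidermiddlewares/depth.py (simplified; the real class also records stats and logs over-limit requests):

    from scrapy.http import Request

    class SimplifiedDepthMiddleware:
        def __init__(self, maxdepth, prio):
            self.maxdepth = maxdepth   # DEPTH_LIMIT
            self.prio = prio           # DEPTH_PRIORITY

        def process_spider_output(self, response, result, spider):
            def _filter(request):
                if isinstance(request, Request):
                    # child depth = parent depth + 1
                    depth = response.meta.get('depth', 0) + 1
                    request.meta['depth'] = depth
                    if self.prio:
                        # with a positive DEPTH_PRIORITY, deeper requests
                        # get a lower priority and are downloaded later
                        request.priority -= depth * self.prio
                    if self.maxdepth and depth > self.maxdepth:
                        return False   # drop requests beyond DEPTH_LIMIT
                return True
            return (r for r in result or () if _filter(r))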
3. Downloader middleware
Setting a proxy in Scrapy (environment variables are not shared between two separate programs, since they are two different processes):

- Built-in: set the proxy in os.environ before the spider starts.

    # Option 1 (set environment variables):
    import os
    os.environ["HTTPS_PROXY"] = "https://root:xx@1.1.1.1:80"
    os.environ["HTTP_PROXY"] = "1.1.1.2"

    # Option 2 (set a per-request parameter):
    yield Request(url=url, callback=self.parse, meta={'proxy': 'https://root:xx@1.1.1.1:80'})

- Custom middleware:

  1. settings.py:

    DOWNLOADER_MIDDLEWARES = {
        # 'xdb.middlewares.XdbDownloaderMiddleware': 543,
        'xdb.proxy.XdbProxyMiddleware': 751,
    }

  2. proxy.py:

    import base64
    import random

    from scrapy.utils.python import to_bytes

    class XdbProxyMiddleware(object):
        def process_request(self, request, spider):
            PROXIES = [
                {'ip_port': '111.11.228.75:80', 'user_pass': ''},
                {'ip_port': '120.198.243.22:80', 'user_pass': ''},
                {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
                {'ip_port': '101.71.27.120:80', 'user_pass': ''},
                {'ip_port': '122.96.59.104:80', 'user_pass': ''},
                {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
            ]
            proxy = random.choice(PROXIES)
            request.meta['proxy'] = "http://%s" % proxy['ip_port']
            # send credentials only when user_pass is non-empty
            if proxy['user_pass']:
                encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
                request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
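Why option 1 requires setting os.environ before the crawler starts: Scrapy's built-in HttpProxyMiddleware snapshots the environment's proxy settings once, at initialization, via urllib's getproxies(); later changes to os.environ are not seen. A minimal sketch of that behavior (simplified; the real middleware also extracts credentials from the proxy URL into a Proxy-Authorization header):

    from urllib.parse import urlparse
    from urllib.request import getproxies

    class SimplifiedHttpProxyMiddleware:
        def __init__(self):
            # getproxies() reads HTTP_PROXY / HTTPS_PROXY and friends
            # from the environment once, at startup
            self.proxies = dict(getproxies())

        def process_request(self, request, spider):
            # a per-request meta['proxy'] (option 2 above) always wins
            scheme = urlparse(request.url).scheme
            if 'proxy' not in request.meta and scheme in self.proxies:
                request.meta['proxy'] = self.proxies[scheme]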