python爬取百度谷歌搜索結果



使用 requests 模塊爬取百度或谷歌的搜索結果。如下代碼示例是百度的;若要改為谷歌,研究一下谷歌搜索 URL 的格式並替換對應模板即可。
把要搜索的字段寫入一個文件中,每行寫一個;運行時的第一個參數為該文件路徑,程序會按代碼中的保存格式將結果保存在當前目錄的文件中。
代碼如下

# coding=utf-8

import os
import random
import sys
import time
import json
import logging
import datetime
import requests


# Configure the root logger: DEBUG level, timestamped "time - level - message" lines.
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s - %(levelname)s - %(message)s')

# Pool of User-Agent strings; GoogleSpider.__init__ picks one at random so the
# requests look less like an automated client.
# NOTE(review): the backslash line-continuations occur INSIDE string literals,
# so any leading whitespace on the continuation line becomes part of the UA
# string. The paste has lost the original indentation — verify the exact UA
# bytes against the original source if precise UA values matter.
USER_AGENT = ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ \
(KHTML, like Gecko) Element Browser 5.0',
'IBM WebExplorer /v0.94', 'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) \
Version/6.0 Mobile/10A5355d Safari/8536.25',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/28.0.1468.0 Safari/537.36',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)']


class GoogleSpider:
    """Fetch a search-result page for each query in a file and save them locally.

    Despite the class name, ``temp_url`` targets Baidu; pointing it at Google
    only requires changing the URL template (see the surrounding article).
    """

    def __init__(self, query_file):
        """
        :param query_file: path to a UTF-8 text file with one query per line
        """
        self.query_file = query_file
        # {} placeholder is filled with the raw query string.
        self.temp_url = "https://www.baidu.com/s?wd={}"
        self.query_list = []
        self.url_list = []
        # One random User-Agent per spider instance, to look less bot-like.
        user_agent = random.choice(USER_AGENT)
        self.headers = {
            "User-Agent": user_agent}
        self.save_list = []

    def get_query(self):
        """Read the search queries (one per line) into self.query_list."""
        if not os.path.exists(self.query_file):
            logging.error("請檢查文件名路徑")
            # Bug fix: the original fell through and crashed with
            # FileNotFoundError right after logging; bail out instead.
            return

        with open(self.query_file, "r", encoding="utf-8") as file:
            for word in file:
                word = word.strip()
                # Robustness: skip blank lines instead of searching for "".
                if word:
                    self.query_list.append(word)

    def get_url_list(self):
        """Build the search URL for every query."""
        # NOTE(review): queries are interpolated un-encoded; a query that
        # contains '&' or '#' would corrupt the URL — consider
        # urllib.parse.quote if such input can occur.
        self.url_list = [self.temp_url.format(query) for query in self.query_list]

    def parse_url(self):
        """Fetch each URL, pausing 1s between requests to avoid bot detection."""
        # zip() pairs each query directly with its URL. The original used
        # self.url_list.index(url), which is O(n) per lookup and returns the
        # wrong position when the same query (hence URL) appears twice.
        for word, url in zip(self.query_list, self.url_list):
            response = requests.get(url, headers=self.headers)

            if response.status_code != 200:
                logging.error("{}搜索請求失敗".format(word))
                # Bug fix: the original logged the failure but still decoded
                # and saved the bad response; skip it instead.
                continue

            save_format = {
                "query": word,
                "html": response.content.decode(),
                # Key name "datatime" (sic) kept for output compatibility.
                "datatime": datetime.datetime.now().strftime('%Y%m%d'),
            }
            self.save_list.append(save_format)
            time.sleep(1)

    def write_to_file(self):
        """Save the collected results, one JSON object per line."""
        with open("success_query.txt", "w", encoding="utf-8") as file:
            for content in self.save_list:
                # json.dumps instead of str(dict): the json module was imported
                # but never used, and JSON lines are reliably re-parseable
                # (str(dict) uses Python repr quoting, which json.loads rejects).
                file.write(json.dumps(content, ensure_ascii=False))
                file.write("\n")

        logging.info("請在當前目錄下查看success_query.txt")

    def run(self):
        """Full pipeline: read queries -> build URLs -> fetch -> save."""
        # Read the query words from the input file.
        self.get_query()

        # Build one search URL per query.
        self.get_url_list()

        # Fetch every URL and collect the HTML.
        self.parse_url()

        # Persist the results to success_query.txt.
        self.write_to_file()


if __name__ == '__main__':
    # Entry point: argv[1] is the path to the query file.
    try:
        GoogleSpider(sys.argv[1]).run()
    except IndexError:
        # No path argument was supplied on the command line.
        logging.error("未找到查找目錄")
    except Exception as e:
        # Top-level boundary: log anything unexpected instead of crashing.
        logging.error(e)


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM