# 百度相關搜索關鍵詞抓取,讀取txt關鍵詞,導出txt關鍵詞
# -*- coding: utf-8 -*-
"""Scrape Baidu "related searches" keywords.

Reads seed keywords from ``gjc.txt`` (one per line), requests the Baidu
result page for each keyword, extracts the related-search keywords from the
returned HTML and appends them to ``gjcsj.txt``.
"""
import re
import time
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlencode

import requests

# Input/output files, resolved against the current working directory.
# Plain relative names replace the original ".\\name" spelling, which relied
# on an invalid "\g" escape sequence and only worked on Windows.
INPUT_PATH = "gjc.txt"
OUTPUT_PATH = "gjcsj.txt"

SEARCH_URL = "https://www.baidu.com/s?"

# Seconds to wait for Baidu before giving up on a request.
REQUEST_TIMEOUT = 10

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

SEPARATOR = "-----------------------------------"

# NOTE(review): the marker text in this pattern is Traditional Chinese;
# confirm it matches what Baidu actually serves, otherwise nothing is found.
_RELATED_TABLE_RE = re.compile(
    r'<div id="rs"><div class="tt">相關搜索</div><table cellpadding="0">(.+?)</table></div>',
    re.S,
)
_RELATED_LINK_RE = re.compile(r'<th><a href="(.+?)">(.+?)</a></th>', re.S)


def _extract_related_keywords(html):
    """Return the link texts found in the related-search table(s) of *html*."""
    keywords = []
    for table in _RELATED_TABLE_RE.findall(html):
        # Match against the table markup itself rather than the repr() of a
        # list, so no escape sequences leak into the extracted keywords.
        for _href, text in _RELATED_LINK_RE.findall(table):
            keywords.append(text)
    return keywords


def _read_keywords(path):
    """Return the non-blank lines of *path* with surrounding whitespace removed."""
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [keyword for keyword in stripped if keyword]


def _build_search_url(keyword):
    """Return the Baidu search URL for *keyword*, percent-encoding the query."""
    return SEARCH_URL + urlencode({"wd": keyword})


def xgss(url: str) -> str:
    """Fetch one Baidu result page and save its related-search keywords.

    Each keyword found is printed and the whole batch is appended to
    ``OUTPUT_PATH`` (one keyword per line), followed by a separator line on
    stdout.

    Args:
        url: Full Baidu search URL to request.

    Returns:
        The extracted keywords as a single string, each terminated by a
        newline; an empty string when nothing was found.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    html = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT).text
    keywords = _extract_related_keywords(html)
    for keyword in keywords:
        print(keyword)
    gjc = "".join(keyword + "\n" for keyword in keywords)
    # Append so that results from every seed keyword end up in the same file.
    with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
        f.write(gjc)
    print(SEPARATOR)
    return gjc


def main() -> None:
    """Load the seed keywords, scrape them with a thread pool and report."""
    print("程序運行,正在導入關鍵詞列表!!!")
    print(SEPARATOR)

    # Load the list of keywords to search for.
    keywords = _read_keywords(INPUT_PATH)
    print("導入關鍵詞列表成功!")
    print(SEPARATOR)

    # Turn every keyword into a search URL.
    urls = [_build_search_url(keyword) for keyword in keywords]
    print("采集百度相關搜索關鍵詞開啟!")
    print("...................")

    # Fetch the pages concurrently.
    try:
        # No size is given, so the pool uses its default worker count.
        pool = ThreadPool()
        try:
            pool.map(xgss, urls)
        finally:
            # Release the workers even when a request failed.
            pool.close()
            pool.join()
        print("采集百度相關搜索關鍵詞完成,已保存於gjcsj.txt!")
    except Exception as exc:
        # Report the real cause instead of hiding it behind a fixed message.
        print("Error: unable to start thread:", repr(exc))

    print("8s后程序自動關閉!!!")
    time.sleep(8)


if __name__ == "__main__":
    main()