Python多線程采集百度相關搜索關鍵詞


    百度相關搜索關鍵詞抓取,讀取txt關鍵詞,導出txt關鍵詞

  

#百度相關搜索關鍵詞抓取,讀取txt關鍵詞,導出txt關鍵詞
 
# -*- coding=utf-8 -*-
import re
import time
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import quote

import requests
 
 
# Baidu related-search keyword lookup
def _extract_related_keywords(html):
    """Return the related-search keyword texts found in *html*, in page order."""
    # NOTE(review): the heading literal below is written in Traditional
    # Chinese; confirm it matches the text Baidu actually serves, otherwise
    # no block will ever match.
    block_pattern = r'<div id="rs"><div class="tt">相關搜索</div><table cellpadding="0">(.+?)</table></div>'
    link_pattern = r'<th><a href="(.+?)">(.+?)</a></th>'
    keywords = []
    # Search each captured block directly rather than the repr() of the list
    # of blocks, whose backslash-escaping can corrupt the matched text.
    for block in re.findall(block_pattern, html, re.S):
        for _href, text in re.findall(link_pattern, block, re.S):
            keywords.append(text)
    return keywords


def xgss(url):
    """Fetch a Baidu results page and collect its related-search keywords.

    Each keyword is printed, the whole batch is appended to ``gjcsj.txt`` in
    the current working directory, and the batch is returned.

    Args:
        url: Full search URL to request.

    Returns:
        The keywords as one string, each keyword followed by a newline;
        an empty string when nothing matched.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }
    # Without a timeout a stalled connection would block this worker forever.
    html = requests.get(url, headers=headers, timeout=10).text
    keywords = _extract_related_keywords(html)
    for keyword in keywords:
        print(keyword)
    gjc = ''.join(keyword + '\n' for keyword in keywords)

    # Export the keywords to a text file. Append mode lets the output of
    # every call accumulate in the same file. The path is spelled without
    # the original ".\" prefix, which contained an invalid "\g" escape.
    with open("gjcsj.txt", 'a', encoding='utf-8') as f:
        f.write(gjc)
    print("-----------------------------------")
    return gjc
 
 
print("程序運行,正在導入關鍵詞列表!!!")
print("-----------------------------------")
# Load the keywords to search from the text file, one keyword per line.
urls = []
data = []
# The context manager closes the file handle. "utf-8-sig" also accepts plain
# UTF-8 and drops a leading BOM so it cannot stick to the first keyword.
with open('gjc.txt', "r", encoding='utf-8-sig') as keyword_file:
    for line in keyword_file:
        keyword = line.strip()
        # Skip blank lines; stripping removes the trailing newline that would
        # otherwise be sent as part of the query.
        if keyword:
            data.append(keyword)
print("導入關鍵詞列表成功!")
print("-----------------------------------")

# Convert each keyword into a search URL. The keyword is percent-encoded so
# characters such as "&", "#" or spaces cannot break the query string.
for keyword in data:
    url = 'https://www.baidu.com/s?wd=' + quote(keyword, safe='')
    urls.append(url)

print("采集百度相關搜索關鍵詞開啟!")
print("...................")
# Fetch the related keywords with a thread pool.
try:
    # No argument is passed, so the pool size defaults to the CPU count.
    # The with-block releases the pool even when a worker raises.
    with ThreadPool() as pool:
        results = pool.map(xgss, urls)
    print("采集百度相關搜索關鍵詞完成,已保存於gjcsj.txt!")
except Exception as exc:
    # Report the real cause; a bare except would also swallow Ctrl-C.
    print("Error: keyword collection failed:", repr(exc))

print("8s后程序自動關閉!!!")
time.sleep(8)

  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM