本源碼僅供測試,發包有風險,優化還是踏實的好!本代碼是本人學習 Python 時的練手作品!
附上代碼:
# -*- coding: utf-8 -*-
# Script overview: for each (target URL, keyword) pair in ``targetURL`` the
# script redials the ADSL connection, opens Baidu in PhantomJS, searches the
# keyword, pages through the results looking for the target URL, opens it,
# and records the outcome in the ``ip_log`` MySQL table.
import os
import random
import socket
import time
import traceback
import urllib.request

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

#import win32api
#pip install pypiwin32
#DesiredCapabilities.INTERNETEXPLORER['ignoreProtectedModeSettings'] = True

# Timeout (seconds) for the plain HTTP requests made outside the browser, so
# a dead endpoint cannot hang the whole run.
_HTTP_TIMEOUT = 15


def connect():
    """Dial the ADSL connection described by ``g_adsl_account`` via rasdial."""
    cmd_str = "rasdial %s %s %s" % (
        g_adsl_account['name'],
        g_adsl_account['username'],
        g_adsl_account['password'],
    )
    os.system(cmd_str)
    # Give the link a moment to come up before it is used.
    time.sleep(5)


def disconnect():
    """Hang up the ADSL connection named in ``g_adsl_account`` via rasdial."""
    cmd_str = "rasdial %s /disconnect" % g_adsl_account['name']
    os.system(cmd_str)
    time.sleep(5)


def get_ip():
    """Fetch the current public IP and its location.

    Returns:
        A two-element list ``[ip, address]`` sliced out of the response body.

    Raises:
        urllib.error.URLError / OSError: if the lookup request fails or
        times out.
    """
    # ``with`` guarantees the response is closed even if read/decode fails.
    with urllib.request.urlopen("http://ip.chinaz.com/getip.aspx",
                                timeout=_HTTP_TIMEOUT) as fp:
        # Python 3 returns bytes; decode to text before searching it.
        mystr = fp.read().decode("utf8")
    ip_pos = mystr.find("ip")
    add_pos = mystr.find("address")
    # NOTE(review): the fixed offsets assume a body shaped like
    # {ip:'...',address:'...'} -- confirm against the live service.
    ip = mystr[ip_pos + 4:add_pos - 2]
    address = mystr[add_pos + 9:-2]
    return [ip, address]


def insert_db(ipdate):
    """Insert one run record into the ``ip_log`` table of database ``zongzong``.

    Args:
        ipdate: sequence ``[ip, address, keyword, url, error, page, rank]``.
            The creation timestamp is appended to a copy, so the caller's
            list is left untouched.
    """
    ISOTIMEFORMAT = '%Y-%m-%d %X'
    row = list(ipdate)
    row.append(time.strftime(ISOTIMEFORMAT, time.localtime()))

    conn = pymysql.connect(host='localhost', user='root', passwd='',
                           port=3306, charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute("USE zongzong")
            # `rank` is a reserved word in MySQL >= 8.0.2, so it is quoted.
            cur.execute(
                "INSERT INTO ip_log(ip,address,keyword,url,error,page,`rank`,created_at) "
                "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",
                row,
            )
        finally:
            cur.close()
        conn.commit()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()


def get_search_url(driver):
    """Collect the clickable landing pages on the current Baidu result page.

    Each result link (``<a class="c-showurl">``) is resolved to its real
    address through the ``Location`` header of a HEAD request; addresses that
    contain any entry of ``out_urls`` are dropped.

    Returns:
        ``[real_url, click_link]`` -- two parallel lists holding the resolved
        URL and the matching link element for every kept result.
    """
    real_url = []
    click_link = []
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    for link in content.find_elements_by_tag_name("a"):
        if link.get_attribute('class') != "c-showurl":
            continue
        url = link.get_attribute('href')
        header = requests.head(url, timeout=_HTTP_TIMEOUT).headers
        # A link that does not redirect has no Location header; fall back to
        # the href itself instead of raising KeyError.
        location = header.get('location', url)
        if any(out_url in location for out_url in out_urls):
            continue
        real_url.append(location)
        # Keep the element at the same position so callers can click by index.
        click_link.append(link)
    return [real_url, click_link]


def get_urlIndex(tagurl, urls):
    """Return the index of the first entry of ``urls`` containing ``tagurl``.

    Returns ``-1`` when no entry matches.
    """
    for i, url in enumerate(urls):
        if tagurl in url:
            return i
    return -1


def click_nextBtn(driver):
    """Click the "next page" link in the Baidu pager and return ``driver``."""
    div = driver.find_element_by_css_selector("div[id=\"page\"]")
    for item in div.find_elements_by_tag_name("a"):
        print(item.text)
        # NOTE(review): this literal is Traditional Chinese; confirm it
        # matches the link text the page actually renders.
        if item.text == "下一頁>":
            item.click()
            # The page is replaced after the click, so the remaining
            # elements are stale -- stop iterating.
            break
    return driver


def click_search_url(driver, items):
    """Open the result links whose position is listed in ``items``.

    Positions count only ``<a class="c-showurl">`` links. After each click
    the function waits 5-10 seconds, closes every window other than the
    search window, and switches back to the search window.
    """
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    links = content.find_elements_by_tag_name("a")
    # Remember the search window so it can be restored after each visit.
    nowhandle = driver.current_window_handle
    i = 0
    for link in links:
        if link.get_attribute('class') != "c-showurl":
            continue
        if i in items:
            print("隨機點擊item:", i)
            print(link.get_attribute('href'), link.text)
            link.click()
            # Stay on the opened page for a while.
            time.sleep(random.randint(5, 10))
            for handle in driver.window_handles:
                if handle != nowhandle:
                    print("切換到當前窗口")
                    driver.switch_to_window(handle)
                    print("title:", driver.title)
                    driver.close()
            print("切換到原來的窗口")
            driver.switch_to_window(nowhandle)
            print("title:", driver.title)
            print("本次隨機點擊完畢!")
        i = i + 1


def get_random_index(index, len):
    """Pick the result positions to click before visiting the target.

    Args:
        index: position of the target in the result list, or ``-1`` when it
            is not on the page.
        len: number of results on the page (the name shadows the builtin but
            is kept so existing callers keep working).

    Returns:
        A list of positions; empty when there is nothing valid to click.
    """
    if index >= 8:
        return [random.randint(0, 4), random.randint(5, 8)]
    if index >= 4:
        return [random.randint(0, 3), random.randint(3, index)]
    if index >= 0:
        return [index]
    if index == -1:
        # randint is inclusive on both ends, so cap at len - 1 to stay
        # inside the result list.
        if len <= 0:
            return []
        if len <= 5:
            return [random.randint(0, len - 1)]
        return [random.randint(5, len - 1)]
    # Any other negative index has no meaningful selection.
    return []


def getUA():
    """Return a randomly chosen User-Agent string."""
    uaList = [
        #360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        #chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        #"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        #firefox
        #"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0",
        #ie11
        #"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        #ie8
        #"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; 4399Box.1253; 4399Box.1357)",
        #2345
        #"Chrome/39.0.2171.99 Safari/537.36 2345Explorer/6.5.0.11018",
        #sogou
        #"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        #opera
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"
    ]
    return random.choice(uaList)


def getWindowSize():
    """Return a randomly chosen browser window size as ``[width, height]``."""
    wind_size = [
        [1920, 1080],
        [1600, 900],
        [1280, 720]
    ]
    return random.choice(wind_size)


def setDisplay():
    """Switch the screen to a randomly chosen resolution via ``win32api``.

    NOTE(review): ``import win32api`` is commented out at the top of this
    file, so calling this function as-is raises ``NameError``; the only call
    site in this file is commented out as well.
    """
    display_size = [
        [1920, 1080],
        [1680, 1050],
        [1600, 900],
        [1440, 900],
        [1400, 1050]
    ]
    d_size = random.choice(display_size)
    dm = win32api.EnumDisplaySettings(None, 0)
    dm.PelsWidth = d_size[0]
    dm.PelsHeight = d_size[1]
    dm.BitsPerPel = 32
    dm.DisplayFixedOutput = 0
    win32api.ChangeDisplaySettings(dm, 0)


def _record_result(keyword, target, status, page, rank):
    """Store one run outcome together with the current public IP."""
    data = get_ip()
    data.extend([keyword, target, status, page, rank])
    insert_db(data)


# Dial-up account handed to rasdial by connect() / disconnect().
g_adsl_account = {
    "name": "寬帶連接",
    "username": "19ab68",
    "password": "643534"
}

# Addresses that must never be clicked (competitors).
out_urls = [
    'zhimo.yuanzhumuban.cc',
    'bbs.yuanzhumuban.cc',
    'http://money.163.com/15/0416/11/ANANRECC00253B0H.html'
]

# Work list: [target URL fragment, search keyword, optional decoy keywords...]
targetURL = [
    ['http://www.hkuws.com','注冊離岸公司'],
    ['zs.efu.com.cn/mornfeeit/','夢菲雪'],
    ['zs.efu.com.cn/chengshijiaren/','城市佳人'],
    ['www.kidsnet.cn/exposition','童裝展會'],
    #['top.kidsnet.cn/','童裝加盟排行榜'],
    #['www.nynet.com.cn/','內衣網'],
    #['www.nzw.cn/','女裝網'],
    ['zs.efu.com.cn/ks/','卡索'],
    ['zs.efu.com.cn/distin-kidny/','迪斯廷凱'],
    ['zs.efu.com.cn/fuzhuang/luyidigao/','路易迪高童裝代{過}{濾}理'],
    ['brand.efu.com.cn/brandshow-1221090.html','凱帝龍馳'],
    ['zs.efu.com.cn/rabbitjero/','兔子傑羅'],
    ['zs.efu.com.cn/wmprince/','西瓜王子'],
    ['zs.efu.com.cn/betu','百圖'],
    ['zs.efu.com.cn/pepco/','小豬班納'],
    #['http://news.ifeng.com/a/20160518/48795120_0.shtml','華夏信財'],
    ['http://weibo.com/huaxiafinance','華夏信財'],
    ['http://p2p.hexun.com/2016-04-26/183531215.html','華夏信財'],
    #['http://news.xinhuanet.com/fortune/2016-04/26/c_128932834.htm','華夏信財'],
    ['http://www.xcf.cn/gdyw/201605/t20160526_772682.htm','華夏信財'],
    ['http://www.huaxiaoxia.com/','華夏信財'],
    #['https://lc.huaxiafinance.com/','華夏信財'],
    ['so.tedu.cn','網絡營銷培訓機構'],
    ['www.cosatto.net.cn','個性安全座椅'],
    ['www.kaihuata.com/','開化旅游'],
    #['www.kaihuata.com/','開化'],
]

for targetInfo in targetURL:
    # Bind these before the try block so the failure handler can always
    # report them, whatever step raised.
    target = targetInfo[0]
    keyword = targetInfo[1]
    driver = None
    try:
        # Redial to obtain a new IP, and keep redialing until DNS works.
        disconnect()
        connect()
        while True:
            try:
                socket.gethostbyname("baidu.com")
                break
            except OSError:
                disconnect()
                connect()

        # Change the screen resolution (disabled).
        #setDisplay()

        # Start PhantomJS with a random User-Agent and image loading off.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        user_agent = getUA()
        print(user_agent)
        dcap["phantomjs.page.settings.userAgent"] = user_agent
        #dcap["phantomjs.page.settings.resourceTimeout"] = (15000)
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=['--load-images=no'])
        driver.implicitly_wait(30)
        # Start from a clean cookie jar.
        driver.delete_all_cookies()

        # Open Baidu and apply a random window size.
        driver.get("http://www.baidu.com/")
        window_size = getWindowSize()
        driver.set_window_size(window_size[0], window_size[1])
        print('打開百度成功', driver.title)

        if len(targetInfo) > 2:
            error_keyword = targetInfo[random.randint(2, len(targetInfo) - 1)]
        print(">>>>>>>>>>>>>>>點擊的關鍵詞:", keyword, "--->目標地址:", target, ">>>>>>>>>>>>>>>>>>>>")
        if len(targetInfo) > 2:
            # Search a decoy keyword first, then clear the box.
            print("點擊錯誤關鍵詞:", error_keyword)
            driver.find_element_by_id("kw").send_keys(error_keyword)
            time.sleep(2)
            driver.find_element_by_id("su").click()
            time.sleep(5)
            driver.find_element_by_id("kw").clear()
            time.sleep(2)
            print("錯誤關鍵詞點擊完畢")

        # Search the real keyword.
        driver.find_element_by_id("kw").send_keys(keyword)
        print("...開始點擊搜索按鈕..")
        driver.find_element_by_id("su").click()
        print("...點擊完畢..")
        time.sleep(2)

        # urls_res[0]: resolved landing pages, urls_res[1]: link elements.
        urls_res = get_search_url(driver)
        real_urls = urls_res[0]
        print("搜索出來的可點擊着陸頁個數:", len(real_urls))
        print(real_urls)
        index = get_urlIndex(target, real_urls)
        print("目標index:", index)

        page = 1
        while index == -1 and page <= 4:
            if page == 1:
                # Click a few first-page results before paging on.
                items = get_random_index(index, len(real_urls))
                print(items)
                click_search_url(driver, items)
            # Move to the next result page and look again.
            driver = click_nextBtn(driver)
            time.sleep(3)
            urls_res = get_search_url(driver)
            real_urls = urls_res[0]
            print(real_urls)
            index = get_urlIndex(target, real_urls)
            page = page + 1

        if index > 4 and page == 1:
            # Target is on the first page: click one or two other results.
            pick = random.randint(1, 2)
            if pick == 2:
                items = get_random_index(index, len(real_urls))
            else:
                items = [1]
            print(items)
            click_search_url(driver, items)

        # Test the search outcome rather than the page counter, so a target
        # found on the last page visited is not thrown away.
        if index == -1:
            print("沒有找到目標地址,放棄搜索...")
            print("關閉瀏覽器")
            driver.quit()
            driver = None
            time.sleep(5)
            _record_result(keyword, target, "no_find", -1, -1)
            continue

        print("目標在page", page, "當前排名:", index, real_urls[index])
        print("反問最后的目標頁...")
        urls_res[1][index].click()
        time.sleep(5)

        nowhandle = driver.current_window_handle
        # Visit every other open window, stay a while, then close it.
        for handle in driver.window_handles:
            if handle != nowhandle:
                print("切換到當前窗口")
                driver.switch_to_window(handle)
                stime = random.randint(15, 25)
                print("目標頁title:", driver.title, "停留-->", stime)
                time.sleep(stime)
                driver.close()
        print("切換到原來的窗口")
        driver.switch_to_window(nowhandle)
        print("title:", driver.title)

        # Show the cookies, clear them, and show them again.
        print("打印cookie")
        cookie = driver.get_cookies()
        print(cookie)
        print("清除cookie")
        driver.delete_all_cookies()
        print("打印cookie:")
        cookie = driver.get_cookies()
        print(cookie)

        print("關閉瀏覽器")
        time.sleep(5)
        driver.quit()
        driver = None

        # Record the successful run.
        _record_result(keyword, target, "success", page, index)
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still stops the
        # script.
        traceback.print_exc()
        if driver is not None:
            # Do not leave an orphaned PhantomJS process behind.
            try:
                driver.quit()
            except Exception:
                traceback.print_exc()
        try:
            _record_result(keyword, target, "faild", -1, -1)
        except Exception:
            # Failing to log one run must not abort the remaining targets.
            traceback.print_exc()