!!!!本節就是解決天眼查爬蟲問題:!!!!(看完有建議和想法的話記得交流一下)
主要思路---繞過驗證,使用代理ip,間接通過手機端 m.baidu.com 以字符串拼接的方式搜索並獲取天眼查網頁數據。
重點:
1.這里我采用的是python3+selenium+chromedriver寫的代碼,主要問題就是爬蟲程序執行起來速度慢,效率較低(部分有誤未修改)。
2.這個方式也是通過同事的建議采取的,既然不能從正面直接登錄采集就間接的通過手機端接口獲取網站頁面信息。
3.存在的缺點就是:針對少量數據的話可以使用此類方式,數據集量太大不合適。
此類方式我總結一下:猶如大海撈針,實用性不強,只是供大家借鑒(親測)。
代理ip的問題,需要解決ip代理被限制,切換ip(我用的是付費代理ip軟件)。
直接上代碼:
# coding:utf-8
#
# Tianyancha scraper: search a company name through the mobile Baidu front
# page (m.baidu.com), click through to the tianyancha result page, scrape
# the company-profile fields, and insert one row into MySQL.
#
# NOTE(review): depends on third-party packages (selenium, pymysql, bs4,
# requests) and a chromedriver binary on PATH.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from pymysql import connect
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import re
import time
import requests

# Wall-clock start; the __main__ block reports total runtime against this.
since = time.time()

# Shared prefix of the company-detail table used by most field XPaths below.
_TABLE = "//table[@class='table -striped-col -border-top-none']"

# Phone / email patterns, compiled once instead of on every lookup.
_PHONE_RE = re.compile(r"\d{4}-\d{7,8}|\d{3}-\d{8}|\d{11}|\d{8}")
_EMAIL_RE = re.compile(
    r"[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*"
    r"@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?")


# 前台開啟瀏覽器模式
def openChrome():
    """Start a foreground Chrome driver with image loading disabled.

    Returns:
        The selenium WebDriver instance.
    """
    option = webdriver.ChromeOptions()

    # Headless mode is available but intentionally left off:
    # option.add_argument('--headless')
    # option.add_argument('--disable-gpu')

    # 關閉圖片加載 — skip images to speed up page loads.
    prefs = {"profile.managed_default_content_settings.images": 2}
    option.add_experimental_option("prefs", prefs)

    # Optional paid rotating proxy, e.g.:
    # option.add_argument("--proxy-server=http://60.13.50.74:4369")

    # chrome_options= kept deliberately: the rest of the file uses the
    # selenium 3 find_element_by_* API, matching the same driver version.
    driver = webdriver.Chrome(chrome_options=option)
    return driver


def _find_text(driver, xpath, default="-"):
    """Return the .text of the first element matching *xpath*.

    Every profile field on the page is optional, so a missing element
    must never abort the run — *default* is returned instead.
    """
    try:
        return driver.find_element_by_xpath(xpath).text
    except Exception:
        return default


def _decode_date(driver, xpath):
    """Read an obfuscated date field and decode it to Y-M-D.

    The page renders digits with a substitution font; trans() (defined
    later in this file) maps the raw digits back.  Returns "-" when the
    element is missing or decoding fails.
    """
    try:
        raw = driver.find_element_by_xpath(xpath).text
        return trans(raw[:4]) + '-' + trans(raw[5:7]) + '-' + trans(raw[8:10])
    except Exception:
        return "-"


def _collect_matches(pattern, fragments):
    """Concatenate all *pattern* matches found in *fragments*.

    Each match is prepended (matching the original loop), so the result
    lists matches in reverse discovery order, space-separated; empty
    string when nothing matches.
    """
    acc = ""
    for frag in fragments:
        for hit in pattern.findall(str(frag)):
            acc = hit + " " + acc
    return acc


# 授權操作
def operationAuth(driver, name):
    """Search *name* via m.baidu.com, scrape the resulting tianyancha
    company page, and insert the profile into MySQL.

    NOTE(review): the INSERT also reads the module-level global
    ``search_name`` set by ``__main__`` — fragile coupling; verify
    before reusing this function elsewhere.
    """
    print("××××××××××××××××××××××××××××××××××××××××開始搜索采集數據××××××××××××××××××××××××××××××××××××××××")
    driver.get("https://m.baidu.com")

    # Type the query into the mobile Baidu search box and submit.
    input = driver.find_element_by_id('index-kw')
    input.send_keys(name)
    input.send_keys(Keys.ENTER)

    try:
        # 點擊進入所需鏈接 — click the first organic search result.
        driver.find_element_by_xpath(
            "//div[@id='page']/div[@id='page-bd']/div[@id='results']"
            "/div[@class='result c-result'][1]/div[@class='c-container']").click()
    except Exception:
        print("未找到相關數據!")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!請注意!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    # 切換窗口 — the click may open a new tab; use the newest window.
    # (switch_to.window replaces the deprecated switch_to_window.)
    windows = driver.window_handles
    driver.switch_to.window(windows[-1])

    # Parse the full page once; BeautifulSoup serves the img/span scans.
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # --- company logo URL (third <img> on the page) ---
    try:
        src = soup.find_all("img")[2]
        # e.g. data-src="https://img.tianyancha.com/logo/...@!f_200x200"
        quoted = re.findall(r'"https://.*?"', str(src))[0]
        logo1 = re.sub('"', " ", quoted).strip()
    except Exception:
        logo1 = "-"
    print(logo1)

    # --- 公司名稱 ---
    company = _find_text(
        driver,
        "//div[@class='box']/div[@class='content']/div[@class='header']/h1[@class='name']",
        default='無數據')

    # All <span> tags; phone/email sit at fixed indices in this layout.
    span_lists = soup.find_all("span")

    # --- 電話 (spans 8 and 9); "-" when the spans are missing ---
    try:
        phone = _collect_matches(_PHONE_RE, [span_lists[8], span_lists[9]])
    except Exception:
        phone = "-"

    # --- 郵箱 (spans 12 and 13) ---
    try:
        email = _collect_matches(_EMAIL_RE, [span_lists[12], span_lists[13]])
    except Exception:
        email = "-"

    # --- remaining profile fields, one XPath each ("-" when missing) ---
    # 網址
    page = _find_text(driver, "//div[@class='detail ']/div[@class='f0'][2]/div[@class='in-block'][1]/span[2]")
    # 簡介
    content = _find_text(driver, "//div[@class='content']/div[@class='detail ']/div[@class='summary']/span[2]")
    # 法定代表人
    represent = _find_text(driver, "//div[1]/div[@class='humancompany']/div[@class='name']/a[@class='link-click']")
    # 注冊時間 (digit-obfuscated date)
    register_time = _decode_date(driver, "//table[@class='table']/tbody/tr[2]/td/div[2]/text[@class='tyc-num lh24']")
    # 注冊號
    register_num = _find_text(driver, _TABLE + "//tr[1]/td[2]")
    # 組織機構代碼
    code = _find_text(driver, _TABLE + "//tr[1]/td[4]")
    # 統一信用代碼
    social_code = _find_text(driver, _TABLE + "//tr[2]/td[2]")
    # 公司類型
    company_type = _find_text(driver, _TABLE + "//tr[2]/td[4]")
    # 所屬行業
    trade = _find_text(driver, _TABLE + "//tr[3]/td[4]")
    # 營業期限
    deadline = _find_text(driver, _TABLE + "//tr[4]/td[2]")
    # 核准日期 (digit-obfuscated date)
    right_day = _decode_date(driver, _TABLE + "/tbody/tr[4]/td[4]/text[@class='tyc-num lh24']")
    # 納稅人資質
    qualification = _find_text(driver, _TABLE + "//tr[5]/td[2]")
    # 人員規模
    pscale = _find_text(driver, _TABLE + "//tr[5]/td[4]")
    # 實繳資本
    paid = _find_text(driver, _TABLE + "//tr[6]/td[2]")
    # 登記機關
    registration_authority = _find_text(driver, _TABLE + "//tr[6]/td[4]")
    # 參保人數
    Insured_number = _find_text(driver, _TABLE + "//tr[7]/td[2]")
    # 英文名稱
    E_name = _find_text(driver, _TABLE + "//tr[7]/td[4]")

    # 地址 — the original strips the last 4 characters; presumably a
    # trailing link caption appended to the cell text — TODO confirm.
    raw_addr = _find_text(driver, _TABLE + "//tr[8]/td[2]", default=None)
    addr = raw_addr[:-4] if raw_addr is not None else "-"

    # 經營範圍
    scope = _find_text(
        driver,
        _TABLE + "//tr[9]/td[2]/span[@class='select-none']"
                 "/span[@class='js-shrink-container']/span[@class='js-split-container']"
                 "/span[@class='tyc-num']/text[@class='tyc-num lh24']")

    print("××××××××××××××××××××××××××××××××××××××××開始入庫××××××××××××××××××××××××××××××××××××××××")
    # 將數據存入數據庫
    # NOTE(review): credentials are hard-coded — move to config for real use.
    conn = connect(host="192.168.113.129",
                   port=3306,
                   database="datas",
                   user="root",
                   password="123456",
                   charset="utf8")
    cursor = conn.cursor()
    try:
        # Parameterized INSERT: values are bound by the driver, never
        # string-formatted into the SQL.
        insertsql = "insert into tianyancha_datas_test3(search_name,company,logo,phone,email,page,content,represent,register_time,register_num,code,social_code,company_type,trade,deadline,right_day,qualification,pscale,paid,registration_authority,Insured_number,E_name,addr,scope)" \
                    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cursor.execute(insertsql, [search_name, company, logo1, phone, email, page,
                                   content, represent, register_time, register_num,
                                   code, social_code, company_type, trade, deadline,
                                   right_day, qualification, pscale, paid,
                                   registration_authority, Insured_number, E_name,
                                   addr, scope])
        print("————————————————————————————————————————入庫成功————————————————————————————————————————")
    except Exception:
        print('————————————————————————————————————————入庫失敗————————————————————————————————————————')
    finally:
        # Always release the cursor and commit/close, matching the
        # original unconditional close/commit/close sequence.
        cursor.close()
        conn.commit()
        conn.close()
# 字符轉換 — display-font digit decoding.
# The site appears to render these numbers with a substitution font, so the
# raw digit in the HTML differs from the digit shown on screen; this table
# is taken verbatim from the original if/elif chain (direction assumed from
# that code — TODO confirm against a live page).
_FONT_DIGIT = {'0': '5', '1': '2', '2': '9', '3': '0', '4': '6',
               '5': '4', '6': '7', '7': '3', '8': '1', '9': '8'}


def trans(A):
    """Decode the obfuscated digit string *A*.

    Each digit is mapped through _FONT_DIGIT; non-digit characters are
    dropped (the original chain had no else branch, so they were never
    appended).  Replaces a per-character re.compile/re.sub chain with a
    single table lookup.

    Returns:
        The decoded string (possibly empty).
    """
    return ''.join(_FONT_DIGIT[ch] for ch in A if ch in _FONT_DIGIT)


# 存到本地,暫未設置 — not wired into the main flow yet.
def log_pic(logo, name):
    """Download the image at URL *logo* to ./log_pics/<name>.jpg.

    NOTE(review): no error handling — a bad URL or a missing ./log_pics
    directory raises.  Uses the third-party `requests` package already
    imported at the top of the file.
    """
    html = requests.get(logo)
    with open('./log_pics/' + name + '.jpg', 'wb') as file:
        file.write(html.content)


# 方法主入口 — scrape a fixed list of company names.
if __name__ == '__main__':
    # 加啟動配置
    driver = openChrome()
    list1 = ['長沙國盛動力設備有限公司', '長沙大禹建築防水工程有限公司株洲分公司', '長沙大運金屬材料有限公司']
    for name in list1:
        print('*' * 100)
        print(name)
        # operationAuth reads this module-level global when inserting rows.
        search_name = name
        # Query suffix steers the Baidu search towards the tianyancha page.
        operationAuth(driver, name=name + "_【信用信息_訴訟信息_財務信息_...")
    # Total wall-clock time since module import.
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60,
        time_elapsed % 60))