替換賬號密碼,模擬微博登錄並爬取評論下的所有圖片評論
寫着玩的,用的是selenium,還沒來得及加phantomjs,沒用函數,一順寫下來的,寫的比較亂,效率也不是太高,見諒
純粹瞎搞,湊活能用
# -*- coding: utf-8 -*-
"""Log in to Sina Weibo with Selenium and save the images found in the
comments of a single Weibo post.

Created on Tue Nov 27 16:25:29 2018

@author: Administrator
"""

import os
import re
import time
import urllib.request

# NOTE: selenium and bs4 are imported inside the functions that use them so
# that the pure helpers in this file can be imported (and tested) on a machine
# that has no browser-automation stack installed.

# Login credentials -- replace with your own before running.
USERNAME = '你的賬號'
PASSWORD = '你的密碼'

LOGIN_URL = 'http://login.sina.com.cn/'
# Path of the chromedriver executable -- adjust to your own installation.
CHROMEDRIVER_PATH = r'C:\Users\Administrator\AppData\Local\CentBrowser\Application\chromedriver.exe'
# The Weibo post whose comment images are downloaded.
POST_URL = 'https://weibo.com/1740806873/Guauhitiu?refer_flag=1001030103_&type=comment#_rnd1543317436501'
# Directory (relative to the working directory) that receives the images.
OUTPUT_DIR = 'wbpic'

SCROLL_TO_BOTTOM_JS = 'var q=document.documentElement.scrollTop=100000'
LOGIN_BUTTON_XPATH = "//input[@class='W_btn_a btn_34px']"
MORE_COMMENTS_XPATH = "//span[@class='more_txt']"
# Matches the `src` of comment thumbnails (protocol-relative, thumb180, .jpg).
THUMBNAIL_SRC = re.compile(r"\/\/.*thumb180.*\.jpg")


def build_download_url(src):
    """Return the download URL for a thumbnail ``src`` attribute.

    A protocol-relative address (``//host/path``) is given an ``https:``
    scheme; an address that already has a scheme is left untouched, so it is
    no longer mangled into ``https:https://...``.  The ``thumb180`` segment is
    replaced by ``bmiddle`` (presumably the larger rendition of the same
    picture -- TODO confirm against the image host).
    """
    if src.startswith('//'):
        src = 'https:' + src
    return src.replace('thumb180', 'bmiddle')


def log_in(browser):
    """Fill in and submit the Sina login form in ``browser``."""
    browser.get(LOGIN_URL)
    browser.find_element_by_name("username").send_keys(USERNAME)
    browser.find_element_by_name("password").send_keys(PASSWORD)
    # Long pause before submitting; presumably this leaves time to solve a
    # captcha by hand -- TODO confirm.
    time.sleep(20)
    # The login button has no `name` attribute, so it is located by XPath.
    browser.find_element_by_xpath(LOGIN_BUTTON_XPATH).click()
    time.sleep(2)


def load_all_comments(browser, max_clicks=1000):
    """Scroll the open post and expand its comments until none are left.

    The page is scrolled to the bottom three times, then the "view more"
    element is clicked repeatedly.  Expansion stops as soon as the element
    can no longer be found or clicked, or after ``max_clicks`` clicks as a
    safety cap against a button that never goes away.
    """
    from selenium.common.exceptions import WebDriverException

    for _ in range(3):
        browser.execute_script(SCROLL_TO_BOTTOM_JS)
        time.sleep(5)

    for _ in range(max_clicks):
        try:
            browser.find_element_by_xpath(MORE_COMMENTS_XPATH).click()
        except WebDriverException:
            # No (clickable) "view more" element left: everything is loaded.
            break
        time.sleep(5)
    print('加載完成')


def collect_image_urls(page_source):
    """Return the download URLs of all comment thumbnails in ``page_source``.

    Duplicates are dropped while keeping first-seen order, since the same
    URL would just be written to the same file name again.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(page_source, 'lxml')
    thumbnails = soup.find_all('img', {"src": THUMBNAIL_SRC})
    urls = (build_download_url(img.attrs['src']) for img in thumbnails)
    return list(dict.fromkeys(urls))


def download_images(urls, target_dir):
    """Download every URL in ``urls`` into ``target_dir``.

    The directory is created when missing (an existing one is reused).  Each
    file is named after the last path segment of its URL.  A download that
    fails is reported and skipped; the file is only created once the data has
    been fetched, so failures leave no empty files behind.
    """
    os.makedirs(target_dir, exist_ok=True)
    for url in urls:
        filename = url.split('/')[-1]
        try:
            with urllib.request.urlopen(url, timeout=30) as response:
                payload = response.read()
        except OSError as err:  # URLError/HTTPError are OSError subclasses
            print('Failed to download %s: %s' % (url, err))
            continue
        with open(os.path.join(target_dir, filename), 'wb') as out:
            out.write(payload)


def main():
    """Log in, load the post's comments and download their images."""
    from selenium import webdriver

    # Selenium 3 style API (positional driver path, find_element_by_*), kept
    # as in the original script; newer Selenium releases may need adapting.
    browser = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        log_in(browser)
        browser.get(POST_URL)
        load_all_comments(browser)
        page_source = browser.page_source
    finally:
        # Always shut the browser and its driver down, even after an error.
        browser.quit()

    download_images(collect_image_urls(page_source), OUTPUT_DIR)


if __name__ == '__main__':
    main()