我們使用 webmagic 爬取網站時,最大的難點不是 webmagic 本身的使用,而是各大網站的反爬蟲機制,比如內容需要登錄後才可見,或者限制單個 IP 一天中的訪問次數和訪問頻率。今天我們就用 Selenium WebDriver 來實現自動登錄 CSDN,拿到登錄後的 cookies,從而在爬蟲中模擬登錄狀態。
首先在專案的 pom.xml 中加入依賴:
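<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-selenium</artifactId>
    <version>0.7.3</version>
</dependency>

接著編寫登錄程式:

import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.TimeUnit;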
import org.apache.http.client.CookieStore;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
/**
 * Logs in to CSDN through a Selenium-driven Chrome browser and copies the
 * browser's cookies into an Apache HttpClient {@link CookieStore}, so that a
 * later HTTP client can reuse the logged-in session.
 */
public class Main {

    /** Location of the chromedriver executable; adjust for the local machine. */
    private static final String CHROME_DRIVER_PATH = "C:/bin/chromedriver.exe";

    /** CSDN login page that hosts the username/password form. */
    private static final String LOGIN_URL = "https://passport.csdn.net/account/login";

    /** Fixed pause after submitting the form, giving the login redirect time to set cookies. */
    private static final long POST_LOGIN_WAIT_MILLIS = 10000L;

    /**
     * Opens the login page, fills in and submits the form, then reads the
     * browser cookies and converts them into a {@link CookieStore}.
     *
     * @param args unused
     * @throws Exception if the browser cannot be driven or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        WebDriver driver = new ChromeDriver();
        try {
            driver.get(LOGIN_URL);
            // Implicit wait: element lookups below keep polling for up to 5 seconds
            // before failing, which covers form fields that render late.
            driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);

            // Locate the form controls.
            WebElement elemUsername = driver.findElement(By.name("username"));
            WebElement elemPassword = driver.findElement(By.name("password"));
            WebElement btn = driver.findElement(By.className("logging"));
            WebElement rememberMe = driver.findElement(By.id("rememberMe"));

            // Fill in the credentials (placeholders; replace with a real account) and submit.
            elemUsername.clear();
            elemPassword.clear();
            elemUsername.sendKeys("username");
            elemPassword.sendKeys("password");
            rememberMe.click();
            btn.click();

            // Give the login round-trip time to finish before reading cookies.
            Thread.sleep(POST_LOGIN_WAIT_MILLIS);

            Set<Cookie> cookies = driver.manage().getCookies();
            System.out.println("Size: " + cookies.size());

            // The resulting store is what a subsequent HttpClient/crawler would be given.
            CookieStore cookieStore = toCookieStore(cookies);
        } finally {
            // Always shut the browser and the chromedriver process down, even on failure.
            driver.quit();
        }
    }

    /**
     * Converts Selenium cookies into an HttpClient cookie store.
     *
     * @param cookies cookies read from the browser session
     * @return a new store holding one HttpClient cookie per browser cookie
     */
    private static CookieStore toCookieStore(Set<Cookie> cookies) {
        CookieStore cookieStore = new BasicCookieStore();
        for (Cookie cookie : cookies) {
            BasicClientCookie converted = new BasicClientCookie(cookie.getName(), cookie.getValue());
            converted.setDomain(cookie.getDomain());
            converted.setPath(cookie.getPath());
            // Carry over expiry and the secure flag so the copy behaves like the browser's
            // cookie; a null expiry is passed through unchanged.
            converted.setExpiryDate(cookie.getExpiry());
            converted.setSecure(cookie.isSecure());
            cookieStore.addCookie(converted);
        }
        return cookieStore;
    }
}
如此便能拿到登錄後的 cookie。後續需要訪問該網站的其他網頁時,只需將拿到的 cookie 放到請求中,即可「騙過」服務器。