环境:win10+jdk1.8+eclipse
创建maven项目，并配置pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>Slenium-phantomjs</groupId>
  <artifactId>Slenium-phantomjs</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <build>
    <!-- Sources live directly under "src" instead of Maven's default src/main/java -->
    <sourceDirectory>src</sourceDirectory>
    <plugins>
      <!-- Compile for Java 8 -->
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.6.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <!-- Selenium Java bindings: https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>3.5.3</version>
    </dependency>

    <!-- Apache HttpClient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.4</version>
      <scope>compile</scope>
    </dependency>

    <!-- SLF4J binding for log4j 1.2 -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.25</version>
    </dependency>

    <!-- Selenium remote driver: https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-remote-driver -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-remote-driver</artifactId>
      <version>3.5.3</version>
    </dependency>

    <!-- PhantomJS WebDriver: https://mvnrepository.com/artifact/com.codeborne/phantomjsdriver -->
    <dependency>
      <groupId>com.codeborne</groupId>
      <artifactId>phantomjsdriver</artifactId>
      <version>1.4.3</version>
    </dependency>

    <!-- jsoup HTML parser: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.3</version>
    </dependency>
  </dependencies>
</project>
编写实现类
public class SeleniumTest { public static String url = "http://weibo.com/login.php"; public static void main(String[] args) { /*WebDriver webDriver = PhantomJsUtil.getPhantomJs(); PhantomJsUtil.screenshot(webDriver);*/ getHtmlByPhantomjs(); /*WebDriver driver = PhantomJsUtil.getPhantomJs(); driver.get("http://www.itest.info"); String title = driver.getTitle(); System.out.printf(title); driver.close();*/ } /*** * * getHtmlByPhantomjs:(这里用一句话描述这个方法的作用). <br/> * * @author micro * @since JDK 1.8 */ public static void getHtmlByPhantomjs() { // 获取方式 WebDriver webDriver = null; try { webDriver = PhantomJsUtil.getPhantomJs(); webDriver.manage().timeouts().pageLoadTimeout(3,TimeUnit.SECONDS); webDriver.get(url); SleepUtil.sleep(Constant.SEC_5); PhantomJsUtil.screenshot(webDriver); WebDriverWait wait = new WebDriverWait(webDriver, 10); String inputId = "pl_unlogin_home_feed"; wait.until(ExpectedConditions.presenceOfElementLocated(By.id(inputId)));// 开始打开网页,等待输入元素出现 Document document = Jsoup.parse(webDriver.getPageSource()); System.out.println(document.html()); System.out.println(document.text()); // TODO 剩下页面的获取就按照Jsoup获取方式来做 }catch (Exception e) { // TODO: handle exception e.printStackTrace(); } finally { if (webDriver != null) { webDriver.close(); webDriver.quit(); } } } public void getScreenShot() { String BLANK = " "; try { Process process = Runtime.getRuntime().exec( "D:/develop_software/phantomjs/bin/phantomjs.exe" + BLANK //你的phantomjs.exe路径 + "D:/screenshot.js" + BLANK //就是上文中那段javascript脚本的存放路径 + "http://www.baidu.com" + BLANK //你的目标url地址 + "D:/baidu.png");//你的图片输出路径 InputStream inputStream = process.getInputStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String tmp = ""; while ((tmp = reader.readLine()) != null) { if (reader != null) { reader.close(); } if (process != null) { process.destroy(); process = null; } System.out.println("渲染成功..."); } } catch (IOException e) { // TODO Auto-generated catch block 
e.printStackTrace(); } } }
上述方法分别实现了网页的抓取和快照的生成；具体的提取规则则需要根据目标网站的排版编写CSS选择器或XPath，以精确获取文本内容。