selenium调用浏览器进行抓取页面

本文转载自查看原文 2018-03-09 17:40 950 分享

环境：win10+jdk1.8+eclipse

创建maven项目，配置pom.xml

<!-- Maven build descriptor for the Selenium + PhantomJS page-scraping example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>Slenium-phantomjs</groupId>
    <artifactId>Slenium-phantomjs</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <build>
        <!-- Sources live directly under src/ instead of Maven's default src/main/java. -->
        <sourceDirectory>src</sourceDirectory>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <!-- Compile for Java 8, matching the JDK 1.8 environment stated in the article. -->
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <!-- Selenium Java client. -->
        <!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.5.3</version>
        </dependency>
        <!-- Apache HttpClient; "compile" is already Maven's default scope. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.4</version>
            <scope>compile</scope>
        </dependency>
        <!-- SLF4J binding that routes logging to log4j 1.2. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
        <!-- Remote driver, declared explicitly with the same version as selenium-java. -->
        <!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-remote-driver -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-remote-driver</artifactId>
    <version>3.5.3</version>
</dependency>
        
        <!-- WebDriver implementation for the PhantomJS headless browser. -->
        <!-- https://mvnrepository.com/artifact/com.codeborne/phantomjsdriver -->
<dependency>
    <groupId>com.codeborne</groupId>
    <artifactId>phantomjsdriver</artifactId>
    <version>1.4.3</version>
</dependency>

        <!-- jsoup HTML parser, used to parse the page source returned by the driver. -->
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>


    </dependencies>
</project>

编写实现类

/**
 * Example of scraping a JavaScript-rendered page with Selenium driving PhantomJS,
 * plus a helper that renders a page to an image by invoking the PhantomJS binary directly.
 *
 * <p>Relies on the project helpers {@code PhantomJsUtil}, {@code SleepUtil} and
 * {@code Constant}, which are defined outside this class.
 */
public class SeleniumTest {

    /** Page loaded by {@link #getHtmlByPhantomjs()}. */
    public static String url = "http://weibo.com/login.php";

    /**
     * Entry point: runs the page-scraping example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        getHtmlByPhantomjs();
    }

    /**
     * Loads {@link #url} in a PhantomJS-backed {@code WebDriver}, waits until the element
     * with id {@code pl_unlogin_home_feed} is present, then parses the page source with
     * Jsoup and prints both its HTML and its plain text to standard output.
     *
     * <p>Any exception is printed and swallowed so that the driver is always shut down.
     *
     * @author micro
     * @since JDK 1.8
     */
    public static void getHtmlByPhantomjs() {
        WebDriver webDriver = null;
        try {
            webDriver = PhantomJsUtil.getPhantomJs();
            webDriver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
            webDriver.get(url);

            // Fixed pause before inspecting the page
            // (presumably 5 seconds, going by the constant's name; confirm in Constant).
            SleepUtil.sleep(Constant.SEC_5);
            PhantomJsUtil.screenshot(webDriver);

            // Explicit wait (up to 10 seconds) for the target element to appear in the DOM.
            WebDriverWait wait = new WebDriverWait(webDriver, 10);
            String inputId = "pl_unlogin_home_feed";
            wait.until(ExpectedConditions.presenceOfElementLocated(By.id(inputId)));

            Document document = Jsoup.parse(webDriver.getPageSource());
            System.out.println(document.html());
            System.out.println(document.text());
            // TODO extract the remaining page content with the usual Jsoup selectors
        } catch (Exception e) {
            // TODO: handle exception
            e.printStackTrace();
        } finally {
            if (webDriver != null) {
                // quit() closes every window and ends the session; a preceding close()
                // on the only window can invalidate the session and make quit() fail.
                webDriver.quit();
            }
        }
    }

    /**
     * Renders {@code http://www.baidu.com} to {@code D:/baidu.png} by running the PhantomJS
     * executable with the {@code D:/screenshot.js} script.
     *
     * <p>The child's output (stdout and stderr merged) is drained until end of stream so the
     * process cannot block on a full pipe; the method then waits for the process to exit and
     * reports success only when the exit code is 0.
     */
    public void getScreenShot() {
        // Explicit argument list: no reliance on whitespace tokenizing of a command string.
        ProcessBuilder builder = new ProcessBuilder(
                "D:/develop_software/phantomjs/bin/phantomjs.exe", // path to phantomjs.exe
                "D:/screenshot.js",                                // rendering script
                "http://www.baidu.com",                            // target URL
                "D:/baidu.png");                                   // output image path
        builder.redirectErrorStream(true);

        Process process = null;
        try {
            process = builder.start();
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(process.getInputStream()))) {
                // Drain and discard the output; the reader is closed only after EOF.
                while (reader.readLine() != null) {
                    // intentionally empty
                }
            }
            int exitCode = process.waitFor();
            if (exitCode == 0) {
                System.out.println("渲染成功...");
            } else {
                System.err.println("phantomjs exited with code " + exitCode);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            if (process != null) {
                process.destroy();
            }
        }
    }

}

上述方法分别实现了网页的抓取和快照的生成，然后具体的规则需要根据某些网站的排版编写css规则或xpath，来精确获取文本内容。

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找 Selenium_Chrome浏览器调用 selenium之浏览器页面控制（python） java，利用Selenium调用浏览器，动态模拟浏览器事件，动态获取页面信息 Python对Selenium调用浏览器进行封装包括启用无头浏览器，及对应的浏览器配置文件 vscode调用浏览器调试页面 curl模拟浏览器进行phpQuery抓取数据 Selenium调用使用360浏览器,QQ浏览器,遨游浏览器,猎豹浏览器,Chromium Selenium+Python浏览器调用:Firefox Selenium+Python浏览器调用:IE Selenium 调用IEDriverServer打开IE浏览器