java爬虫-妹子图

本文转载自查看原文 2020-03-09 14:27 1105 爬虫

一、分析

1.选择入口

打开 https://www.mzitu.com/ 主页，我们发现主页有200+页图片；但如果从首页入手，拿到的可能不是全部图片。于是我们打开“每日更新”页面 https://www.mzitu.com/all/ ，从url上看这里应该是所有的图片了；不过该页面上还有一个“早期图片”的超链接 https://www.mzitu.com/old/ ，由此得知这两个url合起来才包含了所有的图片。

2. 技术选型

作为爬虫学习阶段，我们的目标应该是不顾一切把想要的资源爬到手，至于使用java或者使用python，使用Linux还是Windows就有些无关紧要。

技术选型：HttpClient+Jsoup

3.深入分析

思路：

根据 https://www.mzitu.com/all/ 获取所有的album
- 使用HttpClient获取当前页面的html（字符串格式）
- 使用Jsoup解析html，获取每个album的url
获取每个album里的图片
- 根据每个album的url，获取每页的html
- 使用Jsoup解析html，获取图片src属性值
- 下载

第一步

获取所有的album的url

获取图片里的src。这里我们不禁想到：怎么实现下一页呢？我们点击下一页按钮后发现了规律：在album的url后面追加“/页码”就是对应的那一页，例如 https://www.mzitu.com/224497/2 。

至此，我们就可以获取所有的图片src了，但是还有个问题，如果一个album只有51张图片，那我什么时候判断结束？也就是https://www.mzitu.com/224497/52 会出现什么？

判断终止条件

会出现404，找不到页面，这时就应该跳出当前相册，去遍历别的相册了。至此，大致思路已经清晰，上面的问题也都有了答案，接下来就是coding时间了。

二、代码

package com.my.crawler.util;


import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;



/**
 * Small HTTP helper built on Apache HttpClient: fetches pages as text and
 * downloads images to disk.
 *
 * <p>All requests go through one shared client backed by the pooled connection
 * manager {@link #cm}. Every request carries a browser-like {@code User-Agent}
 * and a {@code Referer} header. I/O failures are printed to standard error and
 * reported to the caller as an empty string; they are not rethrown.
 */
public class HttpUtils {

    /** Connection pool shared by every request issued through this class. */
    public static PoolingHttpClientConnectionManager cm = null;

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400";

    /** Referer sent with every request. */
    private static final String REFERER = "https://www.mzitu.com/";

    /** Directory (with trailing separator) that downloaded images are written under. */
    private static final String IMAGE_ROOT = "D:\\images\\";

    /** Timeouts applied to every request; built once because it never changes. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectTimeout(1000) // time allowed to establish the connection, in ms
            .setConnectionRequestTimeout(1000) // time allowed to lease a connection from the pool, in ms
            .setSocketTimeout(10 * 1000) // time allowed between data packets, in ms; slow links may need more
            .build();

    /** Single client reused for all requests instead of building (and never closing) one per call. */
    private static final CloseableHttpClient CLIENT;

    static {
        cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(100); // maximum connections in the whole pool
        cm.setDefaultMaxPerRoute(10); // maximum connections per host
        CLIENT = HttpClients.custom().setConnectionManager(cm).build();
    }

    /**
     * Fetches the document at {@code url} and returns its body decoded as UTF-8.
     *
     * @param url address to request
     * @return the response body, or {@code ""} when the status is not 200, the
     *         response has no body, or an {@link IOException} occurred
     */
    public static String getHtml(String url) {
        HttpGet httpGet = newGet(url);
        String html = "";
        // try-with-resources closes the response, which hands the connection back to the pool.
        try (CloseableHttpResponse response = CLIENT.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                if (httpEntity != null) {
                    html = EntityUtils.toString(httpEntity, "UTF-8");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }

    /**
     * Downloads the image at {@code url} directly into the image root directory.
     *
     * @param url address of the image
     * @return the generated file name (random UUID plus the URL's extension), or
     *         {@code ""} when nothing was saved
     */
    public static String getImg(String url) {
        return download(url, IMAGE_ROOT);
    }

    /**
     * Downloads the image at {@code url} into the sub-directory {@code dir} of the
     * image root directory, creating that directory when it does not exist.
     *
     * @param url address of the image
     * @param dir sub-directory name; it is used verbatim as a path segment, so it
     *            must be a name the file system accepts
     * @return the generated file name (random UUID plus the URL's extension), or
     *         {@code ""} when nothing was saved
     */
    public static String getImg(String url, String dir) {
        return download(url, IMAGE_ROOT + dir + "\\");
    }

    /** Requests {@code url} and, on a 200 response, stores the body as a new file under {@code directoryPath}. */
    private static String download(String url, String directoryPath) {
        HttpGet httpGet = newGet(url);
        String img = "";
        try (CloseableHttpResponse response = CLIENT.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                if (httpEntity != null) {
                    String name = UUID.randomUUID().toString() + extensionOf(url);
                    save(httpEntity, directoryPath, name);
                    // Only report a name once the file has actually been written.
                    img = name;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return img;
    }

    /** Writes {@code entity} to {@code directoryPath + name}, creating the directory first if needed. */
    private static void save(HttpEntity entity, String directoryPath, String name) throws IOException {
        File directory = new File(directoryPath);
        // Re-check isDirectory() after mkdirs() so a concurrent creation is not treated as a failure.
        if (!directory.isDirectory() && !directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Cannot create directory " + directory);
        }
        File target = new File(directoryPath + name);
        // The stream must be closed, otherwise every download leaks a file handle.
        try (OutputStream outputStream = new FileOutputStream(target)) {
            entity.writeTo(outputStream);
        } catch (IOException e) {
            // Best effort: do not leave a truncated image behind.
            target.delete();
            throw e;
        }
    }

    /**
     * Returns the file extension of {@code url} including the leading dot, or
     * {@code ""} when the last path segment has none. Any query string or
     * fragment is ignored.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        // A dot before the last '/' belongs to the host or a directory, not to the file name.
        return dot > path.lastIndexOf('/') ? path.substring(dot) : "";
    }

    /** Builds a GET request for {@code url} with the shared timeouts and headers. */
    private static HttpGet newGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(REQUEST_CONFIG);
        httpGet.setHeader("User-Agent", USER_AGENT);
        httpGet.setHeader("Referer", REFERER);
        return httpGet;
    }

    /** Manual check: requests a page past the end of an album and prints whether the result is empty. */
    public static void main(String[] args) {
        String html = getHtml("https://www.mzitu.com/224497/52");
        System.out.println(html.equals(""));
    }
}

/**
 * Crawler entry point: reads the album index page, then walks every album page
 * by page and downloads the main image of each page.
 */
public class MzituTask {

    /** Pause after each image download, in milliseconds. */
    private static final long DOWNLOAD_INTERVAL_MS = 1000;

    /** Consecutive failures on one page after which the album is abandoned. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Runs the crawl. An album is considered finished when a page comes back
     * empty, which is what {@code HttpUtils.getHtml} returns for a non-200
     * response or a failed request.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Index page listing the albums.
        String url = "https://www.mzitu.com/all/";
        String html = HttpUtils.getHtml(url);
        Document document = Jsoup.parse(html);
        Elements elements = document.select("[target=_blank]");
        for (Element element : elements) {
            String albumUrl = element.attr("href");
            if (albumUrl.isEmpty()) {
                // Matched element carries no link; there is nothing to crawl.
                continue;
            }
            // Album pages are addressed as <albumUrl>/<page>, starting at 1.
            int page = 1;
            int consecutiveFailures = 0;
            while (true) {
                try {
                    String singleUrl = albumUrl+"/"+page;
                    String singleHtml = HttpUtils.getHtml(singleUrl);
                    System.out.println(singleHtml);
                    if (singleHtml.equals("")) {
                        // Past the last page of this album.
                        break;
                    }
                    Document singleDoc = Jsoup.parse(singleHtml);
                    // The picture of the page is the first image inside .main-image.
                    Elements imgElements = singleDoc.select(".main-image img");
                    if (imgElements.size() > 0) {
                        String imgSrc = imgElements.get(0).attr("src");
                        // The link text is used as the name of the album's directory.
                        HttpUtils.getImg(imgSrc,element.text());
                        Thread.sleep(DOWNLOAD_INTERVAL_MS);
                    }
                    page++;
                    consecutiveFailures = 0;
                } catch (InterruptedException e) {
                    // Restore the flag and stop crawling instead of retrying the page.
                    Thread.currentThread().interrupt();
                    return;
                } catch (Exception e) {
                    e.printStackTrace();
                    consecutiveFailures++;
                    // Retrying without a limit would loop forever on a page that always fails.
                    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                        break;
                    }
                }
            }
        }
    }
}

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找 python爬虫-妹子图一个爬虫的练习（妹子图）爬虫爬取妹子图爬虫练习--爬妹子图福利爬虫妹子图之获取种子url Scrapy框架实战-妹子图爬虫 Python爬虫之——爬取妹子图片 Python 爬虫：煎蛋网妹子图煎蛋网妹子图爬虫总结 Python 爬虫入门(二)——爬取妹子图