一、網頁圖片爬取類
package com.yhyl.utils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Crawls gallery pages with jsoup, extracts the image links found inside
 * {@code img_box} elements and stores the images on the local disk.
 *
 * @program: springboot-sqlserver-elasticsearch-api
 * @author: xbwen
 * @create: 2021-07-22 16:40
 **/
@Component
public class HtmlParseUtil {

    /** Crawling stops expanding once at least this many page URLs are known. */
    private static final int MAX_PAGES = 50;

    /** Timeout (ms) used when fetching and parsing an HTML page. */
    private static final int PAGE_TIMEOUT_MS = 30000;

    /** Connect/read timeout (ms) used when downloading a single image. */
    private static final int DOWNLOAD_TIMEOUT_MS = 50000;

    /** Directory prefix the downloaded images are written to. */
    private static final String TARGET_DIR = "E:\\picture\\";

    /**
     * Command-line entry point: crawls from a fixed seed page and downloads
     * every image found on the discovered pages.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched while collecting page URLs
     */
    public static void main(String[] args) throws IOException {
        String url = "https://www.yeitu.com/meinv/xinggan/20180919_14722.html";
        Set<String> urlList = new HashSet<>();
        urlList.add(url);
        HtmlParseUtil crawler = new HtmlParseUtil();
        ExecutorService executorService = Executors.newFixedThreadPool(24);
        try {
            Set<String> parseUrlList = crawler.parseUrl(urlList);
            crawler.parseData(parseUrlList, executorService);
        } finally {
            // Always release the pool: its worker threads are non-daemon, so a
            // crawl failure would otherwise keep the JVM alive forever.
            // shutdown() still lets already-submitted downloads finish.
            executorService.shutdown();
        }
    }

    /**
     * Expands the given set of page URLs by following {@code <a>} links that
     * end in {@code .html}, round by round, until at least {@link #MAX_PAGES}
     * URLs are known or a round discovers nothing new.
     *
     * <p>The passed-in set is extended in place and returned. Only pages
     * discovered in the previous round are fetched in the next one, so no page
     * is requested twice.
     *
     * @param urlList seed page URLs (absolute); extended in place
     * @return the same set instance, containing the seeds plus all discovered page URLs
     * @throws IOException if any page cannot be fetched or a URL is malformed
     */
    public Set<String> parseUrl(Set<String> urlList) throws IOException {
        Set<String> frontier = new HashSet<>(urlList);
        while (urlList.size() < MAX_PAGES && !frontier.isEmpty()) {
            Set<String> discovered = new HashSet<>();
            for (String url : frontier) {
                Document document = Jsoup.parse(new URL(url), PAGE_TIMEOUT_MS);
                for (Element anchor : document.getElementsByTag("a")) {
                    // absUrl resolves relative links against the page URL; a raw
                    // relative href would make new URL(...) fail in the next round.
                    String href = anchor.absUrl("href");
                    if (href.endsWith(".html") && !urlList.contains(href)) {
                        discovered.add(href);
                    }
                }
            }
            urlList.addAll(discovered);
            // An empty frontier ends the loop; without this check a site with no
            // further links would be crawled endlessly.
            frontier = discovered;
        }
        return urlList;
    }

    /**
     * Submits one task per page URL to the executor; each task fetches the
     * page and downloads every {@code <img>} found inside {@code img_box}
     * elements to {@code E:\picture\<alt text>.jpg}.
     *
     * <p>This method returns immediately after submitting the tasks. The
     * returned set is not populated by the tasks and is therefore empty; it is
     * kept for interface compatibility.
     *
     * @param urlList         page URLs to scan for images
     * @param executorService executor that runs the per-page download tasks
     * @return an empty set
     */
    public Set<String> parseData(Set<String> urlList, ExecutorService executorService) {
        Set<String> contents = new HashSet<>();
        for (String url : urlList) {
            executorService.execute(() -> downloadPageImages(url));
        }
        return contents;
    }

    /** Fetches one page and downloads all images inside its {@code img_box} elements. */
    private void downloadPageImages(String pageUrl) {
        Document document;
        try {
            document = Jsoup.parse(new URL(pageUrl), PAGE_TIMEOUT_MS);
        } catch (Exception e) {
            System.err.println("Failed to load page " + pageUrl);
            e.printStackTrace();
            return;
        }
        for (Element box : document.getElementsByClass("img_box")) {
            for (Element imgElement : box.getElementsByTag("img")) {
                String title = imgElement.attr("alt");
                String img = imgElement.absUrl("src");
                if (img.isEmpty()) {
                    // No usable image address on this tag.
                    continue;
                }
                // Handle failures per image so that one broken link does not
                // abort the remaining images of the page.
                try {
                    // HtmlParseUtil htmlParseUtilProxy = SpringContextHolder.getBean(HtmlParseUtil.class);
                    // NOTE: this is a direct call on "this", so it bypasses any Spring
                    // proxy and runs synchronously on the executor thread.
                    downloadPicture(title, img, TARGET_DIR + toFileName(title) + ".jpg");
                    System.out.println("下載完成:" + title + "@" + img);
                } catch (Exception e) {
                    System.err.println("Failed to download " + img + " (page " + pageUrl + ")");
                    e.printStackTrace();
                }
            }
        }
    }

    /** Replaces characters that are not allowed in Windows file names with {@code _}. */
    private static String toFileName(String title) {
        return title.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
    }

    /**
     * Downloads the image at {@code imgUrl} and writes it to {@code path},
     * replacing any existing file. Missing parent directories are created.
     *
     * @param imgTitle title of the image (currently unused)
     * @param imgUrl   absolute URL of the image
     * @param path     target file path
     * @throws Exception if the URL is malformed, the request fails or the file cannot be written
     */
    @Async
    public void downloadPicture(String imgTitle, String imgUrl, String path) throws Exception {
        URL url = new URL(imgUrl);
        URLConnection conn = url.openConnection();
        conn.setConnectTimeout(DOWNLOAD_TIMEOUT_MS);
        conn.setReadTimeout(DOWNLOAD_TIMEOUT_MS);
        // Browser-like headers. Deliberately NOT sent:
        //  - if-modified-since / if-none-match: fixed validators can yield a
        //    "304 Not Modified" with an empty body, i.e. a zero-byte file;
        //  - accept-encoding: URLConnection does not decode gzip/deflate/br, so a
        //    compressed response would be written to disk as-is.
        conn.setRequestProperty("accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8");
        conn.setRequestProperty("accept-language", "zh-CN,zh;q=0.9");
        conn.setRequestProperty("referer", "https://www.yeitu.net/");
        conn.setRequestProperty("sec-ch-ua", " Not;A Brand;"+"v=\"99\", \"Google Chrome;"+"v=\"91\", \"Chromium;"+"v=\"91\"");
        conn.setRequestProperty("sec-ch-ua-mobile", "?0");
        conn.setRequestProperty("sec-fetch-dest", "image");
        conn.setRequestProperty("sec-fetch-mode", "no-cors");
        conn.setRequestProperty("sec-fetch-site", "cross-site");
        conn.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36");
        conn.connect();

        File file = new File(path);
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }

        // try-with-resources closes both streams even when the transfer fails.
        // The file is opened in overwrite mode: appending would concatenate a
        // second download onto an existing image and corrupt it.
        try (InputStream is = conn.getInputStream();
             FileOutputStream os = new FileOutputStream(file)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = is.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
        }
    }
}
二、POM依賴文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.5</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.yhyl</groupId>
    <artifactId>springboot-14-leetcode</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>springboot-14-leetcode</name>
    <description>Demo project for Spring Boot</description>
    <properties>
        <java.version>1.8</java.version>
        <spring-cloud.version>2020.0.2</spring-cloud.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Version is supplied by the spring-cloud-dependencies BOM imported below. -->
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-netflix-eureka-server</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.20</version>
        </dependency>
        <!-- HTML parser used by the crawler. Versions before 1.14.2 are affected by
             CVE-2021-37714 (denial of service on crafted input), which matters here
             because the crawler parses untrusted pages. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
        <dependency>
            <groupId>com.zaxxer</groupId>
            <artifactId>HikariCP</artifactId>
        </dependency>
        <dependency>
            <groupId>com.microsoft.sqlserver</groupId>
            <artifactId>mssql-jdbc</artifactId>
            <version>8.4.1.jre8</version>
        </dependency>
        <!-- JDBC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
    </dependencies>
    <!-- Spring Cloud BOM: required so that version-less Spring Cloud dependencies
         (the Eureka server starter above) can be resolved; without it Maven rejects
         the POM because the dependency version is missing. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.cloud</groupId>
                <artifactId>spring-cloud-dependencies</artifactId>
                <version>${spring-cloud.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>
三、SpringBoot上下文配置
package com.yhyl.utils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
/**
 * Keeps the Spring {@link ApplicationContext} in a static field so that beans
 * can be looked up from code that is not itself managed by Spring.
 *
 * @author Jie
 * @date 2019-01-07
 */
@Slf4j
public class SpringContextHolder implements ApplicationContextAware, DisposableBean {

    private static ApplicationContext applicationContext = null;

    /**
     * Stores the context handed over by Spring, logging a warning when a
     * previously stored context is replaced.
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        ApplicationContext previous = SpringContextHolder.applicationContext;
        if (previous != null) {
            log.warn("SpringContextHolder中的ApplicationContext被覆蓋, 原有ApplicationContext為:" + previous);
        }
        SpringContextHolder.applicationContext = applicationContext;
    }

    /**
     * Clears the stored context when this bean is destroyed.
     */
    @Override
    public void destroy() {
        log.debug("清除SpringContextHolder中的ApplicationContext:"
                + applicationContext);
        applicationContext = null;
    }

    /**
     * Looks up a bean by name in the stored context; the result is cast to
     * the type expected at the call site.
     */
    @SuppressWarnings("unchecked")
    public static <T> T getBean(String name) {
        return (T) requireContext().getBean(name);
    }

    /**
     * Looks up a bean by type in the stored context.
     */
    public static <T> T getBean(Class<T> requiredType) {
        return requireContext().getBean(requiredType);
    }

    /**
     * Returns the stored context, failing with an {@link IllegalStateException}
     * when none has been set yet.
     */
    private static ApplicationContext requireContext() {
        if (applicationContext != null) {
            return applicationContext;
        }
        throw new IllegalStateException("applicaitonContext屬性未注入, 請在applicationContext" +
                ".xml中定義SpringContextHolder或在SpringBoot啟動類中注冊SpringContextHolder.");
    }
}
四、SpringBoot啟動類
/**
 * Spring Boot entry point of the application. {@code @EnableAsync} switches on
 * Spring's processing of {@code @Async} methods.
 */
@SpringBootApplication
@EnableAsync
public class LeectCodeApplication {

    /**
     * Registers the {@link SpringContextHolder} so that it receives the
     * application context and static bean lookups become possible.
     */
    @Bean
    public SpringContextHolder springContextHolder() {
        return new SpringContextHolder();
    }

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(LeectCodeApplication.class, args);
    }
}
五、執行結果