GetPageInfo 獲取數據、存入本地、從本地讀取數據
import lombok.SneakyThrows;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.*;
/**
 * Crawls a paginated listing, extracts the text of every element with CSS
 * class {@code alink} and stores the collected text in a local file.
 *
 * <p>The crawled text of all pages is written as one space-separated string;
 * split it on {@code "\\s+"} to get the individual entries back.
 */
public class GetPageInfo {

    /** Directory the crawled data is saved to. */
    private static final String OUTPUT_DIR = "F:\\nistCasData";

    /** Name of the data file inside {@link #OUTPUT_DIR}. */
    private static final String OUTPUT_FILE = "cas.txt";

    /** Number of pages that are buffered in memory between two saves. */
    private static final int PAGES_PER_SAVE = 10;

    /** Charset used for both writing and reading the data file. */
    private static final java.nio.charset.Charset FILE_CHARSET =
            java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Crawls the configured number of pages and saves the extracted text.
     *
     * <p>The first save of a run replaces the data file; later saves append to
     * it, so the file is never re-read and a file left over from an earlier
     * run is never mixed into the new data.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        StringBuilder pending = new StringBuilder(); // text not yet written to disk
        boolean append = false;                      // false until the first save of this run
        int pages = 10;                              // number of pages to crawl
        for (int i = 1; i <= pages; i++) {
            String url = "https://www.xxx.com.cn/list/?p=" + i;
            String cas = getCas(url);
            System.out.println("第" + i + "頁數據獲取成功");
            // A single space separates the text of consecutive pages.
            pending.append(cas).append(" ");
            // Save once every PAGES_PER_SAVE pages to limit the number of writes.
            if (i % PAGES_PER_SAVE == 0) {
                writeToFile(pending.toString(), OUTPUT_DIR, OUTPUT_FILE, append);
                System.out.println("第" + i / PAGES_PER_SAVE + "次保存成功");
                pending.setLength(0);
                append = true;
            }
        }
        // Save the pages left over when the page count is not a multiple of PAGES_PER_SAVE.
        if (pending.length() > 0) {
            writeToFile(pending.toString(), OUTPUT_DIR, OUTPUT_FILE, append);
        }
    }

    /**
     * Fetches one page and returns the combined text of all elements with CSS
     * class {@code alink}.
     *
     * <p>SECURITY: before the request, certificate and host-name verification
     * are switched off for the whole JVM via {@code HttpsUrlValidator}. This is
     * what the original code relied on to reach hosts with broken certificates;
     * do not use it for anything that handles sensitive data.
     *
     * @param url address of the page to fetch
     * @return text of the matching elements as produced by jsoup
     */
    @SneakyThrows
    public static String getCas(String url) {
        // Install the trust-all defaults directly. The previous call to
        // HttpsUrlValidator.retrieveResponseFromServer(url) had the same effect
        // but also downloaded the whole page a second time and discarded it.
        HttpsUrlValidator.trustAllHttpsCertificates();
        javax.net.ssl.HttpsURLConnection.setDefaultHostnameVerifier(HttpsUrlValidator.hv);

        // Request headers copied from a real browser session.
        Connection.Response response = Jsoup
                .connect(url)
                .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                .header("Accept-Encoding","*/*")
                .header("Accept-Language","zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja-JP;q=0.6,ja;q=0.5,ko-KR;q=0.4,ko;q=0.3")
                .header("Connection","keep-alive")
                .header("Content-Type","application/json;charset=UTF-8")
                .header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36")
                .timeout(10000) // milliseconds
                .ignoreContentType(true)
                .execute();
        Document doc = Jsoup.parse(response.body());
        return doc.select(".alink").text();
    }

    /**
     * Writes {@code result} to a file, replacing any previous content.
     *
     * @param result      text to write (encoded as UTF-8)
     * @param outPath     target directory; created if it does not exist
     * @param outFileName name of the file inside {@code outPath}
     * @throws Exception if the directory cannot be created or the file cannot be written
     */
    public static void writeOcrStrtoFile(String result, String outPath, String outFileName) throws Exception {
        writeToFile(result, outPath, outFileName, false);
    }

    /**
     * Reads a text file and returns its content with all line terminators removed.
     *
     * <p>Read errors are reported on standard error and whatever was read up
     * to that point (possibly an empty string) is returned.
     *
     * @param fileName path of the file to read (decoded as UTF-8)
     * @return the concatenated lines of the file
     */
    public static String readFileByLines(String fileName) {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), FILE_CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } catch (IOException e) {
            // Best effort by design: callers receive what could be read.
            e.printStackTrace();
        }
        return content.toString();
    }

    /** Writes {@code data} to {@code outPath/outFileName}, appending or replacing as requested. */
    private static void writeToFile(String data, String outPath, String outFileName, boolean append)
            throws IOException {
        File dir = new File(outPath);
        // mkdirs() also returns false when the directory already exists, hence the second check.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Unable to create directory: " + outPath);
        }
        File target = new File(outPath + "/" + outFileName);
        // FileOutputStream creates the file when missing and truncates it unless append is set,
        // so no explicit delete/create is needed; try-with-resources closes it on every path.
        try (FileOutputStream out = new FileOutputStream(target, append)) {
            out.write(data.getBytes(FILE_CHARSET));
        }
    }
}
忽略https證書(http應該不需要,沒試過)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;
/**
 * Utility that switches off HTTPS certificate and host-name verification and
 * can download the body of a URL as text.
 *
 * <p>SECURITY: {@link #trustAllHttpsCertificates()} and
 * {@link #retrieveResponseFromServer(String)} replace the JVM-wide defaults of
 * {@link HttpsURLConnection}. Every later HTTPS connection that uses those
 * defaults accepts any certificate and any host name. Only use this against
 * hosts whose data does not need to be trusted.
 */
public class HttpsUrlValidator {

    /** Host-name verifier that logs the requested and the peer host and accepts every host. */
    static HostnameVerifier hv = new HostnameVerifier() {
        @Override
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                    + session.getPeerHost());
            return true;
        }
    };

    /**
     * Installs the trust-all defaults and downloads the body of {@code url}.
     *
     * @param url address to fetch
     * @return the response body with every line terminated by {@code "\n"}, or
     *         {@code null} if the URL is malformed or the request fails (the
     *         exception message is printed to standard output)
     */
    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;
        try {
            URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);
            connection = (HttpURLConnection) validationUrl.openConnection();
            // A local builder needs no synchronization.
            final StringBuilder body = new StringBuilder(255);
            // NOTE(review): the body is decoded with the platform default charset, as before;
            // honouring the response's Content-Type charset would be more robust.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line);
                    body.append("\n");
                }
            }
            return body.toString();
        } catch (final Exception e) {
            // Callers rely on null as the failure signal, so the error is only reported.
            System.out.println(e.getMessage());
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Makes {@link HttpsURLConnection} use a socket factory whose trust manager
     * accepts every certificate chain.
     *
     * @throws Exception if the SSL context cannot be created or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = { new miTM() };
        // "TLS" is the standard protocol name; "SSL" is only a legacy name for it.
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager that performs no checks at all. */
    static class miTM implements javax.net.ssl.TrustManager,
            javax.net.ssl.X509TrustManager {

        /** Shared empty result for {@link #getAcceptedIssuers()}. */
        private static final java.security.cert.X509Certificate[] NO_ISSUERS =
                new java.security.cert.X509Certificate[0];

        /**
         * Returns an empty array. The {@code X509TrustManager} contract requires
         * a non-null result, so {@code null} must not be returned here.
         */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return NO_ISSUERS;
        }

        /** Always reports the server chain as trusted. */
        public boolean isServerTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Always reports the client chain as trusted. */
        public boolean isClientTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts every server certificate chain by never throwing. */
        @Override
        public void checkServerTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // intentionally empty
        }

        /** Accepts every client certificate chain by never throwing. */
        @Override
        public void checkClientTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // intentionally empty
        }
    }
}