// 正則表達式抓取文件內容中的 HTTP 鏈接地址


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts HTTP(S) links or e-mail addresses from text with regular expressions.
 *
 * <p>The text comes either from a web page or from a local file. Every match is written to an
 * output file on its own line, terminated by CRLF.
 */
public class HtmlAddressCatch {

  /** Matches {@code http://} or {@code https://} followed by three dot-separated word groups. */
  private static final Pattern HTTP_LINK_PATTERN = Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

  /** Matches simple e-mail addresses of the form {@code name@host.tld}. */
  private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+\\.\\w+");

  /**
   * Output file used by {@link #getWebTextContent(String)}. The backslash is escaped so that it
   * is a path separator and not the start of a tab escape sequence.
   */
  private static final String DEFAULT_WEB_OUTPUT = "D:\\text.txt";

  /** Terminator appended after every match written to the output file. */
  private static final String MATCH_TERMINATOR = "\r\n";

  /**
   * Demo entry point: extracts the links of a fixed web page into the default output file.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
    HtmlAddressCatch.getWebTextContent(webaddress);
    // Local-file variant:
    // HtmlAddressCatch.getLocalTextContent("D:\\test\\test.html", "D:\\test\\http.txt");
  }

  /**
   * Downloads the page at {@code webaddress} and writes every HTTP(S) link found in it to the
   * default output file.
   *
   * @param webaddress URL of the page to scan; must use the http or https scheme
   */
  public static void getWebTextContent(String webaddress) {
    getWebTextContent(webaddress, DEFAULT_WEB_OUTPUT);
  }

  /**
   * Downloads the page at {@code webaddress} and writes every HTTP(S) link found in it to
   * {@code targetaddress}, one link per line.
   *
   * <p>I/O failures (malformed URL, unreachable host, unwritable target) are reported on
   * standard error and otherwise ignored, so the method never throws a checked exception.
   *
   * @param webaddress URL of the page to scan; must use the http or https scheme
   * @param targetaddress path of the file that receives the links; it is overwritten
   */
  public static void getWebTextContent(String webaddress, String targetaddress) {
    HttpURLConnection con = null;
    try {
      con = (HttpURLConnection) new URL(webaddress).openConnection();
      // NOTE(review): the response body is decoded as UTF-8 regardless of the charset declared
      // by the server; the matched characters are all ASCII, so this only matters for pages in
      // encodings that are not ASCII-compatible.
      // The response is opened before the target so a failed download does not truncate it.
      try (BufferedReader reader =
              new BufferedReader(
                  new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8));
          OutputStream out = new FileOutputStream(targetaddress)) {
        writeMatches(reader, out, HTTP_LINK_PATTERN);
      }
    } catch (IOException e) {
      System.err.println("Failed to extract links from " + webaddress + " into " + targetaddress);
      e.printStackTrace();
    } finally {
      if (con != null) {
        con.disconnect();
      }
    }
  }

  /**
   * Scans the local file {@code localaddress} and writes every e-mail address found in it to
   * {@code targetaddress}, one address per line.
   *
   * <p>The file is read line by line as UTF-8 text, so a match is never cut in half by a read
   * buffer boundary. I/O failures are reported on standard error and otherwise ignored.
   *
   * @param localaddress path of the file to scan
   * @param targetaddress path of the file that receives the addresses; it is overwritten
   */
  public static void getLocalTextContent(String localaddress, String targetaddress) {
    // The input is opened first, so a missing input file does not truncate the target.
    try (BufferedReader reader =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(localaddress), StandardCharsets.UTF_8));
        OutputStream out = new FileOutputStream(targetaddress)) {
      writeMatches(reader, out, EMAIL_PATTERN);
    } catch (IOException e) {
      System.err.println(
          "Failed to extract addresses from " + localaddress + " into " + targetaddress);
      e.printStackTrace();
    }
  }

  /** Writes every match of {@code pattern} in the lines of {@code reader} to {@code out}. */
  private static void writeMatches(BufferedReader reader, OutputStream out, Pattern pattern)
      throws IOException {
    String line;
    // readLine() alone drives the loop; mixing it with read() would drop the first character
    // of each line and could pass a null line to the matcher at end of input.
    while ((line = reader.readLine()) != null) {
      Matcher matcher = pattern.matcher(line);
      while (matcher.find()) {
        out.write((matcher.group() + MATCH_TERMINATOR).getBytes(StandardCharsets.UTF_8));
      }
    }
  }
}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM