正則表達式抓取文件內容中的http鏈接地址

本文轉載自查看原文 2016-09-09 00:58 5839 Small tools/ Regex Notes/ IO Notes/ Java SE Learning Process

 import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//正則表達式抓取網頁數據

/**
 * Small utility that scans text with regular expressions and writes every
 * match (http/https links or e-mail addresses) to an output file, one match
 * per line terminated by CRLF.
 *
 * <p>Text is decoded and encoded with the platform default charset.
 */
public class HtmlAddressCatch {

    /** Matches links of the form {@code http(s)://label.label.label}. */
    private static final Pattern HTTP_LINK = Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

    /** Matches simple e-mail addresses of the form {@code name@host.tld}. */
    private static final Pattern EMAIL = Pattern.compile("\\w+@\\w+\\.\\w+");

    /** File that receives the links found by {@link #getWebTextContent(String)}. */
    private static final String WEB_OUTPUT_FILE = "D:\\text.txt";

    /**
     * Demo entry point: fetches a sample page and stores the links it contains.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
        HtmlAddressCatch.getWebTextContent(webaddress);
        // To scan a local file for e-mail addresses instead:
        //   HtmlAddressCatch.getLocalTextContent("D:\\test\\test.html", "D:\\test\\http.txt");
    }

    /**
     * Downloads the page at the given address and writes every http/https
     * link found in it to {@code D:\text.txt}.
     *
     * <p>I/O failures (malformed URL, unreachable host, unwritable output
     * file) are reported on standard error and otherwise ignored.
     *
     * @param webaddress URL of the page to scan; must use the http or https scheme
     */
    public static void getWebTextContent(String webaddress) {
        HttpURLConnection con = null;
        try {
            URL url = new URL(webaddress);
            con = (HttpURLConnection) url.openConnection();
            // InputStreamReader turns the byte stream into characters; BufferedReader
            // adds buffering and line-oriented reads on top of it.
            try (FileOutputStream file = new FileOutputStream(WEB_OUTPUT_FILE);
                 BufferedReader packetreader =
                         new BufferedReader(new InputStreamReader(con.getInputStream()))) {
                writeMatches(packetreader, file, HTTP_LINK);
            }
        } catch (IOException e) {
            // MalformedURLException and FileNotFoundException are IOExceptions too.
            e.printStackTrace();
        } finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Scans a local text/HTML file and writes every e-mail address found in it
     * to the target file.
     *
     * <p>I/O failures (missing input, unwritable output) are reported on
     * standard error and otherwise ignored.
     *
     * @param localaddress  path of the file to scan
     * @param targetaddress path of the file that receives the matches; it is
     *                      created or truncated
     */
    public static void getLocalTextContent(String localaddress, String targetaddress) {
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(new FileInputStream(localaddress)));
             FileOutputStream writer = new FileOutputStream(targetaddress)) {
            // Swap EMAIL for HTTP_LINK here to collect links instead of addresses.
            writeMatches(reader, writer, EMAIL);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the input line by line and writes each pattern match followed by CRLF.
     *
     * <p>Neither pattern can match across a line break, so scanning whole lines
     * finds every match without the risk of cutting one in half at a buffer
     * boundary.
     */
    private static void writeMatches(BufferedReader reader, FileOutputStream out, Pattern pattern)
            throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher m = pattern.matcher(line);
            while (m.find()) {
                out.write((m.group() + "\r\n").getBytes());
            }
        }
    }

}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 微軟面試題：正則表達式提取鏈接地址 C#中使用正則表達式提取超鏈接地址的集中方法 C#中使用正則表達式提取超鏈接地址的集中方法正則表達式驗證HTTP地址是否合法 PHP正則表達式提取html超鏈接中的href地址使用python的讀取文件內容和正則表達式正則表達式提取url中的ip地址正則表達式 ip地址詳細地址正則表達式 python 自學第二課：使用BeautifulSoup抓取鏈接正則表達式