基於HtmlParser的網絡爬蟲

本文轉載自查看原文 2013-02-03 13:59 2932 HtmlParser/ 信息抽取/ 搜索引擎

一、目標

獲取網頁中的超鏈接及鏈接名，如從http://www.hao123.com/開始，抓取所有hao123鏈接到的超鏈接，再以獲取到的鏈接網頁為目標，獲取它所鏈接到的網頁。

二、環境及開發工具

環境：Java

工具：MyEclipse

開發包：如圖

三、原理

網絡爬蟲是一個自動提取網頁的程序，它為搜索引擎從萬維網上下載網頁，是搜索引擎的重要組成部分。爬蟲從一個或若干個初始網頁的URL開始，獲得初始網頁上的URL，在抓取網頁的過程中，不斷從當前頁面上抽取新的URL放入隊列，直到滿足系統設定的停止條件為止。

而htmlparser能夠很容易地提取網頁中的信息，例如：在HTML中搜索有效信息、提取鏈接、自動為頁面中的鏈接文本加上鏈接標簽、提取資源（如處理圖片、聲音等資源）、將網頁內容保存到本地等等，極大地方便了開發。

四、類分析，如下圖：

各類說明：

FileNameList：內部用linkedlist存放解析出來的xml文件名即每一個鏈接的text

IndexsList：用鏈表（LinkedList）來記錄索引，即每一層的文件數

LinksList：記錄解析過程中的超鏈接

Parse:主要解析功能類

ParseTool：主要負責獲得並初始化解析的參數

ResultBean：封裝要解析的數據，方便寫入xml文件中

UserCase：提供一個用例樣例

WritetoXML:把每一層的解析結果寫入xml文件中

五、類實現如下：

1.FileNameList

 1 /**
 2  * 
 3  */
 4 package cn.guet.deep;
 5 
 6 import java.net.URL;
 7 import java.util.LinkedList;
 8 
 9 /**
10  * @author 梁渝銘
11  * @project name:HTMLParser
12  * @date:2011-11-24
13  */
14 public class FileNameList {
15 
16     /*
17      * 用鏈接來記錄xml文件名
18      */
19     private static LinkedList<String> links = new LinkedList<String>();
20     private static FileNameList fileNameList = new FileNameList();
21     private static int index = 0;
22     // 控制空鏈接名時，用來生成xml文件名
23     private static int flag = 0;
24 
25     private FileNameList() {
26 
27     }
28 
29     public static FileNameList getInstance() {
30         return fileNameList;
31     }
32 
33     /*
34      * 入隊列操作,檢查鏈接名是否為空
35      */
36     public void enQueue(String name) {
37         // 檢查文件名不合法時處理
38         if(links.contains(name)){
39             links.add(name + flag++);
40             return;
41         }
42         if (name != null) {
43             links.addLast(name);
44         }else {
45             links.addLast(("index" + flag++));
46         }
47     }
48 
49     public String next() {
50         return links.get(index++);
51     }
52 
53     public void free() {
54         links.clear();
55     }
56     
57 }

2.IndexsList

 1 /**
 2  * 
 3  */
 4 package cn.guet.deep;
 5 
 6 import java.util.LinkedList;
 7 
 8 /**
 9  * @author 梁渝銘
10  * @project name:HTMLParser
11  * @date:2011-11-21
12  * 記錄每一層的每一個file的索引
13  */
14 public class IndexsList {
15     
16     /*用鏈接來記錄索引
17      */
18     private static LinkedList<Integer> indexList = new LinkedList<Integer>();
19     private static IndexsList indexInstance = new IndexsList();
20     //控制隊列出隊
21     private static int flag = 0;
22     
23     private IndexsList(){
24         
25     }
26     
27     public static IndexsList getInstance(){
28         return indexInstance;
29     }
30     
31     public int next(){
32         return indexList.get(flag++);
33     }
34 
35     /*入隊列操作
36      */
37     public void enQueue(int index){
38         indexList.addLast(index);
39     }
40     
41     public int get(int index){
42         return indexList.get(index);
43     }
44     
45     public void free(){
46         indexList.clear();
47     }
48 }

3.LinksList

 1 /**
 2  * 
 3  */
 4 package cn.guet.deep;
 5 
 6 import java.util.HashSet;
 7 import java.util.LinkedList;
 8 import java.util.Set;
 9 
/**
 * Process-wide FIFO queue of URLs that still have to be crawled.
 *
 * <p>Every URL is accepted at most once between two calls to {@link #free()}: a URL
 * is remembered as soon as it is enqueued, and later attempts to enqueue the same
 * string are ignored. Comparison is by exact string equality (no normalisation of
 * trailing slashes, case or fragments).
 *
 * <p>All state is static and unsynchronized; the class is not thread-safe.
 *
 * @author 梁渝銘 (project HTMLParser, 2011-11-21)
 */
public class LinksList {

    /** URLs waiting to be crawled, oldest first. */
    private static LinkedList<String> links = new LinkedList<String>();
    private static LinksList linksInstance = new LinksList();
    /** Every URL ever accepted since the last free(); prevents repeated visits. */
    private static Set<String> visitedSet = new HashSet<String>();

    private LinksList() {
    }

    /** Returns the single shared instance. */
    public static LinksList getInstance() {
        return linksInstance;
    }

    /**
     * Appends {@code url} to the queue unless the same URL was enqueued before.
     *
     * <p>Callers that enqueue companion data elsewhere for each URL must be aware
     * that a duplicate is dropped silently.
     */
    public void enQueue(String url) {
        // Set.add returns false when the URL is already recorded, i.e. a duplicate.
        if (visitedSet.add(url)) {
            links.addLast(url);
        }
    }

    /**
     * Returns the queued URL at the given position without removing it.
     *
     * @throws IndexOutOfBoundsException if {@code index} is outside the queue
     */
    public String get(int index) {
        return links.get(index);
    }

    /**
     * Removes and returns the oldest queued URL.
     *
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public String next() {
        return links.removeFirst();
    }

    /** Empties the queue and forgets all visited URLs so a new crawl starts clean. */
    public void free() {
        links.clear();
        visitedSet.clear();
    }

}

4.Parse

  1 /**
  2  * 
  3  */
  4 package cn.guet.deep;
  5 
  6 import java.io.IOException;
  7 import java.util.LinkedList;
  8 
  9 import org.apache.log4j.Logger;
 10 import org.htmlparser.NodeFilter;
 11 import org.htmlparser.Parser;
 12 import org.htmlparser.filters.NodeClassFilter;
 13 import org.htmlparser.tags.LinkTag;
 14 import org.htmlparser.util.NodeList;
 15 import org.htmlparser.util.ParserException;
 16 
 17 
 18 /**
 19  * @author 梁渝銘
 20  * @project name:HTMLParser
 21  * @date:2011-11-21 主要解析類
 22  */
 23 public class Parse {
 24     private Logger logger = Logger.getLogger(this.getClass());
 25     /*
 26      * 初始化參數
 27      */
 28     private static LinksList linksList = LinksList.getInstance();
 29     private static IndexsList indexsList = IndexsList.getInstance();
 30     private static WritetoXML writetoXML = new WritetoXML();
 31     private static FileNameList fileNameList = FileNameList.getInstance();
 32     private static int floor = 0;
 33     private static int floor_count = 0;
 34     //每一個超鏈接解析出來的超鏈接的總數
 35     private static int len = 0;
 36     //xml使用的文件名
 37 
 38     private static String path = null;
 39     private static int count = 0;
 40     public Parse(){
 41         indexsList.enQueue(1);
 42     }
 43     
 44     public int getFloor() {
 45         return floor;
 46     }
 47 
 48     public void setFloor(int floor) {
 49         Parse.floor = floor;
 50     }
 51 
 52     public void setUrl(String url) {
 53         fileNameList.enQueue("index");
 54         linksList.enQueue(url);
 55     }
 56 
 57     public void setPath(String path) {
 58         Parse.path = path;
 59     }
 60 
 61     //釋放資源
 62     public void free(){
 63         try {
 64              linksList.free();
 65              logger.info("linksList had released...");
 66              indexsList.free();
 67              logger.info("linksList had released...");
 68              fileNameList.free();
 69              logger.info("linksList had released...");
 70              logger.info("all had released...");
 71         } catch (Exception e) {
 72            logger.error(e.getMessage());
 73         }
 74     }
 75 
 76     // 補全url的地址
 77     public String fillUrl(String domain,String url) {
 78         return url.indexOf("http://") != -1 ? url : domain + url;
 79     }
 80     //替換文件名不支持的字符
 81     public String replaceSpe(String link_name){
 82         link_name = link_name.replaceAll("[?]+", "")
 83         .replaceAll("[&nbsp;]+", "")
 84         .replaceAll("[&amp;]+", "")
 85         .replaceAll("[&lt;]+", "")
 86         .replaceAll("[&gt;]+", "")
 87         .replaceAll("[ ]+", "-");
 88         return link_name;
 89     }
 90     /*
 91      * 解析類，采用NodeFilter過濾
 92      */
 93     public void extractLinks() throws IOException {
 94         try {
 95             while(floor != 0) {
 96                 int temp = indexsList.next();
 97                 floor_count++;
 98                 //統計每一層的文件數
 99                 int file_count = 0;
100                 for (int i = 0; i < temp; i++) {    
101                     logger.info("該層的文件總數："+temp);
102                     //外層循環控制每一次有的文件數    
103                     file_count++;
104                     //每層中的每一個xml文件對應的結果集
105                     LinkedList<ResultBean> resultList = new LinkedList<ResultBean>();;            
106                     NodeFilter filter = new NodeClassFilter(LinkTag.class);
107                     Parser parse = new Parser();
108                     String url = linksList.next();            
109                     try {
110                         //使解析跳過異常，如500，403，404。。。造成解析異常而中止的鏈接
111                         //讓parser繼續解析
112                         logger.info("try parse.....");
113                         parse.setURL(url);
114                         parse.setEncoding(parse.getEncoding());
115                         logger.info("set Encoding....");
116                     } catch (Exception e) {
117                         logger.error(e.getMessage());
118                         e.printStackTrace();
119                         file_count--;
120                         continue;
121                     }
122                     //內層循環控制每一層中的由上一層鏈接解析出來的鏈接
123                     logger.info("before extract list...."+parse.getURL());
124                     NodeList list;
125                     try{
126                        list = parse.extractAllNodesThatMatch(filter);
127                     }catch(ParserException e){
128                         e.printStackTrace();
129                         continue;
130                     }
131                     logger.info("extracting ....");
132                     for(len=0;len<list.size();len++){
133                        LinkTag node = (LinkTag) list.elementAt(len);
134                        String link_name = replaceSpe(node.getLinkText());
135                        String link = fillUrl(url,node.extractLink());
136                       //封裝結果，並寫入xml文件中
137                        ResultBean resultBean = new ResultBean();  
138                        resultBean.setName(link_name);
139                        resultBean.setLink(link);
140                        resultList.add(resultBean); 
141                        fileNameList.enQueue(link_name);
142                        linksList.enQueue(link);        
143                        logger.info("第"+(count++)+"個鏈接, "+"第" +floor_count+"層:"+"第"+file_count+"個文件"+"name: " +node.getLinkText()+"link to: "+link);
144                     }
145                     indexsList.enQueue(len+1);
146                     try{
147                        writetoXML.writeToXML(path, floor_count,fileNameList.next(), resultList);
148                     }catch(Exception e){
149                         e.printStackTrace();
150                         continue;
151                     }
152                    
153                 }
154                 floor--;
155             }
156         } catch (Exception e) {
157             logger.error(e.getMessage());
158             e.printStackTrace();    
159         }
160     }
161 }

5.ParseTool

 1 /**
 2  * 
 3  */
 4 package cn.guet.deep;
 5 
 6 import java.io.IOException;
 7 
 8 import org.htmlparser.Parser;
 9 
10 /**
11  * @author 梁渝銘
12  * @project name:HTMLParser
13  * @date:2011-11-21
14  * 解析工具類
15  */
16 public class ParseTool {
17     
18     private Parse parse = new Parse();
19     
20     //開始解析 
21     public void parse() throws IOException{
22         parse.extractLinks();    
23     }
24     
25     //設置解析的url
26     public void setUrl(String url){
27         parse.setUrl(url);
28     }
29     
30     //設置解析存放路徑
31     public void setPath(String path){
32         parse.setPath(path);
33     }
34     
35     //設置要解析的層數
36     public void setFloor(int floor){
37         parse.setFloor(floor);
38     }
39     
40     //釋放資源
41     public void free(){
42         parse.free();
43     }
44 
45 }

6.WritetoXML

  1 /**
  2  * 
  3  */
  4 package cn.guet.deep;
  5 
  6 import java.io.File;
  7 import java.io.FileOutputStream;
  8 import java.io.IOException;
  9 import java.util.LinkedList;
 10 
 11 import org.apache.log4j.Logger;
 12 import org.dom4j.Document;
 13 import org.dom4j.DocumentHelper;
 14 import org.dom4j.Element;
 15 import org.dom4j.io.OutputFormat;
 16 import org.dom4j.io.XMLWriter;
 17 
 18 /**
 19  * @author 梁渝銘
 20  * @project name:HTMLParser
 21  * @date:2011-11-23 
 22  * 把解析得到的鏈接按層寫入xml文件中
 23  */
 24 public class WritetoXML {
 25     private Logger logger = Logger.getLogger(this.getClass());
 26 
 27     /*
 28      * 存放的xml樣式 ：
 29      * <links> 
 30      *      <link_sumary>
 31      *          <link_number>每個xml中總共的鏈接數</link_number>
 32      *      </link_sumary>
 33      *      <link> 
 34      *         <name> links_name </name> 
 35      *         <address> link_address <address>
 36      *      </link>   
 37      *      ...  
 38      *  </links>
 39      */
 40      private static int index = 0;
 41     /*
 42      * path表示xml文件存放的路徑，floor表示文件的層次,link_name作為xml文件名
 43      */
 44     public void writeToXML(String path, int floor, String link_name,
 45             LinkedList<ResultBean> links) throws IOException {
 46         //創建一個新的xml文檔
 47         Document document = DocumentHelper.createDocument();
 48         Element root = document.addElement("links");
 49         Element link_summary = root.addElement("link_summary");
 50         Element link_number = link_summary.addElement("link_number");
 51         link_number.addText(String.valueOf(links.size()));
 52         /*
 53          * 通過循環將解析結果集中的對象數據轉換成xml節點
 54          */
 55         for (int i = 0; i < links.size(); i++) {
 56             Element link = root.addElement("link");
 57             Element name = link.addElement("name");
 58             Element address = link.addElement("address");
 59             name.addText(links.get(i).getName());
 60             address.addText(links.get(i).getLink());
 61         }
 62         path = handlePath(path, floor, link_name);
 63         documentToXML(document, path);
 64     }
 65 
 66     /*
 67      * 把鏈接寫入xml中
 68      */
 69 
 70     public void documentToXML(Document document, String filePath) {
 71 
 72         // 使用org.dom4j.io包下的xmlwriter類將dom4j的文檔樹對象轉換為xml輸出
 73         XMLWriter xmlWriter = null;
 74         try {
 75             // 創建有格式和縮進的格式化輸出對象
 76             OutputFormat format = OutputFormat.createPrettyPrint();
 77             format.setEncoding("UTF-8");
 78             // 將新的文件輸出流對象和格式化對象封裝進實例化的xmlwriter對象中
 79             xmlWriter = new XMLWriter(new FileOutputStream(filePath), format);
 80             xmlWriter.write(document);
 81         } catch (IOException e) {
 82             logger.error(e.getMessage());
 83         }finally{
 84             //防止意外而無法關閉資源，造成浪費
 85             try {
 86                 xmlWriter.close();
 87             } catch (IOException e) {
 88                 e.printStackTrace();
 89             }
 90         }
 91     }
 92 
 93     /*
 94      * 按層分文件夾，處理每層的文件名，路徑等，返回一個xml文件的路徑名
 95      * 如：
 96      * D：/test/floor_1/index.xml
 97      */
 98     public String handlePath(String path, int floor, String link_name)
 99             throws IOException {
100         String filePath = path;
101         File file = new File(filePath);
102         //檢查路徑是否存在
103         if (!file.exists()) {
104             file.mkdirs();
105         }
106         filePath = path +"//" + "floor_" + floor; 
107         if(!(file = new File(filePath)).exists()){
108             file.mkdirs();
109         }
110         file = null;
111         if(link_name.equals(""))
112             link_name = "link_name" + index++;
113         return filePath + "//" + link_name + ".xml";
114     }
115 }

六、輸出結果

1.目錄集合，一個文件夾floor_x表示某一爬行層次：

2.一個文件夾存放着某一層爬行過的網頁，一個文件記錄表示一個網頁：

3.一個XML文件記錄一個網頁上的目標鏈接，記錄多個目標鏈接地址：

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 python網絡爬蟲之LXML與HTMLParser Python爬蟲常用之HtmlParser 網絡爬蟲網絡爬蟲網絡爬蟲 python--爬蟲入門（八）體驗HTMLParser解析網頁，網頁抓取解析整合練習爬蟲系列(一) 網絡爬蟲簡介網絡爬蟲_Requests庫網絡爬蟲實戰什么是網絡爬蟲？為什么要選擇Python寫網絡爬蟲？網絡爬蟲之密碼破解