java搜索---網絡爬蟲實現

本文轉載自查看原文 2012-05-16 20:24 8205 java技術

搜索方面的東西，需要了解網絡爬蟲方面的知識

首先介紹每個類的功能：

DownloadPage.java的功能是下載此超鏈接的頁面源代碼.

FunctionUtils.java 的功能是提供不同的靜態方法，包括：頁面鏈接正則表達式匹配、獲取URL鏈接的元素、判斷是否創建文件、獲取頁面的Url並將其轉換為規范的Url、截取網頁源文件的目標內容。

HrefOfPage.java 的功能是獲取頁面源代碼的超鏈接。

UrlDataHanding.java 的功能是整合各個類，實現從url到獲取數據、再到數據處理的完整流程。

UrlQueue.java 的功能是保存未訪問的Url隊列。

VisitedUrlQueue.java 已訪問過的URL隊列。

下面介紹一下每個類的源代碼：

DownloadPage.java 此類要用到HttpClient組件。

 
 
 
         
  
  
  
          View Code   
  
  
  
           package com.sreach.spider;  
  
  
  
             
  
  
  
           import java.io.IOException;  
  
  
  
           import org.apache.http.HttpEntity;  
  
  
  
           import org.apache.http.HttpResponse;  
  
  
  
           import org.apache.http.client.ClientProtocolException;  
  
  
  
           import org.apache.http.client.HttpClient;  
  
  
  
           import org.apache.http.client.methods.HttpGet;  
  
  
  
           import org.apache.http.impl.client.DefaultHttpClient;  
  
  
  
           import org.apache.http.util.EntityUtils;  
  
  
  
             
  
  
  
           public class DownloadPage  
  
  
  
           {  
  
  
  
             
  
  
  
               /**  
  
  
  
                * 根據URL抓取網頁內容  
  
  
  
                *   
  
  
  
                * @param url  
  
  
  
                * @return  
  
  
  
                */ 
  
  
  
               public static String getContentFormUrl(String url)  
  
  
  
               {  
  
  
  
                   /* 實例化一個HttpClient客戶端 */ 
  
  
  
                   HttpClient client = new DefaultHttpClient();  
  
  
  
                   HttpGet getHttp = new HttpGet(url);  
  
  
  
             
  
  
  
                   String content = null;  
  
  
  
             
  
  
  
                   HttpResponse response;  
  
  
  
                   try 
  
  
  
                   {  
  
  
  
                       /*獲得信息載體*/ 
  
  
  
                       response = client.execute(getHttp);  
  
  
  
                       HttpEntity entity = response.getEntity();  
  
  
  
             
  
  
  
                       VisitedUrlQueue.addElem(url);  
  
  
  
             
  
  
  
                       if (entity != null)  
  
  
  
                       {  
  
  
  
                           /* 轉化為文本信息 */ 
  
  
  
                           content = EntityUtils.toString(entity);  
  
  
  
             
  
  
  
                           /* 判斷是否符合下載網頁源代碼到本地的條件 */ 
  
  
  
                           if (FunctionUtils.isCreateFile(url)  
  
  
  
                                   && FunctionUtils.isHasGoalContent(content) != -1)  
  
  
  
                           {  
  
  
  
                               FunctionUtils.createFile(FunctionUtils  
  
  
  
                                       .getGoalContent(content), url);  
  
  
  
                           }  
  
  
  
                       }  
  
  
  
             
  
  
  
                   } catch (ClientProtocolException e)  
  
  
  
                   {  
  
  
  
                       e.printStackTrace();  
  
  
  
                   } catch (IOException e)  
  
  
  
                   {  
  
  
  
                       e.printStackTrace();  
  
  
  
                   } finally 
  
  
  
                   {  
  
  
  
                       client.getConnectionManager().shutdown();  
  
  
  
                   }  
  
  
  
                     
  
  
  
                   return content;  
  
  
  
               }  
  
  
  
             
  
  
  
           }

FunctionUtils.java 此類的方法均為static方法

 
 
 
         
  
  
  
          View Code   
  
  
  
           
  
  
  
          package com.sreach.spider;  
  
  
  
           
  
  
  
          import java.io.BufferedWriter;  
  
  
  
          import java.io.File;  
  
  
  
          import java.io.FileOutputStream;  
  
  
  
          import java.io.IOException;  
  
  
  
          import java.io.OutputStreamWriter;  
  
  
  
          import java.util.regex.Matcher;  
  
  
  
          import java.util.regex.Pattern;  
  
  
  
           
  
  
  
          public class FunctionUtils
          {

              /**
               * Regular expression a URL must match in full for its page to be saved locally.
               */
              private static final String pat = "http://www\\.oschina\\.net/code/explore/.*/\\w+\\.[a-zA-Z]+";

              private static final Pattern pattern = Pattern.compile(pat);

              /** Shape of a final URL segment that names a file: word characters, a dot, letters. */
              private static final Pattern FILE_NAME_PATTERN = Pattern.compile("\\w+\\.[a-zA-Z]+");

              /** Opening marker of the element whose text is the target content. */
              private static final String GOAL_MARKER = "<pre class=\"";

              /** Closing tag that ends the target content. */
              private static final String GOAL_END = "</pre>";

              /** Root under which downloaded content is stored. */
              private static final String OUTPUT_ROOT = "D:";

              /** Encoding of the files written by {@link #createFile(String, String)}. */
              private static final String OUTPUT_CHARSET = "UTF-8";

              /** Prefix prepended to site-relative links. */
              private static final String SITE_ROOT = "http://www.oschina.net";

              /**
               * Crawl depth. Not read or written anywhere in this class.
               */
              public static int depth = 0;

              /**
               * Splits a URL on "/" to obtain its elements.
               *
               * @param url the URL to split
               * @return the segments between the slashes
               */
              public static String[] divUrl(String url)
              {
                  return url.split("/");
              }

              /**
               * Decides whether the page behind a URL should be saved to a file.
               *
               * @param url the URL to test
               * @return {@code true} when the whole URL matches the link pattern;
               *         {@code false} otherwise, including for {@code null}
               */
              public static boolean isCreateFile(String url)
              {
                  return url != null && pattern.matcher(url).matches();
              }

              /**
               * Writes {@code content} to a ".txt" file whose location mirrors the URL path.
               *
               * The segments between the scheme and the last segment become directories
               * under the output root; the last segment, minus its extension, becomes the
               * file name. Nothing is written when the last segment does not look like a
               * file name or when either argument is {@code null}. I/O failures are
               * reported on standard error and otherwise ignored.
               *
               * @param content text to store
               * @param urlPath URL the text was downloaded from
               */
              public static void createFile(String content, String urlPath)
              {
                  if (content == null || urlPath == null)
                  {
                      return;
                  }

                  /* Split the URL; element 0 is the scheme part and is never used. */
                  String[] elems = divUrl(urlPath);
                  if (elems.length < 2)
                  {
                      return;
                  }

                  String lastElem = elems[elems.length - 1];
                  if (!FILE_NAME_PATTERN.matcher(lastElem).matches())
                  {
                      return;
                  }

                  StringBuffer path = new StringBuffer();
                  for (int i = 1; i < elems.length - 1; i++)
                  {
                      /* Skip empty segments and relative ones that could leave the output root. */
                      if (elems[i].length() == 0 || ".".equals(elems[i]) || "..".equals(elems[i]))
                      {
                          continue;
                      }
                      path.append(elems[i]);
                      path.append(File.separator);
                  }

                  File dir = new File(OUTPUT_ROOT + File.separator + path.toString());
                  if (!dir.exists())
                  {
                      dir.mkdirs();
                  }

                  String[] fileName = lastElem.split("\\.");
                  File file = new File(dir, fileName[0] + ".txt");

                  /* A local writer keeps concurrent callers from sharing one stream. */
                  BufferedWriter out = null;
                  try
                  {
                      out = new BufferedWriter(new OutputStreamWriter(
                              new FileOutputStream(file), OUTPUT_CHARSET));
                      out.write(content);
                      out.flush();
                      System.out.println("創建文件成功");
                  } catch (IOException e)
                  {
                      e.printStackTrace();
                  } finally
                  {
                      /* Close on every path so a failed write does not leak the file handle. */
                      if (out != null)
                      {
                          try
                          {
                              out.close();
                          } catch (IOException e)
                          {
                              e.printStackTrace();
                          }
                      }
                  }
              }

              /**
               * Converts a link taken from a page into a complete URL.
               *
               * @param href raw value of an href attribute
               * @return {@code href} itself when it starts with "http://"; the site root
               *         followed by {@code href} when it starts with "/"; {@code null}
               *         for every other form (for example "#") and for {@code null}
               */
              public static String getHrefOfInOut(String href)
              {
                  if (href == null)
                  {
                      return null;
                  }

                  /* Already a full link. */
                  if (href.startsWith("http://"))
                  {
                      return href;
                  }

                  /* Site-relative link: complete it with the site root. */
                  if (href.startsWith("/"))
                  {
                      return SITE_ROOT + href;
                  }

                  /* Other formats are deliberately not handled. */
                  return null;
              }

              /**
               * Extracts the target content from page source: the text between the end
               * of the first {@code <pre class="...">} opening tag and the following
               * {@code </pre>}.
               *
               * @param content page source
               * @return the target content; an empty string when the opening tag is
               *         absent or unterminated (or {@code content} is {@code null});
               *         everything after the opening tag when no closing tag follows
               */
              public static String getGoalContent(String content)
              {
                  int sign = isHasGoalContent(content);
                  if (sign == -1)
                  {
                      return "";
                  }

                  int start = content.indexOf('>', sign);
                  if (start == -1)
                  {
                      return "";
                  }

                  int end = content.indexOf(GOAL_END, start + 1);
                  if (end == -1)
                  {
                      /* Truncated page: keep what is there instead of throwing. */
                      return content.substring(start + 1);
                  }

                  return content.substring(start + 1, end);
              }

              /**
               * Checks whether page source contains the target marker.
               *
               * @param content page source
               * @return index of the first {@code <pre class="} occurrence, or -1 when
               *         there is none or {@code content} is {@code null}
               */
              public static int isHasGoalContent(String content)
              {
                  if (content == null)
                  {
                      return -1;
                  }
                  return content.indexOf(GOAL_MARKER);
              }

          }
  
  
  
          HrefOfPage.java 此類為獲取頁面的超鏈接
    
    
    
            
     
     
     
             View Code   
     
     
     
              
     
     
     
             package com.sreach.spider;  
     
     
     
              
     
     
     
             public class HrefOfPage  
     
     
     
             {  
     
     
     
                 /**  
     
     
     
                  * 獲得頁面源代碼中超鏈接  
     
     
     
                  */ 
     
     
     
                 public static void getHrefOfContent(String content)  
     
     
     
                 {  
     
     
     
                     System.out.println("開始");  
     
     
     
                     String[] contents = content.split("<a href=\"");  
     
     
     
                     for (int i = 1; i < contents.length; i++)  
     
     
     
                     {  
     
     
     
                         int endHref = contents[i].indexOf("\"");  
     
     
     
              
     
     
     
                         String aHref = FunctionUtils.getHrefOfInOut(contents[i].substring(  
     
     
     
             , endHref));  
     
     
     
              
     
     
     
                         if (aHref != null)  
     
     
     
                         {  
     
     
     
                             String href = FunctionUtils.getHrefOfInOut(aHref);  
     
     
     
              
     
     
     
                             if (!UrlQueue.isContains(href)  
     
     
     
                                     && href.indexOf("/code/explore") != -1 
     
     
     
                                     && !VisitedUrlQueue.isContains(href))  
     
     
     
                             {  
     
     
     
                                 UrlQueue.addElem(href);  
     
     
     
                             }  
     
     
     
                         }  
     
     
     
                     }  
     
     
     
              
     
     
     
                     System.out.println(UrlQueue.size() + "--抓取到的連接數");  
     
     
     
                     System.out.println(VisitedUrlQueue.size() + "--已處理的頁面數");  
     
     
     
              
     
     
     
                 }  
     
     
     
              
     
     
     
             } 
    
    
    
            
UrlDataHanding.java 此類主要是從未訪問隊列中獲取url,下載頁面，分析url，保存已訪問url等操作，實現Runnable接口
    
    
    
            
     
     
     
             View Code   
     
     
     
              
     
     
     
             package com.sreach.spider;  
     
     
     
              
     
     
     
             public class UrlDataHanding implements Runnable  
     
     
     
             {  
     
     
     
                 /**  
     
     
     
                  * 下載對應頁面並分析出頁面對應的URL放在未訪問隊列中。  
     
     
     
                  * @param url  
     
     
     
                  */ 
     
     
     
                 public void dataHanding(String url)  
     
     
     
                 {  
     
     
     
                         HrefOfPage.getHrefOfContent(DownloadPage.getContentFormUrl(url));  
     
     
     
                 }  
     
     
     
                       
     
     
     
                 public void run()  
     
     
     
                 {  
     
     
     
                     while(!UrlQueue.isEmpty())  
     
     
     
                     {  
     
     
     
                        dataHanding(UrlQueue.outElem());  
     
     
     
                     }  
     
     
     
                 }  
     
     
     
             } 
    
    
    
            
UrlQueue.java 此類主要是用來存放未訪問的URL隊列
    
    
    
            
     
     
     
             View Code   
     
     
     
              
     
     
     
             package com.sreach.spider;  
     
     
     
              
     
     
     
             import java.util.LinkedList;  
     
     
     
              
     
     
     
             public class UrlQueue
             {
                 /** Queue of hyperlinks that have not been visited yet. */
                 public static LinkedList<String> urlQueue = new LinkedList<String>();

                 /**
                  * Maximum number of hyperlinks intended for the queue. No method of
                  * this class checks it.
                  */
                 public static final int MAX_SIZE = 10000;

                 /**
                  * Appends a URL to the end of the queue.
                  *
                  * @param url the URL to enqueue
                  */
                 public synchronized static void addElem(String url)
                 {
                     urlQueue.add(url);
                 }

                 /**
                  * Removes and returns the URL at the head of the queue.
                  *
                  * @return the oldest queued URL
                  * @throws java.util.NoSuchElementException if the queue is empty
                  */
                 public synchronized static String outElem()
                 {
                     return urlQueue.removeFirst();
                 }

                 /**
                  * @return {@code true} when no URL is queued
                  */
                 public synchronized static boolean isEmpty()
                 {
                     return urlQueue.isEmpty();
                 }

                 /**
                  * Synchronized like the mutators so the list is never read while
                  * another thread is modifying it.
                  *
                  * @return number of queued URLs
                  */
                 public synchronized static int size()
                 {
                     return urlQueue.size();
                 }

                 /**
                  * Synchronized like the mutators so the list is never traversed while
                  * another thread is modifying it.
                  *
                  * @param url the URL to look for
                  * @return {@code true} when the URL is currently queued
                  */
                 public synchronized static boolean isContains(String url)
                 {
                     return urlQueue.contains(url);
                 }

             }
    
    
    
            
VisitedUrlQueue.java 主要是保存已訪問過的URL，使用HashSet來保存，主要是考慮到每個訪問過的URL都是不同的，HashSet剛好符合這個要求。
    
    
    
            
     
     
     
             View Code   
     
     
     
              
     
     
     
             package com.sreach.spider;  
     
     
     
              
     
     
     
             import java.util.HashSet;  
     
     
     
              
     
     
     
             /**
              * Set of URLs that have already been visited.
              *
              * Every accessor holds the class monitor while it touches the set.
              *
              * @author HHZ
              */
             public class VisitedUrlQueue
             {
                 /** Backing set; a URL is stored at most once. */
                 public static HashSet<String> visitedUrlQueue = new HashSet<String>();

                 /**
                  * Records a URL as visited.
                  *
                  * @param url the URL to record
                  */
                 public static void addElem(String url)
                 {
                     synchronized (VisitedUrlQueue.class)
                     {
                         visitedUrlQueue.add(url);
                     }
                 }

                 /**
                  * @param url the URL to look up
                  * @return {@code true} when the URL has been recorded
                  */
                 public static boolean isContains(String url)
                 {
                     synchronized (VisitedUrlQueue.class)
                     {
                         boolean known = visitedUrlQueue.contains(url);
                         return known;
                     }
                 }

                 /**
                  * @return number of recorded URLs
                  */
                 public static int size()
                 {
                     synchronized (VisitedUrlQueue.class)
                     {
                         int count = visitedUrlQueue.size();
                         return count;
                     }
                 }
             }
    
    
    
            
Test.java 此類為測試類
    
    
    
            
     
     
     
             View Code   
     
     
     
              
     
     
     
             import java.sql.SQLException;  
     
     
     
              
     
     
     
             import com.sreach.spider.UrlDataHanding;  
     
     
     
             import com.sreach.spider.UrlQueue;  
     
     
     
              
     
     
     
             public class Test  
     
     
     
             {  
     
     
     
               public static void main(String[] args) throws SQLException  
     
     
     
               {  
     
     
     
                   String url = "http://www.oschina.net/code/explore/achartengine/client/AndroidManifest.xml";  
     
     
     
                   String url1 = "http://www.oschina.net/code/explore";  
     
     
     
                   String url2 = "http://www.oschina.net/code/explore/achartengine";  
     
     
     
                   String url3 = "http://www.oschina.net/code/explore/achartengine/client";  
     
     
     
                     
     
     
     
                     
     
     
     
                   UrlQueue.addElem(url);  
     
     
     
                   UrlQueue.addElem(url1);  
     
     
     
                   UrlQueue.addElem(url2);  
     
     
     
                   UrlQueue.addElem(url3);  
     
     
     
                     
     
     
     
                   UrlDataHanding[] url_Handings = new UrlDataHanding[10];  
     
     
     
                     
     
     
     
                       for(int i = 0 ; i < 10 ; i++)  
     
     
     
                       {  
     
     
     
                           url_Handings[i] = new UrlDataHanding();  
     
     
     
                           new Thread(url_Handings[i]).start();  
     
     
     
                       }  
     
     
     
              
     
     
     
               }  
     
     
     
             } 
    
    
    
            
說明一下：由於我抓取的是針對oschina的，所以里面的url正則表達式不適合其他網站，需要自己修改一下。你也可以寫成xml來配置。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 java實現網絡爬蟲使用Java實現網絡爬蟲使用Java實現網絡爬蟲關於使用Java實現的簡單網絡爬蟲Demo java網絡爬蟲實現信息的抓取 hadoop中實現java網絡爬蟲【java爬蟲】---爬蟲+基於接口的網絡爬蟲 Java網絡爬蟲 Jsoup Java 網絡爬蟲，就是這么的簡單搜索引擎-網絡爬蟲