抓取網頁

本文轉載自查看原文 2019-09-04 12:19 351 C#_網絡

C# 讀取文本文件內容生成相應的文件，獲取目錄下所有文件名並保存為文本文

最近因為經常用到2個功能：
1):以一個文件內容為名批量生成相應的文件
2):查找一個目錄(包括子目錄)下某擴展名的所有文件
所以寫了一個小程序，方便多了。
先看效果圖：


雖然很簡單但須注意：
1. 
擴展名 區分大小寫
if (Path.GetExtension(file).ToLower() == mask.ToLower())
一開始沒注意這，害得找出的文件總是比正常的文件少
2.
去掉文件名中的非法字符
line = line.Replace("\\", string.Empty);
line = line.Replace("/", string.Empty);
line = line.Replace(":", string.Empty);
line = line.Replace("*", string.Empty);
line = line.Replace("?", string.Empty);
line = line.Replace("\"", string.Empty);
line = line.Replace("<", string.Empty);
line = line.Replace(">", string.Empty);
line = line.Replace("|", string.Empty);
//line = line.Replace(" ", string.Empty);
fs = new FileStream(fileSaveDir +"\\"+ line + ext, FileMode.Create);
3.
注意各種細節，一些小問題不容忽視，現在這個程序的excepitoin 處理還有一些模糊！ 暫時就不改了。
4.主要代碼
c#
//生成文件
//
private void btnCreate_Click(object sender, EventArgs e)//生成文件
        {
            FileStream fs;
            String line = "";
           // ext = Convert.ToString( comboBox1.SelectedItem);
            ext = comboBox1.Text;
            fileSaveDir = this.tbxSaveDir.Text;
            fileName = this.tbxFilename.Text;
            if (fileName == "")
            {
                MessageBox.Show("請選擇文件名的存放文件。");
                return;
            }
            if (fileSaveDir == "")
            {
                FileInfo fi = new FileInfo(fileName);
                fileSaveDir =Convert.ToString(fi.Directory);
            }
            try
            {
                using (StreamReader sr = new StreamReader(fileName))
                {
                    do
                    {
                        line = sr.ReadLine();
                        if (line != null)
                        {
                            String file = fileSaveDir + "\\" + line + ext;
                            if(File.Exists(file))
                            {
                                if (DialogResult.Yes == MessageBox.Show("文件 "+"\""+line+ext+"\""+" 已經存在了！", "是否忽略已經存在的文件", MessageBoxButtons.YesNo,MessageBoxIcon.Warning))
                                {
                                    continue;
                                }
                                else
                                {
                                    MessageBox.Show("一共生成了" + count + " 個文件。");
                                    return;
                                }
                            }
                            
                            line = line.Replace("\\", string.Empty);
                            line = line.Replace("/", string.Empty);
                            line = line.Replace(":", string.Empty);
                            line = line.Replace("*", string.Empty);
                            line = line.Replace("?", string.Empty);
                            line = line.Replace("\"", string.Empty);
                            line = line.Replace("<", string.Empty);
                            line = line.Replace(">", string.Empty);
                            line = line.Replace("|", string.Empty);
                            //line = line.Replace(" ", string.Empty);
                            
                            fs = new FileStream(fileSaveDir +"\\"+ line + ext, FileMode.Create);
                            //fs = new FileStream(line + ".txt", FileMode.Create);
                            count++;
                        }
                    } while (line != null);
                }
            }
            catch (ArgumentException arge)
            {
                MessageBox.Show(arge.Message);
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
            MessageBox.Show("一共生成了"+count+" 個文件。");
            count = 0;
            //this.comboBox1.SelectedIndex = 0;
        
        }

//獲取文件名
private void btnGetFileName_Click(object sender, EventArgs e)//獲取文件名
        {
            int fileCount = 0;
            bool fullname = checkBox1.Checked;
            if (this.tbxPath.Text =="" || this.tbxExten_tabPage2.Text == "" || this.tbxSavePath.Text == "")
            {
                MessageBox.Show("請選擇目錄及擴展名。");
                return;
            }
            String directory = this.tbxPath.Text;
            String mask = this.tbxExten_tabPage2.Text;
            String savepath = this.tbxSavePath.Text;
            findFiles(directory, mask, false,fullname, ref fileCount);
            File.Delete(savepath);
            FileStream fs = new FileStream(savepath , FileMode.CreateNew);
            StreamWriter sw = new StreamWriter(fs);
            foreach (string str in al)
                sw.WriteLine(str);
            sw.Close();
            fs.Close();
            MessageBox.Show("一共獲取了" + fileCount + "個文件名。");
            
            fileCount = 0;
            al.Clear();
        }
public void findFiles(string directory, string mask, bool ignoreHidden,bool fullname, ref int fileCount)//獲取文件名
        {
            //先查找當前目錄下指定后綴名的所有文件
            foreach (string file in Directory.GetFiles(directory, "*.*"))　//System Volume Information　unauthorizedAccessException
            {
                if (!(ignoreHidden && (File.GetAttributes(file) & FileAttributes.Hidden) == FileAttributes.Hidden))
                {
                    if (mask != "")
                    {
                        if (Path.GetExtension(file).ToLower() == mask.ToLower())
                        {
                            FileInfo fi = new FileInfo(file);
                            String name="";
                            if (fullname)
                            {
                                name = fi.FullName;
                            }
                            else
                            {
                                name = fi.Name;//.Replace(mask,"");
                            }
                            al.Add(name);
                            fileCount++;
                        }
                    }
                }
            }          
            string[] childDirectories = Directory.GetDirectories(directory);
            foreach (string dir in childDirectories)
            {
                if (!(ignoreHidden && (File.GetAttributes(dir) & FileAttributes.Hidden) == FileAttributes.Hidden))
                
                    findFiles(dir, mask, false,fullname, ref fileCount);
                
            }                 
        }
//java code(查找一個目錄(包括子目錄)下的所有文件):
import java.io.*;

public class ListFiles {
private static String listFileStr = "";
private static String dir;
private static String savefile;
private static int count = 0;

private static FileWriter fw;
private static File saveFile;
public static void main(String[] args) {
   try
   {
    System.out.println("請輸入查找文件的目錄：(eg：d\\:music)");
    try{
       //接收鍵盤輸入作為輸入流，把輸入流放到緩沖流里面
       BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); 
       //從緩沖流讀取一行數據
       dir = in.readLine();
       //saveFile=new File(savefile);
    }
    catch(IOException e)
    {
     //System.out.println(e.toString());
     System.out.println("請輸入合法的路徑名！");
    }
    System.out.println("請輸入保存文件的位置：(eg：d\\:savename.txt)");
    try{
       BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
       savefile = in.readLine();
       fw=new FileWriter(savefile);
    }
    catch(IOException ex)
    {
     //System.out.println(ex.toString());
     System.out.println("請輸入合法的路徑名！");
    } 
   
    
    ListFiles lf=new ListFiles();
    lf.listFile(dir);
    fw.write(listFileStr);
    fw.close();
    System.out.println("\n一共找到"+count+"個文件！");
   }
   catch (ArrayIndexOutOfBoundsException ea)
   {
    //參數提示
    System.out.println("Usage: ListFiles <source dir> <target file>");
  
   }
   catch (IOException e)
   {
    System.out.println("IO error!\r\n"+e.toString());
   }
   }
   public void listFile(String rp)
   {
  
    File file=new File(rp);
    File list[]=file.listFiles();
    for(int i=0;i<list.length;i++)
    {
     try
     {
      if (list[i].isDirectory())
      {
       new ListFiles().listFile(list[i].toString());
      }
      else 
      {
       listFileStr+=list[i].getAbsolutePath()+"\r\n";
       System.out.println(list[i].getAbsolutePath());
//       listFileStr+=list[i].getName()+"\r\n";
//       System.out.println(list[i].getName());
       count++;
      }
     }
     catch (Exception ex)
     {
      listFileStr+="Access deny："+list[i].getAbsolutePath()+"\r\n";
      System.out.println("Access deny："+list[i].getAbsolutePath());
    }
   }
}
}

View Code

C# 抓取網頁Html

C# 抓取網頁的Html 及分析：
源碼如下：
private void Search(string url)
{
    string rl;
    WebRequest Request = WebRequest.Create(url.Trim());
 
    WebResponse Response = Request.GetResponse();
 
    Stream resStream = Response.GetResponseStream();
 
    StreamReader sr = new StreamReader(resStream, Encoding.Default);
    StringBuilder sb = new StringBuilder();
    while ((rl = sr.ReadLine()) != null)
    {
        sb.Append(rl);
    }
 
 
    string str = sb.ToString().ToLower();
 
    string str_get = mid(str, "<ul class=\"post_list\">", "</ul>");
 
 
    int start = 0;
    while (true)
    {
        if (str_get == null)
            break;
        string strResult = mid(str_get, "href=\"", "\"", out start);
        if (strResult == null)
            break;
        else
        {
            lab[url] += strResult;
            str_get = str_get.Substring(start);
        }
    }
}
 
 
 
 
private string mid(string istr, string startString, string endString)
{
    int iBodyStart = istr.IndexOf(startString, 0);               //開始位置
    if (iBodyStart == -1)
        return null;
    iBodyStart += startString.Length;                           //第一次字符位置起的長度
    int iBodyEnd = istr.IndexOf(endString, iBodyStart);         //第二次字符在第一次字符位置起的首次位置
    if (iBodyEnd == -1)
        return null;
    iBodyEnd += endString.Length;                              //第二次字符位置起的長度
    string strResult = istr.Substring(iBodyStart, iBodyEnd - iBodyStart - 1);
    return strResult;
}
 
 
private string mid(string istr, string startString, string endString, out int iBodyEnd)
{
    //初始化out參數,否則不能return
    iBodyEnd = 0;
 
    int iBodyStart = istr.IndexOf(startString, 0);               //開始位置
    if (iBodyStart == -1)
        return null;
    iBodyStart += startString.Length;                           //第一次字符位置起的長度
    iBodyEnd = istr.IndexOf(endString, iBodyStart);         //第二次字符在第一次字符位置起的首次位置
    if (iBodyEnd == -1)
        return null;
    iBodyEnd += endString.Length;                              //第二次字符位置起的長度
    string strResult = istr.Substring(iBodyStart, iBodyEnd - iBodyStart - 1);
    return strResult;
}

View Code

C# 抓取網頁里面的所有鏈接

這幾天偶爾看見了，C#抓取網頁的鏈接。的代碼。感覺當時做的很簡單。呵呵。也沒多考慮什么過程。先把簡單的給大家拿出來看看。如果大家有什么意見或者有好的方法可以共同交流。謝謝！一下僅供參考：
 
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;

using System.Xml;
using System.Net;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;

namespace text
{
    public partial class Form1 : Form
    {
        string strCode;
        ArrayList alLinks;
        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            if (textBox1.Text == "")
            {
                MessageBox.Show("請輸入網址");
                return;
            }
            string strURL = textBox1.Text.ToString().Trim();
            if (strURL.Substring(0, 7) != @"http://")
            {
                strURL = @"http://" + strURL;
            }
            MessageBox.Show("正在獲取頁面代碼，請稍后...");
            strCode = GetPageSource(strURL);
            MessageBox.Show("正在提取超鏈接，請稍侯...");
            alLinks = GetHyperLinks(strCode);
            MessageBox.Show("正在寫入文件，請稍侯...");
            WriteToXml(strURL, alLinks);
        }
        // 獲取指定網頁的HTML代碼 
        public static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);
            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);
            HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse();
            hwReq.Method = "Get";
            hwReq.KeepAlive = false;
            StreamReader reader = new StreamReader(hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312"));
            return reader.ReadToEnd();
        }
        // 提取HTML代碼中的網址 
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();
            string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
            Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
            MatchCollection m = r.Matches(htmlCode);
            for (int i = 0; i <= m.Count - 1; i++)
            {
                bool rep = false;
                string strNew = m[i].ToString();
                // 過濾重復的URL 
                foreach (string str in al)
                {
                    if (strNew == str)
                    {
                        rep = true;
                        break;
                    }
                }
                if (!rep) al.Add(strNew);
            }
            al.Sort();
            return al;
        }
        // 把網址寫入xml文件 
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超鏈接");
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                string body = str;
                writer.WriteElementString(title, null, body);
            }
            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
            writer.Close();
        }
        // 獲取網址的域名后綴 
        static string GetDomain(string strURL)
        {
            string retVal;
            string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
            Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
            Match m = r.Match(strURL);
            retVal = m.ToString();
            strRegex = @"\.|/$";
            retVal = Regex.Replace(retVal, strRegex, "").ToString();
            if (retVal == "")
                retVal = "other";
            return retVal;
        }
    }
}

View Code

C# 抓取網頁內容（轉)

摘要: 1、抓取一般內容需要三個類：WebRequest、WebResponse、StreamReader所需命名空間：System.Net、System.IO核心代碼：view plaincopy to clipboardprint?WebRequestrequest=WebRequest.Create("http://www.cftea. ...
1、抓取一般內容
需要三個類：WebRequest、WebResponse、StreamReader
所需命名空間：System.Net、System.IO
核心代碼：
view plaincopy to clipboardprint?
WebRequest request = WebRequest.Create("http://www.cftea.com/");  
WebResponse response = request.GetResponse();  
StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));  
      WebRequest 類的 Create 為靜態方法，參數為要抓取的網頁的網址；
      Encoding 指定編碼，Encoding 中有屬性 ASCII、UTF32、UTF8 等全球通用的編碼，但沒有 gb2312 這個編碼屬性，所以我們使用 GetEncoding 獲得 gb2312 編碼。
示例：
view plaincopy to clipboardprint?
<%@ Page Language="C#" %>  
<%@ Import Namespace="System.Net" %>  
<%@ Import Namespace="System.IO" %>  
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
<mce:script runat="server"><!--  
    void Page_Load(object sender, EventArgs e)  
    {  
        try  
        {  
            WebRequest request = WebRequest.Create("http://www.cftea.com/");  
            WebResponse response = request.GetResponse();  
            StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));  
              
            tb.Text = reader.ReadToEnd();  
              
            reader.Close();  
            reader.Dispose();  
            response.Close();  
        }  
        catch (Exception ex)  
        {  
            tb.Text = ex.Message;  
        }  
    }  
// --></mce:script>   
<html xmlns="http://www.w3.org/1999/xhtml" >  
<head runat="server">  
    <title>抓取網頁內容 - 千一網絡</title>  
</head>  
<body>  
    <form id="form1" runat="server">  
    <div>  
    <asp:TextBox ID="tb" runat="server" Width="500" Height="300" TextMode="multiLine"></asp:TextBox>  
    </div>  
    </form>  
</body>  
</html>  
 
 2 抓取網頁內容－圖片
    需要四個類：WebRequest、WebResponse、Stream、FileStream。
   示例：
view plaincopy to clipboardprint?
<%@ Page Language="C#" %>  
<%@ Import Namespace="System.Net" %>  
<%@ Import Namespace="System.IO" %>  
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
<mce:script runat="server"><!--  
    void Page_Load(object sender, EventArgs e)  
    {  
        try  
        {  
            WebRequest request = WebRequest.Create("http://www.cftea.com/images/logo.gif");  
            WebResponse response = request.GetResponse();  
            Stream reader = response.GetResponseStream();  
              
            FileStream writer = new FileStream("D://logo.gif", FileMode.OpenOrCreate, FileAccess.Write);  
            byte[] buff = new byte[512];  
            int c = 0; //實際讀取的字節數   
            while ((c=reader.Read(buff, 0, buff.Length)) > 0)  
            {  
                writer.Write(buff, 0, c);  
            }  
            writer.Close();  
            writer.Dispose();  
              
            reader.Close();  
            reader.Dispose();  
            response.Close();  
              
            tb.Text = "保存成功！";  
        }  
        catch (Exception ex)  
        {  
            tb.Text = ex.Message;  
        }  
    }  
// --></mce:script>   
<html xmlns="http://www.w3.org/1999/xhtml" >  
<head runat="server">  
    <title>抓取網頁圖片並保存 - 千一網絡</title>  
</head>  
<body>  
    <form id="form1" runat="server">  
    <div>  
    <asp:TextBox ID="tb" runat="server" Width="500" Height="300" TextMode="multiLine"></asp:TextBox>  
    </div>  
    </form>  
</body>  
</html>  
 
3 抓取網頁內容－Post 數據
   在抓取網頁時，有時候，需要將某些數據通過 Post 的方式發送到服務器，將以下代碼添加在網頁抓取的程序中，以實現將用戶名和密碼 Post 到服務器
view plaincopy to clipboardprint?
string data = "userName=admin&passwd=admin888";  
byte[] requestBuffer = System.Text.Encoding.GetEncoding("gb2312").GetBytes(data);  
  
request.Method = "POST";  
request.ContentType = "application/x-www-form-urlencoded";  
request.ContentLength = requestBuffer.Length;  
using (Stream requestStream = request.GetRequestStream())  
{  
    requestStream.Write(requestBuffer, 0, requestBuffer.Length);  
    requestStream.Close();  
}  
  
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))  
{  
    string str = reader.ReadToEnd();  
    reader.Close();  
}  
 
4  抓取網頁內容－防止重定向
在抓取網頁時，成功登錄服務器應用系統后，應用系統可能會通過 Response.Redirect 將網頁進行重定向，如果不需要響應這個重定向，那么，我們就不要把 reader.ReadToEnd() 給 Response.Write 出來，就可以了。
5 抓取網頁內容－保持登錄狀態
  
利用 Post 數據成功登錄服務器應用系統后，就可以抓取需要登錄的頁面了，那么我們就可能需要在多個 Request 間保持登錄狀態。
首先，我們要使用 HttpWebRequest，而不是 WebRequest。
與 WebRequest 相比，變化的代碼是：
view plaincopy to clipboardprint?
HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);  
 
注意：HttpWebRequest.Create 返回的類型仍是 WebRequest，所以要轉化一下。
其次，使用 CookieContainer。
view plaincopy to clipboardprint?
System.Net.CookieContainer cc = new System.Net.CookieContainer();  
request.CookieContainer = cc;  
request2.CookieContainer = cc;   
 
這樣 request 和 request2 之間就使用了相同的 Session，如果 request 登錄了，那么 request2 也是登錄狀態。
最后，如何在不同的頁面間使用同一個 CookieContainer。
要在不同的頁面間使用同一個 CookieContainer，只有把 CookieContainer 加入 Session。
 
view plaincopy to clipboardprint?
Session.Add("ccc", cc); //存   
  
CookieContainer cc = (CookieContainer)Session["ccc"]; //取  
 
5 抓取網頁內容－把當前會話帶到 WebRequest 中
 
比如說瀏覽器 B1 去訪問服務器端 S1，這會產生一個會話，而服務器端 S2 再用 WebRequest 去訪問服務器端 S1，這又會產生一個會話。現在的需求是讓 WebRequest 使用瀏覽器 B1 與 S1 之間的會話，也就是說要讓 S1 認為是 B1 在訪問 S1，而不是 S2 在訪問 S1。
這就要利用 Cookie 了，先在 S1 中取得與 B1 的 SessionID 的 Cookie，再將這個 Cookie 告訴 S2，S2 再將 Cookie 寫在 WebRequest 中。
view plaincopy to clipboardprint?
WebRequest request = WebRequest.Create("url");  
<SPAN class=key>request.Headers.Add(HttpRequestHeader.Cookie, "ASPSESSIONIDSCATBTAD=KNNDKCNBONBOOBIHHHHAOKDM;");</SPAN>  
WebResponse response = request.GetResponse();  
StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312"));  
Response.Write(reader.ReadToEnd());  
reader.Close();  
reader.Dispose();  
response.Close();  
 
要說明的是：
本文並不是 Cookie 欺騙，因為 SessionID 是 S1 告訴 S2 的，並不是 S2 竊取的，雖然有些古怪，但這可能在一些特定的應用系統中會有用。
S1 必須要向 B1 寫 Session，這樣 SessionID 才會保存到 Cookie 中，並且 SessionID 才會保持不變。
在 ASP.NET 中取 Cookie 用 Request.Cookies，本文假設 Cookie 已經取出來。
不同的服務器端語言，SessionID 在 Cookie 中上名稱並不一樣，本文是 ASP 的 SessionID。
S1 可能不僅僅依靠 SessionID 來判斷當前登錄，它可能還會輔助於 Referer、User-Agent 等，這取決於 S1 端程序的設計。
其實本文算是本連載中“保持登錄狀態”的另一種方法。
6 抓取網頁內容－如何更改來源 Referer 和 UserAgent
view plaincopy to clipboardprint?
<SPAN class=caution>HttpWebRequest</SPAN> request = <SPAN class=caution>(HttpWebRequest)HttpWebRequest</SPAN>.Create("http://127.0.0.1/index.htm");  
//request.Headers.Add(HttpRequestHeader.Referer, "http://www.cftea.com/"); // 錯誤   
//request.Headers[HttpRequestHeader.Referer] = "http://www.cftea.com/"; // 錯誤   
<SPAN class=caution>request.Referer</SPAN> = "http://www.cftea.com/"; // 正確  
 
注釋掉的兩句是不對的，會發生錯誤：
view plaincopy to clipboardprint?
此標頭必須使用適當的屬性進行修改。  
參數名: name   
 
UserAgent 類似。

View Code

C#抓取和分析網頁的類

抓取和分析網頁的類。

主要功能有：

1、提取網頁的純文本，去所有html標簽和javascript代碼

2、提取網頁的鏈接，包括href和frame及iframe

3、提取網頁的title等(其它的標簽可依此類推，正則是一樣的)

4、可以實現簡單的表單提交及cookie保存

 /*

*  Author:Sunjoy at CCNU

*  如果您改進了這個類請發一份代碼給我(ccnusjy 在gmail.com)

*/



using System;

using System.Data;

using System.Configuration;

using System.Net;

using System.IO;

using System.Text;

using System.Collections.Generic;

using System.Text.RegularExpressions;

using System.Threading;

using System.Web;

/// <summary>

/// 網頁類

/// </summary>

public class WebPage

{



    #region 私有成員

    private Uri m_uri;   //網址

    private List<Link> m_links;    //此網頁上的鏈接

    private string m_title;        //此網頁的標題

    private string m_html;         //此網頁的HTML代碼

    private string m_outstr;       //此網頁可輸出的純文本

    private bool m_good;           //此網頁是否可用

    private int m_pagesize;       //此網頁的大小

    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有網頁的Cookie

    private string m_post;  //此網頁的登陸頁需要的POST數據

    private string m_loginurl;  //此網頁的登陸頁

    #endregion





    #region 私有方法

    /// <summary>

    /// 這私有方法從網頁的HTML代碼中分析出鏈接信息

    /// </summary>

    /// <returns>List<Link></returns>

    private List<Link> getLinks()

    {

        if (m_links.Count == 0)

        {

            Regex[] regex = new Regex[2];

            regex[0] = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            for (int i = 0; i < 2; i++)

            {

                Match match = regex[i].Match(m_html);

                while (match.Success)

                {

                    try

                    {

                        string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;

                        string text = "";

                        if (i == 0) text = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");

                        Link link = new Link(url, text);

                        m_links.Add(link);

                    }

                    catch(Exception ex){Console.WriteLine(ex.Message); };

                    match = match.NextMatch();

                }

            }

        }

        return m_links;

    }

   

    /// <summary>

    /// 此私有方法從一段HTML文本中提取出一定字數的純文本

    /// </summary>

    /// <param name="instr">HTML代碼</param>

    /// <param name="firstN">提取從頭數多少個字</param>

    /// <param name="withLink">是否要鏈接里面的字</param>

    /// <returns>純文本</returns>

    private string getFirstNchar(string instr, int firstN, bool withLink)

    {

        if (m_outstr == "")

        {

            m_outstr = instr.Clone() as string;

            m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, "");

            m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, "");

            m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, "");

            if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");

            Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            m_outstr = objReg.Replace(m_outstr, "");

            Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            m_outstr = objReg2.Replace(m_outstr, " ");

        }

        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;

    }



    /// <summary>

    /// 此私有方法返回一個IP地址對應的無符號整數

    /// </summary>

    /// <param name="x">IP地址</param>

    /// <returns></returns>

    private uint getuintFromIP(IPAddress x)

    {

        Byte[] bt = x.GetAddressBytes();

        uint i = (uint)(bt[0] * 256 * 256 * 256);

        i += (uint)(bt[1] * 256 * 256);

        i += (uint)(bt[2] * 256);

        i += (uint)(bt[3]);

        return i;

    }



    #endregion





    #region 公有文法

    /// <summary>

    /// 此公有方法提取網頁中一定字數的純文本，包括鏈接文字

    /// </summary>

    /// <param name="firstN">字數</param>

    /// <returns></returns>

    public string getContext(int firstN)

    {

        return getFirstNchar(m_html, firstN, true);

    }



    /// <summary>

    /// 此公有方法提取網頁中一定字數的純文本，不包括鏈接文字

    /// </summary>

    /// <param name="firstN"></param>

    /// <returns></returns>

    public string getContextWithOutLink(int firstN)

    {

        return getFirstNchar(m_html, firstN, false);

    }



    /// <summary>

    /// 此公有方法從本網頁的鏈接中提取一定數量的鏈接，該鏈接的URL滿足某正則式

    /// </summary>

    /// <param name="pattern">正則式</param>

    /// <param name="count">返回的鏈接的個數</param>

    /// <returns>List<Link></returns>

    public List<Link> getSpecialLinksByUrl(string pattern,int count)

    {

        if(m_links.Count==0)getLinks();

        List<Link> SpecialLinks = new List<Link>();

        List<Link>.Enumerator i;

        i = m_links.GetEnumerator();

        int cnt = 0;

        while (i.MoveNext() && cnt<count)

        {

            if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Match(i.Current.url).Success)

            {

                SpecialLinks.Add(i.Current);

                cnt++;

            }

        } 

        return SpecialLinks;

    }







    /// <summary>

    /// 此公有方法從本網頁的鏈接中提取一定數量的鏈接，該鏈接的文字滿足某正則式

    /// </summary>

    /// <param name="pattern">正則式</param>

    /// <param name="count">返回的鏈接的個數</param>

    /// <returns>List<Link></returns>

    public List<Link> getSpecialLinksByText(string pattern,int count)

    {

        if (m_links.Count == 0) getLinks();

        List<Link> SpecialLinks = new List<Link>();

        List<Link>.Enumerator i;

        i = m_links.GetEnumerator();

        int cnt = 0;

        while (i.MoveNext() && cnt < count)

        {

            if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Match(i.Current.text).Success)

            {

                SpecialLinks.Add(i.Current);

                cnt++;

            }

        }

        return SpecialLinks;

    }

    /// <summary>

    /// 此公有方法獲得所有鏈接中在一定IP范圍的鏈接

    /// </summary>

    /// <param name="_ip_start">起始IP</param>

    /// <param name="_ip_end">終止IP</param>

    /// <returns></returns>

    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)

    {

        IPAddress ip_start = IPAddress.Parse(_ip_start);

        IPAddress ip_end = IPAddress.Parse(_ip_end);

        if (m_links.Count == 0) getLinks();

        List<Link> SpecialLinks = new List<Link>();

        List<Link>.Enumerator i;

        i = m_links.GetEnumerator();

        while (i.MoveNext())

        {

            IPAddress ip;

            try

            {

                ip = Dns.GetHostEntry(new Uri(i.Current.url).Host).AddressList[0];

            }

            catch { continue; }

            if(getuintFromIP(ip)>=getuintFromIP(ip_start) && getuintFromIP(ip)<=getuintFromIP(ip_end))

            {

                SpecialLinks.Add(i.Current);

            }

        }

        return SpecialLinks;

    }



    /// <summary>

    /// 這公有方法提取本網頁的純文本中滿足某正則式的文字

    /// </summary>

    /// <param name="pattern">正則式</param>

    /// <returns>返回文字</returns>

    public string getSpecialWords(string pattern)

    {

        if (m_outstr == "") getContext(Int16.MaxValue);

        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase );

        Match mc=regex.Match(m_outstr);

        if (mc.Success)

            return mc.Groups[1].Value;

        return string.Empty;

    }

    #endregion





    #region 構造函數

    

    private void Init(string _url)

    {

   

        try

        {

            m_uri = new Uri(_url);

            m_links = new List<Link>();

            m_html = "";

            m_outstr = "";

            m_title = "";

            m_good = true;

            if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))

            {

                m_good = false;

                return;

            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);

            rqst.AllowAutoRedirect = true;

            rqst.MaximumAutomaticRedirections = 3;

            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

            rqst.KeepAlive = true;

            rqst.Timeout = 30000;

            lock (WebPage.webcookies)

            {

                if (WebPage.webcookies.ContainsKey(m_uri.Host))

                    rqst.CookieContainer = WebPage.webcookies[m_uri.Host];

                else

                {

                    CookieContainer cc = new CookieContainer();

                    WebPage.webcookies[m_uri.Host] = cc;

                    rqst.CookieContainer = cc;

                }

            }



            HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();



            Stream sm = rsps.GetResponseStream();

            if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)

            {

                rsps.Close();

                m_good = false;

                return;

            }

            Encoding cding = System.Text.Encoding.Default;

            string contenttype=rsps.ContentType.ToLower();

            int ix = contenttype.IndexOf("charset=");

            if (ix != -1)

            {



                try

                {

                    cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));

                }

                catch

                {

                    cding = Encoding.Default;

                }

                m_html = new StreamReader(sm, cding).ReadToEnd();

            }

            else

            {

                m_html = new StreamReader(sm, cding).ReadToEnd();

                Regex regex = new Regex("charset=(?<cding>[^=]+)?\"",RegexOptions.IgnoreCase);

                string strcding = regex.Match(m_html).Groups["cding"].Value;

                try

                {

                    cding = Encoding.GetEncoding(strcding);

                }

                catch{

                    cding = Encoding.Default;

                }

                byte[] bytes=Encoding.Default.GetBytes(m_html.ToCharArray());

                m_html = cding.GetString(bytes);

                if (m_html.Split('?').Length > 100)

                {

                    m_html=Encoding.Default.GetString(bytes);

                }

            }



            

            m_pagesize = m_html.Length;

            m_uri = rsps.ResponseUri;

            rsps.Close();

        }

        catch (Exception ex)

        {

            Console.WriteLine(ex.Message+m_uri.ToString());

            m_good = false;

            

        }

    }



    public WebPage(string _url)

    {

        string uurl = "";

        try

        {

            uurl = Uri.UnescapeDataString(_url);

            _url = uurl;

        }

        catch { };

        Regex re = new Regex("(?<h>[^\x00-\xff]+)");

        Match mc = re.Match(_url);

        if (mc.Success)

        {

            string han = mc.Groups["h"].Value;

            _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));

        }



        Init(_url);

    }



    public WebPage(string _url, string _loginurl, string _post)

    {

        string uurl = "";

        try

        {

            uurl = Uri.UnescapeDataString(_url);

            _url = uurl;

        }

        catch { };

        Regex re = new Regex("(?<h>[^\x00-\xff]+)");

        Match mc = re.Match(_url);

        if (mc.Success)

        {

            string han = mc.Groups["h"].Value;

            _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));

        }

        if (_loginurl.Trim() == "" || _post.Trim() == "" || WebPage.webcookies.ContainsKey(new Uri(_url).Host))

        {

            Init(_url);

        }

        else

        {

            #region 登陸

            string indata = _post;

            m_post = _post;

            m_loginurl = _loginurl;

            byte[] bytes = Encoding.Default.GetBytes(_post);

            CookieContainer myCookieContainer = new CookieContainer();

            try

            {



                //新建一個CookieContainer來存放Cookie集合 



                HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);

                //新建一個HttpWebRequest 

                myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";

                myHttpWebRequest.AllowAutoRedirect = false;

                myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

                myHttpWebRequest.Timeout = 60000;

                myHttpWebRequest.KeepAlive = true;

                myHttpWebRequest.ContentLength = bytes.Length;

                myHttpWebRequest.Method = "POST";

                myHttpWebRequest.CookieContainer = myCookieContainer;

                //設置HttpWebRequest的CookieContainer為剛才建立的那個myCookieContainer 

                Stream myRequestStream = myHttpWebRequest.GetRequestStream();

                myRequestStream.Write(bytes, 0, bytes.Length);

                myRequestStream.Close();

                HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();



                foreach (Cookie ck in myHttpWebResponse.Cookies)

                {

                    myCookieContainer.Add(ck);

                }

                myHttpWebResponse.Close();

            }

            catch

            {

                Init(_url);

                return;

            }



            #endregion



            #region 登陸后再訪問頁面

            try

            {

                m_uri = new Uri(_url);

                m_links = new List<Link>();

                m_html = "";

                m_outstr = "";

                m_title = "";

                m_good = true;

                if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))

                {

                    m_good = false;

                    return;

                }

                HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);

                rqst.AllowAutoRedirect = true;

                rqst.MaximumAutomaticRedirections = 3;

                rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

                rqst.KeepAlive = true;

                rqst.Timeout = 30000;

                rqst.CookieContainer = myCookieContainer;

                lock (WebPage.webcookies)

                {

                    WebPage.webcookies[m_uri.Host] = myCookieContainer;

                }

                HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();



                Stream sm = rsps.GetResponseStream();

                if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)

                {

                    rsps.Close();

                    m_good = false;

                    return;

                }

                Encoding cding = System.Text.Encoding.Default;

                int ix = rsps.ContentType.ToLower().IndexOf("charset=");

                if (ix != -1)

                {

                    try

                    {

                        cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));

                    }

                    catch

                    {

                        cding = Encoding.Default;

                    }

                }





                m_html = new StreamReader(sm, cding).ReadToEnd();





                m_pagesize = m_html.Length;

                m_uri = rsps.ResponseUri;

                rsps.Close();

            }

            catch (Exception ex)

            {

                Console.WriteLine(ex.Message+m_uri.ToString());

                m_good = false;

            

            }

            #endregion

        }



    }



    #endregion





    #region 屬性



    /// <summary>

    /// 通過此屬性可獲得本網頁的網址，只讀

    /// </summary>

    public string URL

    {

        get

        {

            return m_uri.AbsoluteUri;

        }

    }



    /// <summary>

    /// 通過此屬性可獲得本網頁的標題，只讀

    /// </summary>

    public string Title

    {

        get

        {

            if (m_title == "")

            {

                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase );

                Match mc = reg.Match(m_html);

                if (mc.Success)

                    m_title= mc.Groups["title"].Value.Trim();

            }

            return m_title;

        }

    }

  



    /// <summary>

    /// 此屬性獲得本網頁的所有鏈接信息，只讀

    /// </summary>

    public List<Link> Links

    {

        get

        {

            if (m_links.Count == 0) getLinks();

            return m_links;

        }

    }





    /// <summary>

    /// 此屬性返回本網頁的全部純文本信息，只讀

    /// </summary>

    public string Context

    {

       get

       {

           if (m_outstr == "") getContext(Int16.MaxValue);

           return m_outstr;

       }

    }



    /// <summary>

    /// 此屬性獲得本網頁的大小

    /// </summary>

    public int PageSize

    {

        get

        {

            return m_pagesize;

        }

    }

    /// <summary>

    /// 此屬性獲得本網頁的所有站內鏈接

    /// </summary>

    public List<Link> InsiteLinks

    {

        get

        {

            return getSpecialLinksByUrl("^http://"+m_uri.Host,Int16.MaxValue);

        }

    }



    /// <summary>

    /// 此屬性表示本網頁是否可用

    /// </summary>

    public bool IsGood

    {

        get

        {

            return m_good;

        }

    }

    /// <summary>

    /// 此屬性表示網頁的所在的網站

    /// </summary>

    public string Host

    {

        get

        {

            return m_uri.Host;

        }

    }

    



    /// <summary>

    /// 此網頁的登陸頁所需的POST數據

    /// </summary>

    public string PostStr

    {

        get

        {

            return m_post;

        }

    }

    /// <summary>

    /// 此網頁的登陸頁

    /// </summary>

    public string LoginURL

    {

        get

        {

            return m_loginurl;

        }

    }

    #endregion

}



/// <summary>

/// 鏈接類

/// </summary>

public class Link

{

    public string url;   //鏈接網址

    public string text;  //鏈接文字

    public Link(string _url, string _text)

    {

        url = _url;

        text = _text;

    }

}

View Code

C#抓取網頁信息

景
　　隨着Internet的普及，網絡信息正以極高的速度增長，在這么多數據中找到自己需要的信息是一件很繁瑣的事情，找到需要的信息后如何獲取也是件麻煩的事。這 就需要Internet信息抓取程序來代替人工的操作。
　　所謂Internet信息抓取程序，就是程序會按照用戶的關鍵詞或關鍵網站來收集相應的信息，並提供給用戶想要的信息格式 。
　　信息量的增加會帶來信息網站發布人員工作量的劇增，為實現信息發布系統實現信息自
　　動發布、減少工作人員工作量、即時跟蹤最新信息，就需要自動信息提供 程序，因此Internet信息抓取程序應運而生。
　　目標
　　實現自定義網站信息分類抓取，存入本地數據庫、生成靜態頁面或其它用戶定義的信息結構，並下載與信息相關 的多媒體文件。
　　開發
　　目標站點結構分析
　　本步驟是准確抓取信息個關鍵。
　　首先要選擇更新頻 率高的頁面做為抓取地址，然后分析要抓取內容頁面url特點。
　　然后分析要抓取信息頁面的元素特性，比如標題位置，內容位置 等，得到定位標記點。
　　將以上信息 寫成自己的配置文件或存到數據庫中。
　　每個網站都需要分析，寫出單獨的配置文件，供抓取程序使用。
　　信息提取
　　根據配置文件取得要抓取頁面url，使用HttpWebRequest類獲取內容：
雙擊代碼全選
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
//獲取http頁面函數
　　　　　　　 public string Get_Http(string a_strUrl,int timeout)
　　　　　　　 {
　　　　　　　　　　　 string strResult ;　　　　　　　　
　　　　　　　　　　　 try
　　　　　　　　　　　 {
HttpWebRequest myReq = (HttpWebRequest) HttpWebRequest.Create(a_strUrl) ;
　　　　　　　　　　　　　　　 myReq.Timeout = timeout;
　　　　　　　　　　　　　　　 HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();
　　　　　　　　　　　
　　　　　　　　　　　　　　　 Stream myStream = HttpWResp.GetResponseStream () ;
　　　　　　　　　　　　　　　 StreamReader sr = new StreamReader (myStream , Encoding.Default);
　　　　　　　　　　　　　　　 StringBuilder strBuilder = new StringBuilder();
　　　　　　　　　　　　　　　 while (-1 != sr.Peek())
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 strBuilder.Append (sr.ReadLine()+"
");
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 strResult = strBuilder.ToString ();
　　　　　　　　　　　 }
　　　　　　　　　　　 catch(Exception exp)
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 strResult = "錯誤：" + exp.Message ;
　　　　　　　　　　　 }
　　　　　　　　　　　 return strResult ;
　　　　　　　 }
　　獲取頁面內容后，分析頁面中連接地址取到要抓取的url：
雙擊代碼全選
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
//處理頁面標題和鏈接
　　　　　　　 public string SniffWebUrl( string urlStr,string blockB,string blockE )
　　　　　　　 {　　　　　　
　　　　　　　　　　　 string urlch1 = "";
　　　　　　　　　　　 string urlch2 = "";　　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　 int end_n1 = 0;
　　　　　　　　　　　 int end_nums = 0;
　　　　　　　　　　　 int end_nums1 = 0;
　　　　　　　　　　　 int end_nums2 = 0;
　　　　　　　　　　　 int end_nums3　　　　 = 0;　　　　　　　　　　　
　　　　　　　　　　　 string reUTStr = "";
　　　　　　　　　　　 string reTitle = "";
　　　　　　　　　　　 string ret = "";　　　　　　　　　　
　　　　　　　　　　　 try
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 int pos01 = urlStr.IndexOf( "." );
　　　　　　　　　　　　　　　 int pos02 = urlStr.LastIndexOf( "/" );
　　　　　　　　　　　　　　　 if( pos01 < 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 if( pos02 < 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 int pos03 = urlStr.IndexOf( "/",pos01 );
　　　　　　　　　　　　　　　 if ( pos03 < 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 urlch1 = urlStr;
　　　　　　　　　　　　　　　　　　　 urlch2 = urlStr;
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 urlch1 = urlStr.Substring( 0,pos03 );
　　　　　　　　　　　　　　　　　　　 urlch2 = urlStr.Substring( 0,pos02 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 string tmpAllStr = new PublicFun().Get_Http( urlStr ,time1);
　　　　　　　　　　　　　　　 int pos1 = tmpAllStr.IndexOf( blockB );
　　　　　　　　　　　　　　　 int pos2 = tmpAllStr.IndexOf( blockE,pos1 + blockB.Length );
　　　　　　　　　　　　　　　 if ( pos1>0 && pos2>0 && pos2>pos1 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 ret = tmpAllStr.Substring( pos1 + blockB.Length,pos2 - pos1 - blockB.Length );
　　　　　　　　　　　　　　　　　　　 ret = ret.Substring( ret.IndexOf( "<" ));
　　　　　　　　　　　　　　　　　　　 while( ret.IndexOf( "<A" ) >= 0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; ret = ret.Substring( 0,ret.IndexOf( "<A" ) ) + "<a" + ret.Substring( ret.IndexOf( "<A" ) + 2 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 while( ret.IndexOf( "</A" ) >=0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; ret = ret.Substring( 0,ret.IndexOf( "</A" ) ) + "</a" + ret.Substring( ret.IndexOf( "</A" ) + 3 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 while( ret.IndexOf( "Href=" ) >=0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; ret = ret.Substring( 0,ret.IndexOf( "Href=" )) + "href=" + ret.Substring( ret.IndexOf( "Href=" ) + 5 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 while( ret.IndexOf( "HREF=" ) >=0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; ret = ret.Substring( 0,ret.IndexOf( "HREF=" )) + "href=" + ret.Substring( ret.IndexOf( "HREF=" ) + 5 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 while( ret.IndexOf( "href='" ) >=0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; ret = ret.Substring( 0,ret.IndexOf( "href='" )) + "href="" + ret.Substring( ret.IndexOf( "href='" ) + 6 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 }　　　　　　
　　　　　　　　　　　　　　　 tmpAllStr = ret;　　　　　
　　　　　　　　　　　　　　　 int begin_nums = tmpAllStr.IndexOf( "href=" );
　　　　　　　　　　　　　　　 while ( begin_nums >= 0 )
　　　　　　　　　　　　　　　 {　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　 string tmpStrA = "";
　　　　　　　　　　　　　　　　　　　 string tmpStrB = tmpAllStr.Substring( begin_nums + 5,1 );
　　　　　　　　　　　　　　　　　　　 if ( tmpStrB == """ )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; end_n1 = begin_nums + 6;
　　　　　　　　　　　　　　　　　　　　　　&nb sp; if ( ( end_n1 + 1 ) > tmpAllStr.Length )
　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 return "";
　　　　　　　　　　　　　　　　　　　　&nbs p;　　 }
　　　　　　　　　　　　　　　　　　　　　　　 tmpStrA = tmpAllStr.Substring( begin_nums+6,1 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; end_n1 = begin_nums + 5;
　　　　　　　　　　　　　　　　　　　　　　&nb sp; tmpStrA = tmpStrB;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 if ( tmpStrA == "#" )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; tmpAllStr = tmpAllStr.Substring( end_n1 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; begin_nums = tmpAllStr.IndexOf( "href=" );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　　　　　 end_nums1 = tmpAllStr.IndexOf( " ",end_n1 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; end_nums2 = tmpAllStr.IndexOf( ">",end_n1 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; end_nums3 = tmpAllStr.IndexOf( "</a",end_nums2 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; if ( ( end_nums3 >= 0 ) && ( end_nums2 >= 0 ) )
　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 reTitle = tmpAllStr.Substring( end_nums2 + 1,end_nums3 - end_nums2 - 1 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 if ( end_nums1 > end_nums2 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 end_nums = end_nums2;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 if ( end_nums1 < 0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　 end_nums = end_nums2;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　 end_nums = end_nums1;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 string str4 = tmpAllStr.Substring( end_nums-1, end_nums - end_nums + 1 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 if ( str4 =="""　 || str4 == "'" )
　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 end_nums = end_nums - 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 string sTotalOne = tmpAllStr.Substring( end_n1,end_nums - end_n1 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 if ( sTotalOne.IndexOf( "http://" ) <0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 if ( sTotalOne.IndexOf( "/" ) == 0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　 sTotalOne = urlch1 + sTotalOne;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　 {　　　　　　　　　　　　　　　　　　　　　　　　　　 　　　　
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 int linshiIntNum = 0;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　 int flags = 0;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　 string urlChange = urlStr;;
　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　　 while( sTotalOne.IndexOf( "../" ) >= 0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　　　　　 sTotalOne = sTotalOne.Substring( sTotalOne.IndexOf( "../" ) + 3 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　　　　　 linshiIntNum = linshiIntNum + 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　　　　　 flags = flags +1;
　　　　　　　　　　　　　　　　　　　　　　&n bsp;　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 while( ( urlChange.LastIndexOf( "/" ) >= 0 ) && ( linshiIntNum >= 0 ) )
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　　　　　 urlChange = urlChange.Substring( 0,urlChange.LastIndexOf( "/" ) );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　　　　　 linshiIntNum = linshiIntNum - 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 if ( flags == 0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　　　　　 sTotalOne = urlch2 + "/" + sTotalOne;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　　　　　　　　　 sTotalOne = urlChange + "/" + sTotalOne;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 reUTStr = reUTStr + new PublicFun().RemoveHtmlCode( reTitle ) + sTotalOne;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　 tmpAllStr = tmpAllStr.Substring( end_nums3 + 4 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 begin_nums = tmpAllStr.IndexOf( "href=" );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; }
　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp; {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 begin_nums = - 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp; }　　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 return reUTStr;
　　　　　　　　　　　 }
　　　　　　　　　　　 catch( Exception e)
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　 }
　　　　　　　 }


得到要抓取內容的url后，處理該頁面：
雙擊代碼全選
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
//獲取鏈接內容並分類處理
　　　　　　　 public string GetWebContent( string gatherUrl,string subUrl,string subTitle,string b_Content,string e_Content,string b_Filter,string e_Filter,string root )
　　　　　　　 {
　　　　　　　　　　　 string tmpAllStr = "";　　　　　　　　　　　
　　　　　　　　　　　 string dfStrB = "";
　　　　　　　　　　　 string dfStrE = "";　　　　　　　　　　　　　　　
　　　　　　　　　　　 string rePicStr = "";//圖片返回路徑　　　
　　　　　　　　　　　 string reContentStr = "";
　　　　　　　　　　　 string picHtml = "images"; //本地圖片路徑
　　　　　　　　　　　
　　　　　　　　　　　 string urlch1 ="";
　　　　　　　　　　　 string urlch2 ="";
　　　　　　　　　　　 int pos1 = gatherUrl.IndexOf( "." );
　　　　　　　　　　　 int pos2 = gatherUrl.LastIndexOf( "/" );
　　　　　　　　　　　 if( pos1 < 0 )
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　 }
　　　　　　　　　　　 if( pos2 < 0 )
　　　　　　　　　　　 {　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　 }
　　　　　　　　　　　 int pos3 = gatherUrl.IndexOf( "/",pos1 );
　　　　　　　　　　　 if ( pos3 < 0 )
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 urlch1 = gatherUrl;
　　　　　　　　　　　　　　　 urlch2 = gatherUrl;
　　　　　　　　　　　 }
　　　　　　　　　　　 else
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 urlch1 = gatherUrl.Substring( 0,pos3 );
　　　　　　　　　　　　　　　 urlch2 = gatherUrl.Substring( 0,pos2 );
　　　　　　　　　　　 }　　　
　　　　　　　　　　　
　　　　　　　　　　　 tmpAllStr = new PublicFun().Get_Http( subUrl,time1 );
　　　　　　　　　　　 //取稿源
　　　　　　　　　　　 string docFromStr = "";
　　　　　　　　　　　 if ( dfStrB != "" && dfStrE != "" )
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 if ( tmpAllStr != "" )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 int b_docF = tmpAllStr.IndexOf( dfStrB );
　　　　　　　　　　　　　　　　　　　 if ( b_docF > 0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; int e_docF = tmpAllStr.IndexOf( dfStrE,b_docF + dfStrB.Length );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; if ( e_docF > 0 && e_docF > b_docF && e_docF - b_docF < 20 )
　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 docFromStr = tmpAllStr.Substring( b_docF + dfStrB.Length, e_docF - b_docF - dfStrB.Length );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; }
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　 }
　　　　　　　　　　　 //取內容
　　　　　　　　　　　 if ( tmpAllStr != "" )
　　　　　　　　　　　 {　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 int begin_strnum = tmpAllStr.IndexOf( b_Content );
　　　　　　　　　　　　　　　 if ( begin_strnum < 0 )
　　　　　　　　　　　　　　　 {　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 int end_strnum = tmpAllStr.IndexOf( e_Content,begin_strnum + b_Content.Length );
　　　　　　　　　　　　　　　 if ( end_strnum < 0 )
　　　　　　　　　　　　　　　 {　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 string sTotalSubM = "";
　　　　　　　　　　　　　　　 if ( end_strnum > begin_strnum )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = tmpAllStr.Substring ( begin_strnum,end_strnum - begin_strnum );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 if ( sTotalSubM == "" )
　　　　　　　　　　　　　　　 {　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　 return "";
　　　　　　　　　　　　　　　 } 　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 //過濾無用信息
　　　　　　　　　　　　　　　 int bfnum = sTotalSubM.IndexOf( b_Filter );
　　　　　　　　　　　　　　　 if ( bfnum > -1 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 int efnum = sTotalSubM.IndexOf( e_Filter,bfnum );
　　　　　　　　　　　　　　　　　　　 if ( efnum > -1 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; if ( efnum > bfnum )
　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 sTotalSubM = sTotalSubM.Substring( 0,bfnum ) + sTotalSubM.Substring( efnum + e_Filter.Length );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; }
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 //格式化圖片標記
　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "Src=" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "Src=" ) ) + "src=" + sTotalSubM.Substring( sTotalSubM.IndexOf( "Src=" ) + 4 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "SRC=" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "SRC=" ) ) + "src=" + sTotalSubM.Substring( sTotalSubM.IndexOf( "SRC=" ) + 4 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "src='" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "src='" ) ) + "src="" + sTotalSubM.Substring( sTotalSubM.IndexOf( "src='" ) + 5 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 //取圖片地址
　　　　　　　　　　　　　　　 int end_n12 = 0;
　　　　　　　　　　　　　　　 int end_nums2 = 0;
　　　　　　　　　　　　　　　 int begin_nums2 = sTotalSubM.IndexOf( "src=" );
　　　　　　　　　　　　　　　 while( begin_nums2 >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 String tmpStr = sTotalSubM.Substring( begin_nums2 + 4,1 );
　　　　　　　　　　　　　　　　　　　 if ( tmpStr == """ )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; end_n12 = begin_nums2 + 5;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; end_n12 = begin_nums2 + 4;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 int end_nums2a = sTotalSubM.IndexOf( " ",end_n12 );
　　　　　　　　　　　　　　　　　　　 int end_nums2b = sTotalSubM.IndexOf( ">",end_n12 );
　　　　　　　　　　　　　　　　　　　 if ( end_nums2b < 0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; break;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 if ( end_nums2a > end_nums2b )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; end_nums2 = end_nums2b;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; if (end_nums2a<0)
　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 end_nums2 = end_nums2b;
　　　　　　　　　　　　　　　　　　　　　 　　 }
　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp; {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 end_nums2 = end_nums2a;
　　　　　　　　　　　　　　　　　　　　　 　　 }
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 tmpStr = sTotalSubM.Substring( end_nums2-1,1 );
　　　　　　　　　　　　　　　　　　　 if ( tmpStr == """ || tmpStr == "'" )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; end_nums2 = end_nums2 - 1;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 string tmpPicStr = sTotalSubM.Substring( end_n12,end_nums2 - end_n12 );
　　　　　　　　　　　　　　　　　　　 if ( tmpPicStr.IndexOf( "http://" ) < 0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; if ( tmpPicStr.IndexOf( "/" ) == 0 )
　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　 tmpPicStr = urlch1 + tmpPicStr;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　 }
　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp; {　　　　　　　　　　　　　　　　　　　　　　　　　　 　
　　　　　　　　　　　　　　　　　　　　　　　　　　　 int linshiIntNum = 0;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 int flags = 0;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 string urlChange = subUrl;
　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　 while( tmpPicStr.IndexOf( "../" ) >= 0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 tmpPicStr = tmpPicStr.Substring( tmpPicStr.IndexOf("../") + 3 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　 linshiIntNum = linshiIntNum + 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　 flags = flags + 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 while( ( urlChange.LastIndexOf( "/" ) >= 0 ) && ( linshiIntNum >= 0 ) )
　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 urlChange = urlChange.Substring( 0,urlChange.LastIndexOf( "/" ) );
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　　　　　 linshiIntNum = linshiIntNum - 1;
　　　　　　　　　　　　　　　　　　　　　　&nb sp;　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 if ( flags == 0 )
　　　　　　　　　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 tmpPicStr = urlch2 + "/" + tmpPicStr;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p;　　　　　　　　 tmpPicStr = urlChange + "/" + tmpPicStr;
　　　　　　　　　　　　　　　　　　　　　& nbsp;　　　　　 }
　　　　　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 //tmpPicStr = tmpPicStr.ToLower();
　　　　　　　　　　　　　　　　　　　 string tmpPicStrTmp = tmpPicStr.ToLower ();
　　　　　　　　　　　　　　　　　　　 //if ( tmpPicStr.IndexOf( ".jpg" ) > 0 || tmpPicStr.IndexOf( ".gif" ) > 0 || tmpPicStr.IndexOf( ".bmp" ) > 0 )
　　　　　　　　　　　　　　　　　　　 if ( tmpPicStrTmp.IndexOf( ".jpg" ) > 0 || tmpPicStrTmp.IndexOf( ".gif" ) > 0 || tmpPicStrTmp.IndexOf( ".bmp" ) > 0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; rePicStr = rePicStr + "||" + tmpPicStr ;
　　　　　　　　　　　　　　　　　　　　　　&nbs p; int flagN2 = tmpPicStr.LastIndexOf( "/" );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; string fileN2 = picHtml + tmpPicStr.Substring( flagN2 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; sTotalSubM = sTotalSubM.Substring( 0,end_nums2 ) + ">******" + fileN2 + "******<" + sTotalSubM.Substring( end_nums2 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; begin_nums2 = sTotalSubM.IndexOf( "src=", end_nums2 + fileN2.Length + 22 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; begin_nums2 = sTotalSubM.IndexOf( "src=", end_nums2 + 4 );　　　　　　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　 } 　　　　　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 if ( rePicStr.Length > 2 )　
　　　　　　　　　　　　　　　　　　　 rePicStr =　 rePicStr.Substring(2);　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 //內容處理 格式化關鍵標記
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "<P" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<P" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<P" ) + 2 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "<p" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<p" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<p" ) + 2 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "</P" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "</P" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "</P" ) + 3 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "</p" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "</p" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "</p" ) + 3 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "<br" ) >=0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<br" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<br" ) + 3 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "<BR" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<BR" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<BR" ) + 3 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "<Br" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<Br" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<Br" ) + 3 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "<bR" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<bR" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<bR" ) + 3 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 //去除html標記
　　　　　　　　　　　　　　　 int linshiInt1 = sTotalSubM.IndexOf( "<" );
　　　　　　　　　　　　　　　 int linshiInt2 = sTotalSubM.IndexOf( ">" );　　　　　　　　　　　
　　　　　　　　　　　　　　　 if ( linshiInt2 < linshiInt1 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( linshiInt2 + 1 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 int linshiInt11 = sTotalSubM.LastIndexOf( "<" );
　　　　　　　　　　　　　　　 int linshiInt12 = sTotalSubM.LastIndexOf( ">" );
　　　　　　　　　　　　　　　 if ( linshiInt12 < linshiInt11 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,linshiInt12 + 1 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 linshiInt1 = sTotalSubM.IndexOf( "<" );
　　　　　　　　　　　　　　　 while ( linshiInt1 >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 linshiInt2 = sTotalSubM.IndexOf( ">",linshiInt1 );
　　　　　　　　　　　　　　　　　　　 if ( linshiInt2 >= 0 )
　　　　　　　　　　　　　　　　　　　 {　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,linshiInt1 ) + sTotalSubM.Substring( linshiInt2 + 1 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; sTotalSubM = sTotalSubM.Substring( 0,linshiInt1 );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 linshiInt1 = sTotalSubM.IndexOf("<");
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 //還原關鍵標記
　　　　　　　　　　　　　　　 int linshiInt3 = 0;
　　　　　　　　　　　　　　　 int linshiInt4 = 0;
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "+****+" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "+****+" ) ) + "<br>
" + sTotalSubM.Substring( sTotalSubM.IndexOf( "+****+" ) + 9 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "|****|" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "|****|" ) ) + "<br>
" + sTotalSubM.Substring( sTotalSubM.IndexOf( "|****|" ) + 9 );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 while( sTotalSubM.IndexOf( "******" ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 linshiInt3 = sTotalSubM.IndexOf( "******" ) + 9;
　　　　　　　　　　　　　　　　　　　 linshiInt4 = sTotalSubM.IndexOf( "******",linshiInt3 );
　　　　　　　　　　　　　　　　　　　 if ( linshiInt4 >= 0 )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; int tmpPos = sTotalSubM.IndexOf( "******" );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; string tmpStr1 = sTotalSubM.Substring( 0,tmpPos );
　　　　　　　　　　　　　　　　　　　　　　　 string tmpStr2 = sTotalSubM.Substring( linshiInt3,linshiInt4 - linshiInt3 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; string tmpStr3 = sTotalSubM.Substring( linshiInt4 + 9 );
　　　　　　　　　　　　　　　　　　　　　　&nb sp; sTotalSubM = tmpStr1 + "<img src=" + tmpStr2 + ">" + tmpStr3;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　　　　　 else
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; break;
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 //去除內容中的標題
　　　　　　　　　　　　　　　 if ( sTotalSubM.IndexOf( subTitle ) >= 0 )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( subTitle ) ) + sTotalSubM.Substring( sTotalSubM.IndexOf( subTitle ) + subTitle.Length );
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 reContentStr = sTotalSubM;
　　　　　　　　　　　　　　　 //調用下載圖片功能
　　　　　　　　　　　　　　　 //下載圖片到指定目錄
　　　　　　　　　　　　　　　 string[] img_Url = new PublicFun().split( rePicStr,"||" );
　　　　　　　　　　　　　　　 for ( int i=0;i<img_Url.Length;i++ )
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 if ( img_Url[i] != "" )
　　　　　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　　　　&nbs p; new PublicFun().Get_Img( img_Url[i],10000,root + "images" + img_Url[i].Substring( img_Url[i].LastIndexOf("/")+1 ) );
　　　　　　　　　　　　　　　　　　　 }
　　　　　　　　　　　　　　　 }
　　　　　　　　　　　 }
　　　　　　　　　　　 return reContentStr;
　　　　　　　 }

以上方法返回要取得的信息，包括標題內容，圖片地址等。
　　下載頁面中圖片：
雙擊代碼全選
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
//下載圖片
　　　　　　　 public void Get_Img(string a_strUrl,int timeout,string filepath)
　　　　　　　 {
　　　　　　　　　　　 try
　　　　　　　　　　　 {
HttpWebRequest myReq = (HttpWebRequest) HttpWebRequest.Create(a_strUrl) ;
　　　　　　　　　　　　　　　 myReq.Timeout = timeout;
　　　　　　　　　　　　　　　 HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();　　　　　　　　
　　　　　　　　　　　　　　　 Stream myStream = HttpWResp.GetResponseStream () ;　　　　　　　　　
　　　　　　　　　　　　　　　 Bitmap map = new Bitmap( myStream );
　　　　　　　　　　　　　　　 PictureBox picB = new PictureBox ();
　　　　　　　　　　　　　　　 picB.Image = (Image) map;
　　　　　　　　　　　　　　　 string path = filepath.Substring( 0,filepath.LastIndexOf( "" ) );
　　　　　　　　　　　　　　　 if (!Directory.Exists(path))
　　　　　　　　　　　　　　　 {
　　　　　　　　　　　　　　　　　　　 CreateDir( path );
　　　　　　　　　　　　　　　 } 　　　　　　　　　　　　　　
　　　　　　　　　　　　　　　 picB.Image.Save (filepath);　　　　　　　　　　　　　　　
　　　　　　　　　　　 }
　　　　　　　　　　　 catch(Exception exp)
　　　　　　　　　　　 {
　　　　　　　　　　　　　　　 string ss = exp.Message;
　　　　　　　 WriteLog( filepath.Substring(0,filepath.LastIndexOf("")) + "error.log",a_strUrl + "--" + ss + "
");　　　　　　　
　　　　　　　　　　　 }
　　　　　　　 }
　　保存文件或入庫
　　上面取得的信息可以按自己的要求保存。
　　****設計的時候沒有使用url按層次循 環抓取，這樣定義抓取url效率更高，速度更快。
　　注：此版本只提供靜態文件存儲功能，不提供數據庫接口，不提供自定義網站功能。
　　本程序運行需要先安 裝.net 框架1.1。

View Code

c# 抓取網頁類（獲取網頁中所有信息）

c# 抓取網頁類（獲取網頁中所有信息）
分類： c#程序設計2011-08-05 09:14 2362人閱讀 評論(4) 收藏 舉報
 
[csharp] view plaincopyprint?
1. using System;  
2. using System.Data;  
3. using System.Configuration;  
4. using System.Net;  
5. using System.IO;  
6. using System.Text;  
7. using System.Collections.Generic;  
8. using System.Text.RegularExpressions;  
9. using System.Threading;  
10. using System.Web;  
11. using System.Web.UI.MobileControls;  
12.     /// <summary>  
13.     /// 網頁類  
14.     /// </summary>  
15.     public class WebPage  
16.     {  
17.         #region 私有成員  
18.         private Uri m_uri;   //url  
19.         private List<Link> m_links;    //此網頁上的鏈接  
20.         private string m_title;        //標題  
21.         private string m_html;         //HTML代碼  
22.         private string m_outstr;       //網頁可輸出的純文本  
23.         private bool m_good;           //網頁是否可用  
24.         private int m_pagesize;       //網頁的大小  
25.         private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有網頁的Cookie  
26.         
27.         #endregion  
28.  
29.         #region 屬性  
30.   
31.         /// <summary>  
32.         /// 通過此屬性可獲得本網頁的網址，只讀  
33.         /// </summary>  
34.         public string URL  
35.         {  
36.             get  
37.             {  
38.                 return m_uri.AbsoluteUri;  
39.             }  
40.         }  
41.   
42.         /// <summary>  
43.         /// 通過此屬性可獲得本網頁的標題，只讀  
44.         /// </summary>  
45.         public string Title  
46.         {  
47.             get  
48.             {  
49.                 if (m_title == "")  
50.                 {  
51.                     Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);  
52.                     Match mc = reg.Match(m_html);  
53.                     if (mc.Success)  
54.                         m_title = mc.Groups["title"].Value.Trim();  
55.                 }  
56.                 return m_title;  
57.             }  
58.         }  
59.         public string M_html  
60.         {  
61.             get  
62.             {  
63.                 if (m_html == null)  
64.                 {  
65.                     m_html = "";  
66.                 }  
67.                 return m_html;  
68.             }  
69.         }  
70.         /// <summary>  
71.         /// 此屬性獲得本網頁的所有鏈接信息，只讀  
72.         /// </summary>  
73.         public List<Link> Links  
74.         {  
75.             get  
76.             {  
77.                 if (m_links.Count == 0) getLinks();  
78.                 return m_links;  
79.             }  
80.         }  
81.   
82.   
83.         /// <summary>  
84.         /// 此屬性返回本網頁的全部純文本信息，只讀  
85.         /// </summary>  
86.         public string Context  
87.         {  
88.             get  
89.             {  
90.                 if (m_outstr == "") getContext(Int16.MaxValue);  
91.                 return m_outstr;  
92.             }  
93.         }  
94.   
95.         /// <summary>  
96.         /// 此屬性獲得本網頁的大小  
97.         /// </summary>  
98.         public int PageSize  
99.         {  
100.             get  
101.             {  
102.                 return m_pagesize;  
103.             }  
104.         }  
105.         /// <summary>  
106.         /// 此屬性獲得本網頁的所有站內鏈接  
107.         /// </summary>  
108.         public List<Link> InsiteLinks  
109.         {  
110.             get  
111.             {  
112.                 return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue);  
113.             }  
114.         }  
115.   
116.         /// <summary>  
117.         /// 此屬性表示本網頁是否可用  
118.         /// </summary>  
119.         public bool IsGood  
120.         {  
121.             get  
122.             {  
123.                 return m_good;  
124.             }  
125.         }  
126.         /// <summary>  
127.         /// 此屬性表示網頁的所在的網站  
128.         /// </summary>  
129.         public string Host  
130.         {  
131.             get  
132.             {  
133.                 return m_uri.Host;  
134.             }  
135.         }  
136.         #endregion  
137.   
138.   
139.         /// <summary>  
140.         /// 從HTML代碼中分析出鏈接信息  
141.         /// </summary>  
142.         /// <returns>List<Link></returns>  
143.         private List<Link> getLinks()  
144.         {  
145.             if (m_links.Count == 0)  
146.             {  
147.                 Regex[] regex = new Regex[2];  
148.                 regex[0] = new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);  
149.                 regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);  
150.   
151.                 for (int i = 0; i < 2; i++)  
152.                 {  
153.                     Match match = regex[i].Match(m_html);  
154.                     while (match.Success)  
155.                     {  
156.                         try  
157.                         {  
158.                             string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);  
159.   
160.                             string text = "";  
161.                             if (i == 0) text = new Regex("(<[^>]+>)|(\\s)|( )|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");  
162.   
163.                             Link link = new Link();  
164.                             link.Text = text;  
165.                             link.NavigateUrl = url;  
166.   
167.                             m_links.Add(link);  
168.                         }  
169.                         catch (Exception ex) { Console.WriteLine(ex.Message); };  
170.                         match = match.NextMatch();  
171.                     }  
172.                 }  
173.             }  
174.             return m_links;  
175.         }  
176.         /// <summary>  
177.         /// 此私有方法從一段HTML文本中提取出一定字數的純文本  
178.         /// </summary>  
179.         /// <param name="instr">HTML代碼</param>  
180.         /// <param name="firstN">提取從頭數多少個字</param>  
181.         /// <param name="withLink">是否要鏈接里面的字</param>  
182.         /// <returns>純文本</returns>  
183.         private string getFirstNchar(string instr, int firstN, bool withLink)  
184.         {  
185.             if (m_outstr == "")  
186.             {  
187.                 m_outstr = instr.Clone() as string;  
188.                 m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
189.                 m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
190.                 m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
191.                 if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
192.                 Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)| ", RegexOptions.Multiline | RegexOptions.IgnoreCase);  
193.                 m_outstr = objReg.Replace(m_outstr, "");  
194.                 Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);  
195.                 m_outstr = objReg2.Replace(m_outstr, " ");  
196.   
197.             }  
198.             return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;  
199.         }  
200.  
201.  
202.         #region 公有文法  
203.         /// <summary>  
204.         /// 此公有方法提取網頁中一定字數的純文本，包括鏈接文字  
205.         /// </summary>  
206.         /// <param name="firstN">字數</param>  
207.         /// <returns></returns>  
208.         public string getContext(int firstN)  
209.         {  
210.             return getFirstNchar(m_html, firstN, true);  
211.         }  
212.   
213.         /// <summary>  
214.         /// 此公有方法從本網頁的鏈接中提取一定數量的鏈接，該鏈接的URL滿足某正則式  
215.         /// </summary>  
216.         /// <param name="pattern">正則式</param>  
217.         /// <param name="count">返回的鏈接的個數</param>  
218.         /// <returns>List<Link></returns>  
219.         public List<Link> getSpecialLinksByUrl(string pattern, int count)  
220.         {  
221.             if (m_links.Count == 0) getLinks();  
222.             List<Link> SpecialLinks = new List<Link>();  
223.             List<Link>.Enumerator i;  
224.             i = m_links.GetEnumerator();  
225.             int cnt = 0;  
226.             while (i.MoveNext() && cnt < count)  
227.             {  
228.                 if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.NavigateUrl).Success)  
229.                 {  
230.                     SpecialLinks.Add(i.Current);  
231.                     cnt++;  
232.                 }  
233.             }  
234.             return SpecialLinks;  
235.         }  
236.   
237.         /// <summary>  
238.         /// 此公有方法從本網頁的鏈接中提取一定數量的鏈接，該鏈接的文字滿足某正則式  
239.         /// </summary>  
240.         /// <param name="pattern">正則式</param>  
241.         /// <param name="count">返回的鏈接的個數</param>  
242.         /// <returns>List<Link></returns>  
243.         public List<Link> getSpecialLinksByText(string pattern, int count)  
244.         {  
245.             if (m_links.Count == 0) getLinks();  
246.             List<Link> SpecialLinks = new List<Link>();  
247.             List<Link>.Enumerator i;  
248.             i = m_links.GetEnumerator();  
249.             int cnt = 0;  
250.             while (i.MoveNext() && cnt < count)  
251.             {  
252.                 if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.Text).Success)  
253.                 {  
254.                     SpecialLinks.Add(i.Current);  
255.                     cnt++;  
256.                 }  
257.             }  
258.             return SpecialLinks;  
259.         }  
260.   
261.         /// <summary>  
262.         /// 這公有方法提取本網頁的純文本中滿足某正則式的文字  
263.         /// </summary>  
264.         /// <param name="pattern">正則式</param>  
265.         /// <returns>返回文字</returns>  
266.         public string getSpecialWords(string pattern)  
267.         {  
268.             if (m_outstr == "") getContext(Int16.MaxValue);  
269.             Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);  
270.             Match mc = regex.Match(m_outstr);  
271.             if (mc.Success)  
272.                 return mc.Groups[1].Value;  
273.             return string.Empty;  
274.         }  
275.         #endregion  
276.  
277.         #region 構造函數  
278.   
279.         private void Init(string _url)  
280.         {  
281.             try  
282.             {  
283.                 m_uri = new Uri(_url);  
284.                 m_links = new List<Link>();  
285.                 m_html = "";  
286.                 m_outstr = "";  
287.                 m_title = "";  
288.                 m_good = true;  
289.                 if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))  
290.                 {  
291.                     m_good = false;  
292.                     return;  
293.                 }  
294.                 HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);  
295.                 rqst.AllowAutoRedirect = true;  
296.                 rqst.MaximumAutomaticRedirections = 3;  
297.                 rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";  
298.                 rqst.KeepAlive = true;  
299.                 rqst.Timeout = 10000;  
300.                 lock (WebPage.webcookies)  
301.                 {  
302.                     if (WebPage.webcookies.ContainsKey(m_uri.Host))  
303.                         rqst.CookieContainer = WebPage.webcookies[m_uri.Host];  
304.                     else  
305.                     {  
306.                         CookieContainer cc = new CookieContainer();  
307.                         WebPage.webcookies[m_uri.Host] = cc;  
308.                         rqst.CookieContainer = cc;  
309.                     }  
310.                 }  
311.                 HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();  
312.                 Stream sm = rsps.GetResponseStream();  
313.                 if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)  
314.                 {  
315.                     rsps.Close();  
316.                     m_good = false;  
317.                     return;  
318.                 }  
319.                 Encoding cding = System.Text.Encoding.Default;  
320.                 string contenttype = rsps.ContentType.ToLower();  
321.                 int ix = contenttype.IndexOf("charset=");  
322.                 if (ix != -1)  
323.                 {  
324.                     try  
325.                     {  
326.                         cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));  
327.                     }  
328.                     catch  
329.                     {  
330.                         cding = Encoding.Default;  
331.                     }  
332.                      
333.                     //該處視情況而定 有的需要解碼  
334.                     //m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd());  
335.                     m_html = new StreamReader(sm, cding).ReadToEnd();  
336.                       
337.                 }  
338.                 else  
339.                 {  
340.                   //該處視情況而定 有的需要解碼  
341.                    //m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd());  
342.                       
343.                     m_html = new StreamReader(sm, cding).ReadToEnd();  
344.                     Regex regex = new Regex("charset=(?<cding>[^=]+)?\"", RegexOptions.IgnoreCase);  
345.                     string strcding = regex.Match(m_html).Groups["cding"].Value;  
346.                     try  
347.                     {  
348.                         cding = Encoding.GetEncoding(strcding);  
349.                     }  
350.                     catch  
351.                     {  
352.                         cding = Encoding.Default;  
353.                     }  
354.                     byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());  
355.                     m_html = cding.GetString(bytes);  
356.                     if (m_html.Split('?').Length > 100)  
357.                     {  
358.                         m_html = Encoding.Default.GetString(bytes);  
359.                     }  
360.                 }  
361.                 m_pagesize = m_html.Length;  
362.                 m_uri = rsps.ResponseUri;  
363.                 rsps.Close();  
364.             }  
365.             catch (Exception ex)  
366.             {  
367.                  
368.             }  
369.         }  
370.         public WebPage(string _url)  
371.         {  
372.             string uurl = "";  
373.             try  
374.             {  
375.                 uurl = Uri.UnescapeDataString(_url);  
376.                 _url = uurl;  
377.             }  
378.             catch { };  
379.             Init(_url);  
380.         }  
381.         #endregion  
382.     }

View Code

得到一個完整的文件列表中使用ftprequest緩慢

我想得到的文件名，文件的大小和最后修改時間每個文件服務器上，然后在一個完整的它。

它真的很好，直到我切換主機，現在真的很緩慢，盡管新的主機是一樣快，在客戶端。

沒有任何明顯的理由為何？

此外，它是非常必要把登錄憑據，每一次？

我使用的第一種方法得到一個字符串數組，然后遍歷並使用另一個在每個項目得到文件的大小：
public static string[] GetFileList()
    {
        string[] downloadFiles;
        StringBuilder result = new StringBuilder();
        FtpWebRequest request;
        try
        {
            request = (FtpWebRequest)FtpWebRequest.Create(new Uri("ftp://mysite.se/"));
            request.UseBinary = true;
            request.Credentials = new NetworkCredential(settings.Username, settings.Password);
            request.Method = WebRequestMethods.Ftp.ListDirectory;
            request.UseBinary = true;

            WebResponse response = request.GetResponse();
            StreamReader reader = new StreamReader(response.GetResponseStream());

            string line = reader.ReadLine();
            while (line != null)
            {
                result.Append(line);
                result.Append("\n");
                line = reader.ReadLine();
            }
            // to remove the trailing '\n'
            result.Remove(result.ToString().LastIndexOf('\n'), 1);
            reader.Close();
            response.Close();
            return result.ToString().Split('\n');
        }
        catch (Exception ex)
        {
            System.Windows.Forms.MessageBox.Show(ex.Message);
            downloadFiles = null;
            return downloadFiles;
        }
    }

    public static int GetFileSize(string file)
    {
        //MessageBox.Show("getting filesize...");

        StringBuilder result = new StringBuilder();
        FtpWebRequest request;
        try
        {
            request = (FtpWebRequest)FtpWebRequest.Create(new Uri("ftp://mysite.se/" + file));
            request.UseBinary = true;
            request.Credentials = new NetworkCredential(settings.Username, settings.Password);
            request.Method = WebRequestMethods.Ftp.GetFileSize;

            int dataLength = (int)request.GetResponse().ContentLength;

            return dataLength;
        }
        catch (Exception ex)
        {
            //System.Windows.Forms.MessageBox.Show(ex.Message);
            return 1337;
        }
    }

View Code

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 抓取網頁生成 PDF MFC抓取網頁代碼。怎樣抓取網頁內容利用 HtmlAgilityPack 抓取網頁動態抓取網頁信息實用網頁抓取 python抓取網頁圖片 Python 抓取網頁tag操作 Fiddler抓取微信網頁 Golang: 抓取網頁內容