采集58信息的一些總結


一個朋友問我能不能幫他做個小程序。抓取58上面包含"維修"的數據,比如公司名稱,電話號碼等等

 

打開58,搜索"維修"

 

 

 

 

單擊 房屋維修,進入一個列表頁面,

 

隨便單擊一個,進入詳細頁面

 

 需要請求58服務器3次。然后匹配html元素獲取自己需要的信息,數據匹配自然少不了正則表達式,用過的都知道,

對於我來說,寫正則表達式是非常頭疼的事情,所以可以選擇第三方庫:比如HtmlAgilityPack,Jumony等等,我這里選擇的是Jumony

博客園有對Jumony入門的文章: http://www.cnblogs.com/Ivony/archive/2010/12/19/jumony-guide-1.html

 

jumony直接安裝在項目中:

首先:選擇需要添加的項目,單擊引用,然后選擇管理NuGet程序包,在必要的情況下,需要升級NuGet

   

其次:搜索Jumony並安裝即可

 

 

 

 

先看看我實現的效果圖:因為公司比較忙,只能晚上回家寫寫,問題也是非常多。所以先記錄這兩天實現的效果

 

 

 

 

************************效果圖結束*****************************

 

 

 

主窗體:左側顯示的是在首頁匹配后的關鍵字。然后通過多線程去抓取每個列表頁面的信息。

 

看看我主窗體的布局

 

 

顯示數據的DataGridView是動態創建的。

來看看核心代碼:模擬請求58服務器,就要去觀察58的請求與響應,可以通過Fiddler2和Firebug抓包觀察

 

 

 

我根據我項目的需求封裝了一個HttpWebHelper類,

 

  /// <summary>
  /// Thin wrapper around <see cref="HttpWebRequest"/> used to download 58.com pages
  /// and to submit the captcha ("verification code") form.
  /// </summary>
  class HttpWebHelper
  {
      /// <summary>
      /// WebBrowser control that hosts the captcha page.
      /// </summary>
      public static WebBrowser webBrowser { get; set; }

      /// <summary>
      /// Unique id generated by the captcha page; it has to be posted together with the code.
      /// </summary>
      public static string uuid { get; set; }

      /// <summary>
      /// Result of the last captcha submission (true when the server answered "1").
      /// </summary>
      public static bool isPass { get; set; }

      /// <summary>
      /// Keywords found on the start page, mapped to their urls.
      /// </summary>
      public static Dictionary<string, string> list;

      /// <summary>
      /// Url prefix that is prepended to the list-page urls.
      /// </summary>
      public static string prefix;

      /// <summary>
      /// Url of the page that displays the captcha.
      /// </summary>
      public string codeUrl { get; set; }

      /// <summary>
      /// Url of the start page to scrape.
      /// </summary>
      public string dataUrl { get; set; }

      /// <summary>
      /// Url the captcha form is posted to.
      /// </summary>
      public static string verCode { get; set; }

      /// <summary>
      /// HTTP method used by <see cref="postVerCode"/>.
      /// </summary>
      public string Method { get; set; }

      /// <summary>
      /// Value of the Referer HTTP header.
      /// </summary>
      public string Referer { get; set; }

      /// <summary>
      /// Value of the Host HTTP header.
      /// </summary>
      public string Host { get; set; }

      /// <summary>
      /// Optional cookie container sent with the captcha post; a fresh one is used when null.
      /// </summary>
      public CookieContainer cookie { get; set; }

      /// <summary>
      /// Value of the Accept HTTP header.
      /// </summary>
      public string Accept { get; set; }

      /// <summary>
      /// Value of the User-Agent HTTP header.
      /// </summary>
      public string UserAgent { get; set; }

      /// <summary>
      /// Value of the Content-Type HTTP header.
      /// </summary>
      public string ContentType { get; set; }

      /// <summary>
      /// Value of the Accept-Language HTTP header.
      /// </summary>
      public string Accept_Language { get; set; }

      /// <summary>
      /// Encoding used for request bodies and for decoding responses (UTF-8 when null).
      /// </summary>
      public Encoding encoding { get; set; }

      public Image PictureBox { get; set; }

      /// <summary>
      /// Creates a helper preconfigured with the headers observed for the 58.com captcha request.
      /// </summary>
      public HttpWebHelper()
      {
          this.codeUrl = "http://support.58.com/firewall/valid/3071088800.do";

          Method = "post";
          Referer = "http://support.58.com/firewall/valid/3071088800.do";
          Host = "support.58.com";
          Accept_Language = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3";
          Accept = "*/*";
          ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
          UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0";
          encoding = Encoding.UTF8;
      }

      /// <summary>
      /// Creates a helper for the given start page. The browser and uuid arguments are
      /// accepted for compatibility but are not stored.
      /// </summary>
      /// <param name="webBrowser">Unused.</param>
      /// <param name="uuid">Unused.</param>
      /// <param name="dataUrl">Url of the start page to scrape.</param>
      public HttpWebHelper(WebBrowser webBrowser, string uuid, string dataUrl)
          : this() // chain so that encoding and the header defaults are never left null
      {
          this.dataUrl = dataUrl;
      }

      /// <summary>
      /// Posts the captcha text together with the page-generated uuid to <see cref="verCode"/>
      /// and stores the outcome in <see cref="isPass"/>.
      /// </summary>
      /// <param name="code">Captcha text typed by the user.</param>
      /// <param name="uuid">Unique id read from the captcha page.</param>
      public void postVerCode(string code, string uuid)
      {
          // Reset first: a failed request must never leave a stale "passed" flag behind.
          HttpWebHelper.isPass = false;
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(verCode);
              request.Method = Method;
              request.Referer = Referer;
              request.Headers.Add("X-Requested-With", "XMLHttpRequest");
              request.Host = Host;
              request.CookieContainer = cookie ?? new CookieContainer();
              request.Accept = Accept;
              request.ContentType = ContentType;
              request.Headers.Add("Accept-Language", Accept_Language);
              request.UserAgent = UserAgent;

              // Both values are user/page supplied, so both are url-encoded.
              string parameter = string.Format(
                  "inputcode={0}&namespace=infodetailweb&uuid={1}",
                  HttpUtility.UrlEncode(code),
                  HttpUtility.UrlEncode(uuid));

              // Encode the body with the same charset the Content-Type header announces.
              Encoding bodyEncoding = encoding ?? Encoding.UTF8;
              byte[] buffer = bodyEncoding.GetBytes(parameter);

              // The request stream must be closed before the response is requested.
              using (Stream requestStream = request.GetRequestStream())
              {
                  requestStream.Write(buffer, 0, buffer.Length);
              }

              string result;
              using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
              using (StreamReader reader = new StreamReader(response.GetResponseStream(), bodyEncoding))
              {
                  result = reader.ReadToEnd().Trim();
              }

              HttpWebHelper.isPass = result == "1";
          }
          catch (Exception ex)
          {
              // Show message and stack trace; the stack trace alone hides the actual error.
              MessageBox.Show(ex.ToString());
          }
      }

      /// <summary>
      /// Downloads a page with a plain GET request.
      /// </summary>
      /// <param name="url">Url of the page to download.</param>
      /// <returns>The trimmed response body, or an empty string when the request failed.</returns>
      public string webClient(string url)
      {
          string html = string.Empty;
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
              request.Method = "get";

              using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
              using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding ?? Encoding.UTF8))
              {
                  html = reader.ReadToEnd().Trim();
              }
          }
          catch (Exception ex)
          {
              MessageBox.Show(ex.ToString());
          }
          return html;
      }
  }

 

 

模擬請求類有了。接下來就是在返回的html中抓取關鍵字,這里是匹配包含 "維修" 的a 標簽

我封裝了一個方法,根據url和關鍵字抓取數據后。直接給窗體的控件listBoxMenu綁定數據

 

 /// <summary>
        /// Downloads the start page, collects every anchor whose text contains the keyword,
        /// fills the left-hand menu with the matches and starts one worker thread per match.
        /// </summary>
        /// <param name="url">Url of the start page to scrape.</param>
        /// <param name="keyword">Keyword the anchor text has to contain.</param>
        private void ProcessDownload(string url, string keyword)
        {
            this.Invoke(
                         new Action(() => { richTextBoxInfo.AppendText(url + "開始下載中......\n"); })
                        );

            // Download the start page; WebClient is disposable, so release it right away.
            string html;
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(url);
            }

            IHtmlDocument document = new JumonyParser().Parse(html);
            IEnumerable<IHtmlElement> result = document.Find("a").Where(t => t.InnerText().Contains(keyword));

            // Link text -> href. The text is the dictionary key, so duplicates must be
            // detected on the text; the first link with a given text wins.
            Dictionary<string, string> dir = new Dictionary<string, string>();
            foreach (var item in result)
            {
                var href = item.Attribute("href").Value();
                var text = item.InnerText();

                // An anchor without a target cannot be followed later on.
                if (string.IsNullOrEmpty(href)) continue;

                if (!dir.ContainsKey(text)) dir.Add(text, href);
            }

            // Fill the left-hand menu on the UI thread.
            this.Invoke(new Action(() =>
            {
                foreach (var item in dir)
                {
                    listBoxMenu.Items.Add(item.Key);
                }
            }));

            // Share the result with the worker threads.
            HttpWebHelper.list = dir;
            HttpWebHelper.prefix = url;

            // One dedicated thread per keyword (rather than the thread pool) so that each
            // thread can carry the keyword as its name.
            try
            {
                foreach (var item in dir)
                {
                    Thread thread = new Thread(ThreadDownload);
                    thread.Name = item.Key;
                    // Payload format is "keyword,url"; ThreadDownload splits it again.
                    thread.Start(item.Key + "," + item.Value);
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.StackTrace);
            }
        }

 

這個void ProcessDownload(string url, string keyword)有幾點注意。這個方法是異步調用的。所以在這里給窗體的控件賦值,就屬於跨線程操作UI,因為UI是在主線程中創建和繪制的

有關跨線程問題可以看此篇博文:http://www.cnblogs.com/nsky/p/4436309.html

可以看到里面是有用到線程池的 :ThreadPool,后來被我注釋了。因為我需要給線程命名。但線程池我沒找到此方法。是不是沒有呢?

在ProcessDownload方法里面。當首頁關鍵字匹配后,根據匹配的個數,開啟多線程執行詳細頁面抓取,首頁的關鍵字我保存在了字典里面

 Dictionary<string, string> dir = new Dictionary<string, string>(); 分別用關鍵字和關鍵字對應的url來存取key-value。在HttpWebHelper類中,我也定義了一個static的list字段來共享這個字典。

// Excerpt: starts one dedicated worker thread per entry of dir (keyword -> url).
try
            {
                foreach (var item in dir)
                {
                    // Thread-pool variant, disabled by the author in favour of named threads.
                    //ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadHtml), item.Key);

                    Thread thread = new Thread(ThreadDownload);
                    // The keyword doubles as the thread name.
                    thread.Name = item.Key;
                    // Key and value are handed over as a single "key,value" string.
                    thread.Start(item.Key + "," + item.Value);

                }
            }
            catch (Exception ex)
            {

                MessageBox.Show(ex.StackTrace);
            }

 這里把key-value傳值給ThreadDownload。

了解多線程可以看博文:http://www.cnblogs.com/nsky/p/4425286.html

 

 

首頁抓取關鍵字的方法有了。那還缺一個什么方法?還需要一個抓取顯示列表的頁面,這里取名為:ThreadDownload方法

  /// <summary>
  /// Worker-thread entry point: downloads the list page of one keyword, follows every
  /// row link to its detail page (asking the user to solve the captcha when 58 demands
  /// it), shows the collected rows in a tab named after the thread and finally exports
  /// them to "&lt;keyword&gt;.xls".
  /// </summary>
  /// <param name="obj">Payload in the form "keyword,url" as built by ProcessDownload.</param>
  private void ThreadDownload(object obj)
  {
      // 58 limits the request rate, so the workers run one after another. lock() releases
      // the monitor even when an exception escapes; otherwise one failure would block
      // every remaining worker forever. The lock object stays "this" as before.
      lock (this)
      {
          // Split into two parts only, so a url that contains ',' stays intact.
          string[] ob = obj.ToString().Split(new[] { ',' }, 2);
          this.Invoke(
                       new Action(() => { richTextBoxInfo.AppendText(string.Format("正在抓取:{0}\n", ob[0])); })
                   );
          string prefix = HttpWebHelper.prefix;

          HttpWebHelper client = new HttpWebHelper();
          client.encoding = Encoding.UTF8;

          DataTable dt = new DataTable();
          dt.Columns.Add("公司名字", typeof(string));
          dt.Columns.Add("聯系人", typeof(string));
          dt.Columns.Add("聯系電話", typeof(string));

          // Download the list page of this keyword.
          string fullurl = string.Format("{0}{1}", prefix, ob[1]);
          string html = client.webClient(fullurl);

          IHtmlDocument document = new JumonyParser().Parse(html);
          IEnumerable<IHtmlElement> result = document.Find("table[id=jingzhun]");

          var items = result.Find("tr");

          // Captured once; used as tab key and tab title on the UI thread.
          string name = Thread.CurrentThread.Name;

          foreach (var o in items)
          {
              // Rows without a link have no detail page.
              if (!o.Find("a").Any())
              {
                  continue;
              }

              // Url of the detail page. When the server considers the requests too
              // frequent it redirects this url to a captcha page instead.
              string referer = o.FindFirst("a").Attribute("href").Value();

              // Wait 5 seconds to keep the request rate low; tune to the environment.
              Thread.Sleep(5000);

              string sonHtml = client.webClient(referer);

              if (sonHtml.Contains("驗證碼"))
              {
                  // Request the url again to learn where the server redirected to, e.g.
                  // http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml
                  HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer);
                  request.Method = "get";
                  string responseUrl = string.Empty;
                  string rediect = string.Empty;
                  using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
                  {
                      responseUrl = response1.ResponseUri.ToString();

                      // Path only, e.g. "/firewall/valid/1032910901.do".
                      string absolutePath = response1.ResponseUri.AbsolutePath;

                      // Authority is e.g. "support.58.com"; together they form the url
                      // the captcha has to be posted to.
                      HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath;

                      // Everything after the last '=' of the query is the url to load
                      // once the captcha has been accepted.
                      string query = response1.ResponseUri.Query;
                      rediect = query.Substring(query.LastIndexOf("=") + 1);
                  }

                  // The captcha dialog has to be shown on the UI thread.
                  this.Invoke(new Action(() =>
                  {
                      // Forms shown with ShowDialog are not disposed automatically.
                      using (showCode code = new showCode())
                      {
                          code.codeHandler = new HttpWebHelper().postVerCode;
                          code.showCodeUrl = responseUrl;
                          if (code.ShowDialog() == DialogResult.OK)
                          {
                              code.Hide();
                              if (HttpWebHelper.isPass)
                              {
                                  sonHtml = client.webClient(rediect);

                                  getTable(sonHtml, ref dt);
                              }
                          }
                      }
                  }));
              }
              else
              {
                  getTable(sonHtml, ref dt);
              }

              this.Invoke(new Action(() =>
              {
                  // Create the tab for this keyword on first use.
                  if (!tabControlWarp.TabPages.ContainsKey(name))
                      tabControlWarp.TabPages.Add(name, name);

                  // Reuse the grid of the tab instead of stacking a new one per row.
                  TabPage page = tabControlWarp.TabPages[name];
                  DataGridView view = page.Controls.OfType<DataGridView>().FirstOrDefault();
                  if (view == null)
                  {
                      view = CreateResultGrid();
                      page.Controls.Add(view);
                  }
                  if (!ReferenceEquals(view.DataSource, dt))
                      view.DataSource = dt;

                  // Repaint the form, otherwise the grid does not show the new data.
                  this.Refresh();
              }));
          }

          // This keyword is finished: export the collected rows to Excel.
          ExcelRender.ExcelRender.RenderToExcel(dt, ob[0] + ".xls");
      }
  }

  /// <summary>
  /// Creates the read-only, full-row-select grid used to display the scraped rows.
  /// </summary>
  private static DataGridView CreateResultGrid()
  {
      DataGridView view = new DataGridView();
      view.AllowUserToAddRows = false;
      view.AllowUserToDeleteRows = false;
      view.AllowUserToResizeColumns = false;
      view.AllowUserToResizeRows = false;
      view.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.Fill;
      view.ColumnHeadersHeightSizeMode = DataGridViewColumnHeadersHeightSizeMode.AutoSize;
      view.MultiSelect = false;
      view.ReadOnly = true;
      view.RowHeadersVisible = false;
      view.BackgroundColor = Color.White;
      view.ScrollBars = ScrollBars.Vertical;
      view.SelectionMode = DataGridViewSelectionMode.FullRowSelect;
      view.Dock = DockStyle.Fill;
      return view;
  }

 

 

 這個地方有一個難點就是,如果你采集的頻率過高,58會跳轉到一個驗證碼登錄頁面。這里本來是用多線程執行異步任務,

但:比如同時在執行采集 "手機維修"和"電腦維修"的時候。只要"手機維修"遇到驗證碼的時候,顯然"電腦維修"也會遇到。會有很多不確定的因素,

因為是多線程異步操作,當我彈窗讓用戶輸入驗證碼的代碼,同樣會執行多次。

所以我采取了線程同步。我用了 Monitor.Enter(this);實現同步。當然你也可以用更簡單的lock關鍵字實現同樣的效果。

 

 

 

說到驗證碼。58算是下了大功夫,都知道58信息量的巨大。采集的人肯定多。58驗證碼的機制是。當跳轉到驗證碼登錄頁面,

頁面會生成唯一的uuid,以及一個用於把驗證碼post到服務器的url;這個url和顯示驗證碼的url有相關聯的信息,下面會說明

從圖片中可以看出來,顯示驗證碼中的url和post到服務器中的url都包含 1032910901。這是重點,當你提交驗證碼的時候,服務器會驗證 這個 數字 和uuid如果不匹配則驗證錯誤。

你要記住:這個數字和uuid每次都是不同的。

 

那我這里是怎么顯示驗證碼的呢?

 首先我是用最普通也是最大眾的方式。

用HttpWebRequest讀取,其實當HttpWebRequest讀取的時候,服務器的驗證碼已經變了。

當跳轉到驗證碼登錄頁面。服務器就已經記住了uuid,url中的數字 和驗證碼,當你用HttpWebRequest去獲取驗證碼肯定

和之前的驗證碼不同。

除了這種方式,網上也提到了好幾種方式,這里驗證成功后,有一個回調方法

可以通過HttpWebResponse獲取響應請求的url。比如

 1                         HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer);
 2                         request.Method = "get";
 3                         string responseUrl = string.Empty;
 4                         string rediect = string.Empty;
 5                         using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
 6                         {
 7                             //"http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml"
 8                             responseUrl = response1.ResponseUri.ToString();
 9 
10                             //獲取絕對路徑 "/firewall/valid/1032910901.do"
11                             string absolutePath = response1.ResponseUri.AbsolutePath;
12 
13                             //ResponseUri.Authority  "support.58.com"
14                             HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath;
15 
16                             //獲取?后面的字符串
17                             string query = response1.ResponseUri.Query;
18 
19                             //驗證碼成功后,重定向的url
20                             rediect = query.Substring(query.LastIndexOf("=") + 1);
21                         }

 

 

第一種:頁面在WebBrowser中打開。讀取驗證碼圖片流。保存在剪切板中

 

 /// <summary>
 /// Returns the picture shown by an IMG element of a WebBrowser page by copying the
 /// element to the clipboard and reading the image back.
 /// </summary>
 /// <param name="WebCtl">WebBrowser control that displays the page.</param>
 /// <param name="ImgeTag">The IMG element to capture.</param>
 /// <returns>The copied image, or null when the clipboard holds no image after the copy.</returns>
 private Image GetWebImage(WebBrowser WebCtl, HtmlElement ImgeTag)
 {
     /*
      * The result can be null when the picture has not finished loading (here it is
      * assigned by script). A fixed Thread.Sleep(8000) worked in tests, but the
      * required delay is not deterministic.
      */
     HTMLDocument doc = (HTMLDocument)WebCtl.Document.DomDocument;
     HTMLBody body = (HTMLBody)doc.body;
     IHTMLControlRange rang = (IHTMLControlRange)body.createControlRange();
     IHTMLControlElement Img = (IHTMLControlElement)ImgeTag.DomElement;

     // Remember the image that was on the clipboard so it can be put back afterwards.
     Image oldImage = Clipboard.GetImage();

     rang.add(Img);
     rang.execCommand("Copy", false, null); // copies the element to the clipboard
     Image numImage = Clipboard.GetImage();

     // Restore only when there was an image before: Clipboard.SetImage(null) throws.
     if (oldImage != null)
     {
         try
         {
             Clipboard.SetImage(oldImage);
         }
         catch (Exception ex)
         {
             MessageBox.Show(ex.Message);
         }
     }
     return numImage;
 }

 

 調用代碼:

1            //找到圖片
2             HtmlElement ImgeTag = webBrowser1.Document.GetElementById("imgCode");
3             
4             Image numPic = GetWebImage(webBrowser1, ImgeTag); // 得到驗證碼圖片
5             pictureBox1.Image = numPic; //圖片賦值

 

 

HTMLDocument需要添加引用:F:\Program Files (x86)\Microsoft Visual Studio 12.0\Visual Studio Tools for Office\PIA\Common\Microsoft.mshtml.dll

引入命名空間:using mshtml;

 

顯然。頁面必須加載完成后才能獲取到圖片。即在事件中webBrowser1_DocumentCompleted獲取。但它卻不能判斷js腳本什么時候完成。

如果是多線程異步任務,還需要webBrowser1_DocumentCompleted執行后,在執行后面的方法,因為webBrowser1_DocumentCompleted本身就是異步的

此時的解決方案是 利用AutoResetEvent阻止線程,等當前線程執行完畢

 AutoResetEvent waitHandle = new AutoResetEvent(false);
 while (waitHandle.WaitOne(10, false) == false) { Application.DoEvents(); }

 

 

 

第二種:抓圖。根據圖片的高寬來剪切

首先動態創建WebBrowser,並注冊事件

 WebBrowser we = new WebBrowser();
            we.Url = new Uri("http://support.58.com/firewall/valid/3071088800.do");
            we.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(we_DocumentCompleted);

 

 

 /// <summary>
 /// Once the captcha page has loaded, moves the captcha image to the top-left corner
 /// of the page and draws that region of the browser into pictureBox1.
 /// </summary>
 /// <param name="sender">The WebBrowser that raised the event.</param>
 /// <param name="e">Event data (unused).</param>
 void we_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
 {
     WebBrowser browser = (WebBrowser)sender;
     if (browser.Document == null)
     {
         return;
     }

     // Direct lookup instead of scanning every element of the document.
     HtmlElement img = browser.Document.GetElementById("imgCode");
     if (img == null)
     {
         return;
     }

     // Pin the image to the top-left corner so it sits at (0, 0) of the capture.
     img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px";

     // The Bitmap constructor throws for a zero-sized image, which happens while
     // the picture has not been laid out yet.
     Size size = img.ClientRectangle.Size;
     if (size.Width <= 0 || size.Height <= 0)
     {
         return;
     }

     var b = new Bitmap(size.Width, size.Height);
     browser.DrawToBitmap(b, new Rectangle(new Point(), size));
     pictureBox1.Image = b;
 }

 

 

第二種有個注意的地方:WebBrowser必須動態創建但不能依附於窗體上,即不將WebBrowser加載到窗體,否則截取后的圖片是顯示白色的。我也不知道什么原因

第3種:是根據第二種演化而來的,也是我當前用的。感覺有些投機取巧

你可以到顯示驗證碼頁面查看驗證碼圖片的大小,也就是高度和寬度,然后新建一個顯示驗證碼的窗體,我這里取名為showCode

在showCode上放一個webBrowser,高度和寬度設置為驗證碼圖片的高度和寬度。比如:

AllowWebBrowserDrop=false //控件不能拖動
ScrollBarsEnabled = false //取消滾動條
size = 120,40 驗證碼圖片的寬度和高度

然后找到webbrowser中的圖片。設置樣式。使其顯示在最左上角

img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px";

窗體布局:

核心代碼

 /// <summary>
 /// Modal dialog that shows the 58.com captcha page in an embedded WebBrowser and hands
 /// the typed code plus the page's uuid to <see cref="codeHandler"/>.
 /// </summary>
 public partial class showCode : Form
 {
     public Image p { get; set; }

     /// <summary>Url of the page that displays the captcha; set by the caller before showing.</summary>
     public string showCodeUrl { get; set; }

     /// <summary>Callback signature: captcha text and the uuid read from the page.</summary>
     public delegate void delegateCode(string code, string uuid);

     /// <summary>Callback invoked when the user confirms a non-empty code.</summary>
     public delegateCode codeHandler;

     public showCode()
     {
         InitializeComponent();
     }

     /// <summary>
     /// Moves the captcha image to the top-left corner of the page and disables its
     /// click-to-refresh behaviour once the document has loaded.
     /// </summary>
     void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
     {
         WebBrowser bro = (WebBrowser)sender;
         if (bro.Document == null)
         {
             return;
         }

         // The page may not contain the image (error page, changed layout).
         HtmlElement img = bro.Document.GetElementById("imgCode");
         if (img == null)
         {
             return;
         }

         img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px"; // pin to the top-left corner
         img.SetAttribute("onclick", "javascript:void(0)"); // clicking must not refresh the captcha
     }

     /// <summary>
     /// Validates the input, reads the uuid from the page and forwards both to the callback.
     /// </summary>
     private void btnOk_Click(object sender, EventArgs e)
     {
         string code = textCode.Text;
         if (string.IsNullOrEmpty(code))
         {
             MessageBox.Show("請輸入驗證碼", "驗證碼", MessageBoxButtons.OK, MessageBoxIcon.Information);
             textCode.Focus();
             return;
         }
         if (codeHandler != null)
         {
             // The uuid only exists once the captcha page has finished loading.
             HtmlElement uuidElement = webBrowser.Document == null
                 ? null
                 : webBrowser.Document.GetElementById("uuid");
             if (uuidElement == null)
             {
                 MessageBox.Show("驗證碼頁面尚未加載完成,請稍后再試", "驗證碼", MessageBoxButtons.OK, MessageBoxIcon.Information);
                 return;
             }
             string uuid = uuidElement.GetAttribute("value");

             this.DialogResult = DialogResult.OK;
             codeHandler(code, uuid);
         }
     }

     /// <summary>
     /// Starts loading the captcha page when the dialog opens.
     /// </summary>
     private void showCode_Load(object sender, EventArgs e)
     {
         // Subscribe before navigating so the completion event cannot be missed.
         this.webBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted);
         webBrowser.Url = new Uri(showCodeUrl);
     }
 }

 

 

我這里定義了一個委托。利用回調機制,把驗證碼和uuid傳給主窗體,這里顯示驗證碼的url由主窗體傳進來。

 

當遇到驗證碼的時候,就會彈窗,如果能做到自動識別就更好了。

 

當由列表頁面抓取詳細頁面的時候,返回的html就是驗證碼頁面的源碼,這時候判斷html中是否包含“驗證碼”關鍵字,

包含的話。則實例化窗口。把顯示驗證碼的url傳給顯示驗證碼的窗體,並顯示。

 

    showCode code = new showCode();
    code.codeHandler = new HttpWebHelper().postVerCode; //子窗體委托回調方法
    code.showCodeUrl = responseUrl;//子窗體顯示驗證碼的url

 

//Wait 5 seconds so the crawl rate stays low; tune the delay for your environment
                    Thread.Sleep(5000);

                    //Fetch the detail page. When requests are too frequent the server answers with a captcha page instead
                    string sonHtml = client.webClient(referer);

                    //Monitor.Enter(this);

                    if (sonHtml.Contains("驗證碼"))
                    {
                        //This part could be wrapped in its own method.
                        /*
                         * After hitting the captcha, request the page once more to learn where we were
                         * redirected to: the final URI carries both the address to post the captcha to
                         * and the URL to go back to once the captcha has been accepted (see rediect below).
                         */
                        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer);
                        request.Method = "get";
                        string responseUrl = string.Empty;
                        string rediect = string.Empty;
                        using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
                        {
                            //e.g. "http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml"
                            Uri captchaUri = response1.ResponseUri;
                            responseUrl = captchaUri.ToString();

                            //Scheme + authority + path ("http://support.58.com/firewall/valid/1032910901.do"):
                            //the address the captcha is posted to. GetLeftPart keeps the scheme of the
                            //actual response instead of assuming "http://".
                            HttpWebHelper.verCode = captchaUri.GetLeftPart(UriPartial.Path);

                            //Everything from '?' onwards
                            string query = captchaUri.Query;

                            //URL to return to after the captcha was accepted. Prefer the "url" parameter so a
                            //target that itself contains '=' is not truncated (assumes "url" is the last
                            //parameter, as in the sample above); otherwise fall back to the text after the last '='.
                            int marker = query.IndexOf("&url=", StringComparison.Ordinal);
                            if (marker < 0)
                            {
                                marker = query.IndexOf("?url=", StringComparison.Ordinal);
                            }

                            if (marker >= 0)
                            {
                                rediect = query.Substring(marker + "&url=".Length);
                            }
                            else
                            {
                                rediect = query.Substring(query.LastIndexOf("=") + 1);
                            }
                        }

                        this.Invoke(new Action(() =>
                        {
                            //A form shown with ShowDialog is not disposed when it closes, so dispose it explicitly.
                            using (showCode code = new showCode())
                            {
                                code.codeHandler = new HttpWebHelper().postVerCode;//callback the child form invokes
                                code.showCodeUrl = responseUrl; //captcha page the child form displays

                                //isPass is checked after the dialog returned OK; it tells whether the captcha was accepted
                                if (code.ShowDialog() == DialogResult.OK && HttpWebHelper.isPass)
                                {
                                    sonHtml = client.webClient(rediect);

                                    getTable(sonHtml, ref dt);
                                }
                            }
                        }));
                    }

 

 

好了。現在回到之前的問題上。現在需要抓取詳細頁面的數據,上面說了ThreadDownload只是抓取列表頁面。

現在定義一個方法DataTable getTable(string document, ref DataTable dt),這里的dt是ref類型。是之前需要用的。好像現在已經用不上了。大家可以根據自己的要求修改

getTable方法是接收傳來的詳細頁面。然后匹配信息:比如:用戶名,手機號碼,公司名稱

 1   private DataTable getTable(string document, ref DataTable dt)
 2         {
 3             try
 4             {
 5                 //if (IsDisposed) return null;
 6                 //this.Invoke(
 7                 //           new Action(() => { richTextBoxInfo.AppendText("正在下載\n"); })
 8                 //       );
 9 
10                 IHtmlDocument hd = new JumonyParser().Parse(document);
11                 //string company = hd.FindFirst("div[class=su_tit]").InnerText();
12 
13                 string company = "未知";
14                 string phone = "未知";
15                 string linkman = "未知";
16 
17                 //判斷是個人還是企業
18                 var su = hd.Find("ul[class=suUl]");
19 
20                 //頂部html包含聯系人。電話
21                 IHtmlDocument top = new JumonyParser().Parse(hd.FindFirst("ul[class=suUl]").InnerHtml());
22 
23                 if (su.Count() > 0)
24                 {
25                     if (top.Find("div[class=su_tit]").Count() > 0)
26                     {
27                         string txt = top.FindFirst("div[class=su_tit]").InnerText();
28                         if (txt.Contains("公司名稱"))
29                         {
30                             if (top.Find("div[class=su_con]").Count() > 0)
31                                 //company = top.FindFirst("div[class=su_con]").FindFirst("a").InnerText();
32                                 company = top.FindFirst("div[class=su_con]").InnerText().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0];
33                             if (top.Find("li:nth-child(1)").Count() > 0)
34                                 linkman = top.FindFirst("li:nth-child(2)").FindFirst("div[class=su_con]").FindFirst("a").InnerText();
35                             if (top.Find("span[class=l_phone]").Count() > 0)
36                                 phone = top.FindFirst("span[class=l_phone]").InnerText();
37                         }
38                         else if (txt.Contains("聯系人"))
39                         {
40                             if (top.Find("li:nth-child(1)").Count() > 0)
41                                 linkman = top.FindFirst("li:nth-child(1)").FindFirst("div[class=su_con]").InnerText();
42                             if (top.Find("li:nth-child(2)").Count() > 0)
43                                 phone = top.FindFirst("li:nth-child(2)").FindFirst("span[id=t_phone]").InnerText();
44                         }
45                     }
46                 }
47 
48                 DataRow row = dt.NewRow();
49                 row["公司名字"] = company;
50                 row["聯系電話"] = phone;
51                 row["聯系人"] = linkman;
52 
53                 dt.Rows.Add(row);
54 
55 
56                 return dt;
57             }
58             catch (Exception)
59             {
60 
61                 return null;
62             }
63         }

 

 

來看看入口函數:這里開啟異步調用,目的是不讓窗體假死。

 /// <summary>
        /// Start button handler: validates the URL and keyword inputs, then runs
        /// ProcessDownload(url, keyword) asynchronously so the window stays responsive.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (not used).</param>
        void btnStart_Click(object sender, EventArgs e)
        {
            string url = textBoxUrl.Text;
            string keyword = textBoxKeyword.Text;

            if (string.IsNullOrEmpty(url))
            {
                MessageBox.Show("請輸入要抓取的網址", "網址", MessageBoxButtons.OK, MessageBoxIcon.Information);
                textBoxUrl.Focus();
                return;
            }

            if (string.IsNullOrEmpty(keyword))
            {
                MessageBox.Show("請輸入要抓取的關鍵字", "關鍵字", MessageBoxButtons.OK, MessageBoxIcon.Information);
                textBoxKeyword.Focus();
                return;
            }

            // Disable the button only once the input is accepted; disabling it up front
            // left it permanently disabled whenever validation rejected the input.
            btnStart.Enabled = false;

            // Delegate that performs the download off the UI thread.
            Action downloadAction = new Action(() =>
            {
                ProcessDownload(url, keyword);
            });

            // Completion callback. EndInvoke must be called for every BeginInvoke; it also
            // rethrows whatever ProcessDownload threw, so a failure is no longer reported as success.
            AsyncCallback callback = new AsyncCallback((asyncResult) =>
            {
                string message = "首頁關鍵字匹配完成,顯示在左側列表中.....\n";
                bool failed = false;
                try
                {
                    downloadAction.EndInvoke(asyncResult);
                }
                catch (Exception ex)
                {
                    failed = true;
                    message = "首頁關鍵字匹配失敗:" + ex.Message + "\n";
                }

                // The callback does not run on the UI thread, so marshal the UI updates back.
                this.Invoke(
                         new Action(() =>
                         {
                             richTextBoxInfo.AppendText(message);
                             if (failed)
                             {
                                 // Let the user try again after a failed run.
                                 btnStart.Enabled = true;
                             }
                         })
                     );
            });
            downloadAction.BeginInvoke(callback, null);
        }

 

 

其余代碼

 

 /// <summary>
        /// Asks the user for confirmation before the main window closes.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Closing arguments; Cancel is set when the user answers "No".</param>
        void Main_FormClosing(object sender, FormClosingEventArgs e)
        {
            DialogResult answer = MessageBox.Show("是否退出當前程序", "關閉", MessageBoxButtons.YesNo, MessageBoxIcon.Question);

            if (answer == DialogResult.No)
            {
                // Keep the window open.
                e.Cancel = true;
                return;
            }

            // Terminate the whole process, which also ends all running threads.
            Environment.Exit(0);
        }

        /// <summary>
        /// Click on the left-hand menu: selects the tab page whose key equals the list box's
        /// current text. Nothing happens when no such tab page exists.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Mouse event data (not used).</param>
        void listBoxMenu_MouseClick(object sender, MouseEventArgs e)
        {
            string selectedKey = listBoxMenu.Text;

            // Only an existing tab page can be brought to the front; tabs are not created here.
            if (string.IsNullOrEmpty(selectedKey) || !tabControlWarp.TabPages.ContainsKey(selectedKey))
            {
                return;
            }

            tabControlWarp.SelectedTab = tabControlWarp.TabPages[selectedKey];
        }

 

 

項目中用到了NPOI導出excel,這里附上相關幫助類

  1  public class ExcelRender
  2     {
  3         /// <summary>
  4         /// 根據Excel列類型獲取列的值
  5         /// </summary>
  6         /// <param name="cell">Excel列</param>
  7         /// <returns></returns>
  8         private static string GetCellValue(ICell cell)
  9         {
 10             if (cell == null)
 11                 return string.Empty;
 12             switch (cell.CellType)
 13             {
 14                 case CellType.BLANK:
 15                     return string.Empty;
 16                 case CellType.BOOLEAN:
 17                     return cell.BooleanCellValue.ToString();
 18                 case CellType.ERROR:
 19                     return cell.ErrorCellValue.ToString();
 20                 case CellType.NUMERIC:
 21                 case CellType.Unknown:
 22                 default:
 23                     return cell.ToString();//This is a trick to get the correct value of the cell. NumericCellValue will return a numeric value no matter the cell value is a date or a number
 24                 case CellType.STRING:
 25                     return cell.StringCellValue;
 26                 case CellType.FORMULA:
 27                     try
 28                     {
 29                         HSSFFormulaEvaluator e = new HSSFFormulaEvaluator(cell.Sheet.Workbook);
 30                         e.EvaluateInCell(cell);
 31                         return cell.ToString();
 32                     }
 33                     catch
 34                     {
 35                         return cell.NumericCellValue.ToString();
 36                     }
 37             }
 38         }
 39 
 40         /// <summary>
 41         /// 自動設置Excel列寬
 42         /// </summary>
 43         /// <param name="sheet">Excel表</param>
 44         private static void AutoSizeColumns(ISheet sheet)
 45         {
 46            
 47             if (sheet.PhysicalNumberOfRows > 0)
 48             {
 49                 IRow headerRow = sheet.GetRow(0);
 50 
 51                 for (int i = 0, l = headerRow.LastCellNum; i < l; i++)
 52                 {
 53                     sheet.AutoSizeColumn(i);
 54                 }
 55             }
 56         }
 57 
 58         /// <summary>
 59         /// 保存Excel文檔流到文件
 60         /// </summary>
 61         /// <param name="ms">Excel文檔流</param>
 62         /// <param name="fileName">文件名</param>
 63         private static void SaveToFile(MemoryStream ms, string fileName)
 64         {
 65             using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
 66             {
 67                 byte[] data = ms.ToArray();
 68 
 69                 fs.Write(data, 0, data.Length);
 70                 fs.Flush();
 71 
 72                 data = null;
 73             }
 74         }
 75 
 76         /// <summary>
 77         /// 輸出文件到瀏覽器
 78         /// </summary>
 79         /// <param name="ms">Excel文檔流</param>
 80         /// <param name="context">HTTP上下文</param>
 81         /// <param name="fileName">文件名</param>
 82         private static void RenderToBrowser(MemoryStream ms, HttpContext context, string fileName)
 83         {
 84             if (context.Request.Browser.Browser == "IE")
 85                 fileName = HttpUtility.UrlEncode(fileName);
 86             context.Response.AddHeader("Content-Disposition", "attachment;fileName=" + fileName);
 87             context.Response.BinaryWrite(ms.ToArray());
 88         }
 89 
 90         /// <summary>
 91         /// DataReader轉換成Excel文檔流
 92         /// </summary>
 93         /// <param name="reader"></param>
 94         /// <returns></returns>
 95         public static MemoryStream RenderToExcel(IDataReader reader)
 96         {
 97             MemoryStream ms = new MemoryStream();
 98 
 99             using (reader)
100             {
101                 using (IWorkbook workbook = new HSSFWorkbook())
102                 {
103                     using (ISheet sheet = workbook.CreateSheet())
104                     {
105                         IRow headerRow = sheet.CreateRow(0);
106                         int cellCount = reader.FieldCount;
107 
108                         // handling header.
109                         for (int i = 0; i < cellCount; i++)
110                         {
111                             headerRow.CreateCell(i).SetCellValue(reader.GetName(i));
112                         }
113 
114                         // handling value.
115                         int rowIndex = 1;
116                         while (reader.Read())
117                         {
118                             IRow dataRow = sheet.CreateRow(rowIndex);
119 
120                             for (int i = 0; i < cellCount; i++)
121                             {
122                                 dataRow.CreateCell(i).SetCellValue(reader[i].ToString());
123                             }
124 
125                             rowIndex++;
126                         }
127 
128                         AutoSizeColumns(sheet);
129 
130                         workbook.Write(ms);
131                         ms.Flush();
132                         ms.Position = 0;
133                     }
134                 }
135             }
136             return ms;
137         }
138 
139         /// <summary>
140         /// DataReader轉換成Excel文檔流,並保存到文件
141         /// </summary>
142         /// <param name="reader"></param>
143         /// <param name="fileName">保存的路徑</param>
144         public static void RenderToExcel(IDataReader reader, string fileName)
145         {
146             using (MemoryStream ms = RenderToExcel(reader))
147             {
148                 SaveToFile(ms, fileName);
149             }
150         }
151 
152         /// <summary>
153         /// DataReader轉換成Excel文檔流,並輸出到客戶端
154         /// </summary>
155         /// <param name="reader"></param>
156         /// <param name="context">HTTP上下文</param>
157         /// <param name="fileName">輸出的文件名</param>
158         public static void RenderToExcel(IDataReader reader, HttpContext context, string fileName)
159         {
160             using (MemoryStream ms = RenderToExcel(reader))
161             {
162                 RenderToBrowser(ms, context, fileName);
163             }
164         }
165 
166         /// <summary>
167         /// DataTable轉換成Excel文檔流
168         /// </summary>
169         /// <param name="table"></param>
170         /// <returns></returns>
171         public static MemoryStream RenderToExcel(DataTable table)
172         {
173             MemoryStream ms = new MemoryStream();
174 
175             using (table)
176             {
177                 using (IWorkbook workbook = new HSSFWorkbook())
178                 {
179                     using (ISheet sheet = workbook.CreateSheet())
180                     {
181                         IRow headerRow = sheet.CreateRow(0);
182 
183                         // handling header.
184                         foreach (DataColumn column in table.Columns)
185                             headerRow.CreateCell(column.Ordinal).SetCellValue(column.Caption);//If Caption not set, returns the ColumnName value
186 
187                         // handling value.
188                         int rowIndex = 1;
189 
190                         foreach (DataRow row in table.Rows)
191                         {
192                             IRow dataRow = sheet.CreateRow(rowIndex);
193 
194                             foreach (DataColumn column in table.Columns)
195                             {
196                                 dataRow.CreateCell(column.Ordinal).SetCellValue(row[column].ToString());
197                             }
198 
199                             rowIndex++;
200                         }
201                         AutoSizeColumns(sheet);
202 
203                         workbook.Write(ms);
204                         ms.Flush();
205                         ms.Position = 0;
206                     }
207                 }
208             }
209             return ms;
210         }
211 
212         /// <summary>
213         /// DataTable轉換成Excel文檔流,並保存到文件
214         /// </summary>
215         /// <param name="table"></param>
216         /// <param name="fileName">保存的路徑</param>
217         public static void RenderToExcel(DataTable table, string fileName)
218         {
219             using (MemoryStream ms = RenderToExcel(table))
220             {
221                 SaveToFile(ms, fileName);
222             }
223         }
224 
225         /// <summary>
226         /// DataTable轉換成Excel文檔流,並輸出到客戶端
227         /// </summary>
228         /// <param name="table"></param>
229         /// <param name="response"></param>
230         /// <param name="fileName">輸出的文件名</param>
231         public static void RenderToExcel(DataTable table, HttpContext context, string fileName)
232         {
233             using (MemoryStream ms = RenderToExcel(table))
234             {
235                 RenderToBrowser(ms, context, fileName);
236             }
237         }
238 
239         /// <summary>
240         /// Excel文檔流是否有數據
241         /// </summary>
242         /// <param name="excelFileStream">Excel文檔流</param>
243         /// <returns></returns>
244         public static bool HasData(Stream excelFileStream)
245         {
246             return HasData(excelFileStream, 0);
247         }
248 
249         /// <summary>
250         /// Excel文檔流是否有數據
251         /// </summary>
252         /// <param name="excelFileStream">Excel文檔流</param>
253         /// <param name="sheetIndex">表索引號,如第一個表為0</param>
254         /// <returns></returns>
255         public static bool HasData(Stream excelFileStream, int sheetIndex)
256         {
257             using (excelFileStream)
258             {
259                 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
260                 {
261                     if (workbook.NumberOfSheets > 0)
262                     {
263                         if (sheetIndex < workbook.NumberOfSheets)
264                         {
265                             using (ISheet sheet = workbook.GetSheetAt(sheetIndex))
266                             {
267                                 return sheet.PhysicalNumberOfRows > 0;
268                             }
269                         }
270                     }
271                 }
272             }
273             return false;
274         }
275 
276         /// <summary>
277         /// Excel文檔流轉換成DataTable
278         /// 第一行必須為標題行
279         /// </summary>
280         /// <param name="excelFileStream">Excel文檔流</param>
281         /// <param name="sheetName">表名稱</param>
282         /// <returns></returns>
283         public static DataTable RenderFromExcel(Stream excelFileStream, string sheetName)
284         {
285             return RenderFromExcel(excelFileStream, sheetName, 0);
286         }
287 
288         /// <summary>
289         /// Excel文檔流轉換成DataTable
290         /// </summary>
291         /// <param name="excelFileStream">Excel文檔流</param>
292         /// <param name="sheetName">表名稱</param>
293         /// <param name="headerRowIndex">標題行索引號,如第一行為0</param>
294         /// <returns></returns>
295         public static DataTable RenderFromExcel(Stream excelFileStream, string sheetName, int headerRowIndex)
296         {
297             DataTable table = null;
298 
299             using (excelFileStream)
300             {
301                 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
302                 {
303                     using (ISheet sheet = workbook.GetSheet(sheetName))
304                     {
305                         table = RenderFromExcel(sheet, headerRowIndex);
306                     }
307                 }
308             }
309             return table;
310         }
311 
312         /// <summary>
313         /// Excel文檔流轉換成DataTable
314         /// 默認轉換Excel的第一個表
315         /// 第一行必須為標題行
316         /// </summary>
317         /// <param name="excelFileStream">Excel文檔流</param>
318         /// <returns></returns>
319         public static DataTable RenderFromExcel(Stream excelFileStream)
320         {
321             return RenderFromExcel(excelFileStream, 0, 0);
322         }
323 
324         /// <summary>
325         /// Excel文檔流轉換成DataTable
326         /// 第一行必須為標題行
327         /// </summary>
328         /// <param name="excelFileStream">Excel文檔流</param>
329         /// <param name="sheetIndex">表索引號,如第一個表為0</param>
330         /// <returns></returns>
331         public static DataTable RenderFromExcel(Stream excelFileStream, int sheetIndex)
332         {
333             return RenderFromExcel(excelFileStream, sheetIndex, 0);
334         }
335 
336         /// <summary>
337         /// Excel文檔流轉換成DataTable
338         /// </summary>
339         /// <param name="excelFileStream">Excel文檔流</param>
340         /// <param name="sheetIndex">表索引號,如第一個表為0</param>
341         /// <param name="headerRowIndex">標題行索引號,如第一行為0</param>
342         /// <returns></returns>
343         public static DataTable RenderFromExcel(Stream excelFileStream, int sheetIndex, int headerRowIndex)
344         {
345             DataTable table = null;
346 
347             using (excelFileStream)
348             {
349                 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
350                 {
351                     using (ISheet sheet = workbook.GetSheetAt(sheetIndex))
352                     {
353                         table = RenderFromExcel(sheet, headerRowIndex);
354                     }
355                 }
356             }
357             return table;
358         }
359 
360         /// <summary>
361         /// Excel表格轉換成DataTable
362         /// </summary>
363         /// <param name="sheet">表格</param>
364         /// <param name="headerRowIndex">標題行索引號,如第一行為0</param>
365         /// <returns></returns>
366         private static DataTable RenderFromExcel(ISheet sheet, int headerRowIndex)
367         {
368             DataTable table = new DataTable();
369 
370             IRow headerRow = sheet.GetRow(headerRowIndex);
371             int cellCount = headerRow.LastCellNum;//LastCellNum = PhysicalNumberOfCells
372             int rowCount = sheet.LastRowNum;//LastRowNum = PhysicalNumberOfRows - 1
373 
374             //handling header.
375             for (int i = headerRow.FirstCellNum; i < cellCount; i++)
376             {
377                 DataColumn column = new DataColumn(headerRow.GetCell(i).StringCellValue);
378                 table.Columns.Add(column);
379             }
380 
381             for (int i = (sheet.FirstRowNum + 1); i <= rowCount; i++)
382             {
383                 IRow row = sheet.GetRow(i);
384                 DataRow dataRow = table.NewRow();
385 
386                 if (row != null)
387                 {
388                     for (int j = row.FirstCellNum; j < cellCount; j++)
389                     {
390                         if (row.GetCell(j) != null)
391                             dataRow[j] = GetCellValue(row.GetCell(j));
392                     }
393                 }
394 
395                 table.Rows.Add(dataRow);
396             }
397 
398             return table;
399         }
400 
401         /// <summary>
402         /// Excel文檔導入到數據庫
403         /// 默認取Excel的第一個表
404         /// 第一行必須為標題行
405         /// </summary>
406         /// <param name="excelFileStream">Excel文檔流</param>
407         /// <param name="insertSql">插入語句</param>
408         /// <param name="dbAction">更新到數據庫的方法</param>
409         /// <returns></returns>
410         public static int RenderToDb(Stream excelFileStream, string insertSql, DBAction dbAction)
411         {
412             return RenderToDb(excelFileStream, insertSql, dbAction, 0, 0);
413         }
414 
415         public delegate int DBAction(string sql, params IDataParameter[] parameters);
416 
417         /// <summary>
418         /// Excel文檔導入到數據庫
419         /// </summary>
420         /// <param name="excelFileStream">Excel文檔流</param>
421         /// <param name="insertSql">插入語句</param>
422         /// <param name="dbAction">更新到數據庫的方法</param>
423         /// <param name="sheetIndex">表索引號,如第一個表為0</param>
424         /// <param name="headerRowIndex">標題行索引號,如第一行為0</param>
425         /// <returns></returns>
426         public static int RenderToDb(Stream excelFileStream, string insertSql, DBAction dbAction, int sheetIndex, int headerRowIndex)
427         {
428             int rowAffected = 0;
429             using (excelFileStream)
430             {
431                 using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
432                 {
433                     using (ISheet sheet = workbook.GetSheetAt(sheetIndex))
434                     {
435                         StringBuilder builder = new StringBuilder();
436 
437                         IRow headerRow = sheet.GetRow(headerRowIndex);
438                         int cellCount = headerRow.LastCellNum;//LastCellNum = PhysicalNumberOfCells
439                         int rowCount = sheet.LastRowNum;//LastRowNum = PhysicalNumberOfRows - 1
440 
441                         for (int i = (sheet.FirstRowNum + 1); i <= rowCount; i++)
442                         {
443                             IRow row = sheet.GetRow(i);
444                             if (row != null)
445                             {
446                                 builder.Append(insertSql);
447                                 builder.Append(" values (");
448                                 for (int j = row.FirstCellNum; j < cellCount; j++)
449                                 {
450                                     builder.AppendFormat("'{0}',", GetCellValue(row.GetCell(j)).Replace("'", "''"));
451                                 }
452                                 builder.Length = builder.Length - 1;
453                                 builder.Append(");");
454                             }
455 
456                             if ((i % 50 == 0 || i == rowCount) && builder.Length > 0)
457                             {
458                                 //每50條記錄一次批量插入到數據庫
459                                 rowAffected += dbAction(builder.ToString());
460                                 builder.Length = 0;
461                             }
462                         }
463                     }
464                 }
465             }
466             return rowAffected;
467         }
468     }
View Code

 

 代碼沒什么高級的地方,關鍵是看邏輯是否清晰,我這里需要優化的地方還很多。數據采集無非就是異步委托、多線程同步等等,就看你怎么靈活運用。

 

看了評論有很多需要源碼的,源碼分享於此:http://pan.baidu.com/s/1HagB8  密碼:g4uw

源碼還有很多不足的地方,可以看出,代碼也有很多冗余的,很多注釋都沒時間去清理,

希望可以在你們的手上做得更好,而不是下載源碼后做一個僵屍放到自己的硬盤里面。

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM