一個朋友問我能不能幫他做個小程序。抓取58上面包含"維修"的數據,比如公司名稱,電話號碼等等
打開58,搜索"維修"
單擊 房屋維修,進入一個列表頁面,
隨便單擊一個,進入詳細頁面
需要請求58服務器3次。然后匹配html元素獲取自己需要的信息,數據匹配自然少不了正則表達式,用過的都知道,
對於我來說,寫正則表達式是非常頭疼的事情,所以可以選擇第三方庫:比如HtmlAgilityPack,Jumony等等,我這里選擇的是Jumony
博客園有對Jumony入門的文章: http://www.cnblogs.com/Ivony/archive/2010/12/19/jumony-guide-1.html
jumony直接安裝在項目中:
首先:選擇需要添加的項目,單擊引用,然后選擇管理NuGet程序包,在必要的情況下,需要升級NuGet
其次:搜索Jumony安裝即可
先看看我實現的效果圖:因為公司比較忙,只能晚上回家寫寫,問題也是非常多。所以先記錄這兩天實現的效果
************************效果圖結束*****************************
主窗體:左側顯示的是在首頁匹配后的關鍵字。然后通過多線程去抓取每個列表頁面的信息。
看看我主窗體的布局
顯示數據的DataGridView是動態創建的。
來看看核心代碼:模擬請求58服務器,就要去觀察58的請求與響應,可以通過Fiddler2和Firebug抓包觀察
我根據我項目的需求封裝了一個HttpWebHelper類,
/// <summary>
/// HTTP helper for the 58.com scraper: holds the request settings, posts captcha answers
/// to the verification endpoint and downloads pages as text.
/// </summary>
class HttpWebHelper
{
    /// <summary>
    /// Browser control that hosts the captcha page.
    /// </summary>
    public static WebBrowser webBrowser { get; set; }

    /// <summary>
    /// Unique id required by the captcha check.
    /// </summary>
    public static string uuid { get; set; }

    /// <summary>
    /// Whether the most recent captcha submission was accepted.
    /// </summary>
    public static bool isPass { get; set; }

    /// <summary>
    /// Home-page keywords mapped to their urls.
    /// </summary>
    public static Dictionary<string, string> list;

    /// <summary>
    /// Prefix prepended to the urls being scraped.
    /// </summary>
    public static string prefix;

    /// <summary>
    /// Url of the page that displays the captcha.
    /// </summary>
    public string codeUrl { get; set; }

    /// <summary>
    /// Url of the home page to scrape.
    /// </summary>
    public string dataUrl { get; set; }

    /// <summary>
    /// Url the captcha answer is posted to.
    /// </summary>
    public static string verCode { get; set; }

    /// <summary>
    /// HTTP method used by <see cref="postVerCode"/>.
    /// </summary>
    public string Method { get; set; }

    /// <summary>
    /// Value of the Referer request header.
    /// </summary>
    public string Referer { get; set; }

    /// <summary>
    /// Value of the Host request header.
    /// </summary>
    public string Host { get; set; }

    /// <summary>
    /// Cookie container exposed to callers.
    /// </summary>
    public CookieContainer cookie { get; set; }

    /// <summary>
    /// Value of the Accept request header.
    /// </summary>
    public string Accept { get; set; }

    /// <summary>
    /// Value of the User-Agent request header.
    /// </summary>
    public string UserAgent { get; set; }

    /// <summary>
    /// Value of the Content-Type request header.
    /// </summary>
    public string ContentType { get; set; }

    /// <summary>
    /// Value of the Accept-Language request header.
    /// </summary>
    public string Accept_Language { get; set; }

    /// <summary>
    /// Encoding used to write the POST body and to decode response bodies.
    /// </summary>
    public Encoding encoding { get; set; }

    /// <summary>
    /// Image slot available to callers.
    /// </summary>
    public Image PictureBox { get; set; }

    /// <summary>
    /// Creates a helper pre-configured with the headers used for the captcha endpoint.
    /// </summary>
    public HttpWebHelper()
    {
        this.codeUrl = "http://support.58.com/firewall/valid/3071088800.do";

        Method = "post";
        Referer = "http://support.58.com/firewall/valid/3071088800.do";
        Host = "support.58.com";
        Accept_Language = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3";
        Accept = "*/*";
        ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0";
        encoding = Encoding.UTF8;
    }

    /// <summary>
    /// Creates a helper for the given home page. Chains to the default constructor so the
    /// headers and <see cref="encoding"/> are never left null.
    /// </summary>
    /// <param name="webBrowser">Not used by this constructor.</param>
    /// <param name="uuid">Not used by this constructor.</param>
    /// <param name="dataUrl">Url of the home page to scrape.</param>
    public HttpWebHelper(WebBrowser webBrowser, string uuid, string dataUrl)
        : this()
    {
        this.dataUrl = dataUrl;
    }

    /// <summary>
    /// Posts the captcha text together with the page-generated uuid to <see cref="verCode"/>
    /// and stores the outcome in <see cref="isPass"/> (true only when the server answers "1").
    /// Any failure is reported in a message box and leaves <see cref="isPass"/> false.
    /// </summary>
    /// <param name="code">Captcha text typed by the user.</param>
    /// <param name="uuid">Id taken from the captcha page.</param>
    public void postVerCode(string code, string uuid)
    {
        // Reset first so a failed request can never be mistaken for an earlier success.
        HttpWebHelper.isPass = false;
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(verCode);
            request.Method = Method;
            request.Referer = Referer;
            request.Headers.Add("X-Requested-With", "XMLHttpRequest");
            request.Host = Host;
            request.CookieContainer = new CookieContainer();
            request.Accept = Accept;
            request.ContentType = ContentType;
            request.Headers.Add("Accept-Language", Accept_Language);
            request.UserAgent = UserAgent;

            string parameter = string.Format(
                "inputcode={0}&namespace=infodetailweb&uuid={1}",
                HttpUtility.UrlEncode(code),
                HttpUtility.UrlEncode(uuid));

            // The Content-Type header declares UTF-8, so the body must not use the machine's ANSI code page.
            Encoding bodyEncoding = encoding ?? Encoding.UTF8;
            byte[] buffer = bodyEncoding.GetBytes(parameter);

            // The request stream has to be closed before the response is requested.
            using (Stream reqStr = request.GetRequestStream())
            {
                reqStr.Write(buffer, 0, buffer.Length);
            }

            string result;
            using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response1.GetResponseStream(), bodyEncoding))
            {
                result = reader.ReadToEnd().Trim();
            }

            HttpWebHelper.isPass = result == "1";
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.ToString());
        }
    }

    /// <summary>
    /// Downloads a page with a GET request and returns its trimmed body.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The page text, or an empty string when the request fails (the error is shown in a message box).</returns>
    public string webClient(string url)
    {
        string html = string.Empty;
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "get";

            using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response1.GetResponseStream(), encoding ?? Encoding.UTF8))
            {
                html = reader.ReadToEnd().Trim();
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.StackTrace);
        }
        return html;
    }
}
模擬請求類有了。接下來就是在返回的html中抓取關鍵字,這里是匹配包含 "維修" 的a 標簽
我封裝了一個方法,根據url和關鍵字抓取數據后。直接給窗體的控件listBoxMenu綁定數據
/// <summary>
/// Downloads the home page, collects every link whose text contains the keyword, lists the
/// matches in <c>listBoxMenu</c>, publishes them through <see cref="HttpWebHelper"/> and starts
/// one named worker thread per match.
/// </summary>
/// <param name="url">Home page to scrape; also stored as the url prefix for the workers.</param>
/// <param name="keyword">Text a link must contain to be collected.</param>
private void ProcessDownload(string url, string keyword)
{
    // This method runs off the UI thread, so every control access goes through Invoke.
    this.Invoke(new Action(() => { richTextBoxInfo.AppendText(url + "開始下載中......\n"); }));

    // Fetch the home page and pick the anchors that mention the keyword.
    string html;
    using (WebClient client = new WebClient())
    {
        html = client.DownloadString(url);
    }
    IHtmlDocument document = new JumonyParser().Parse(html);
    IEnumerable<IHtmlElement> result = document.Find("a").Where(t => t.InnerText().Contains(keyword));

    Dictionary<string, string> dir = new Dictionary<string, string>();
    foreach (var item in result)
    {
        var href = item.Attribute("href").Value();
        var text = item.InnerText();

        // A link without a target gives the workers nothing to download.
        if (string.IsNullOrEmpty(href))
            continue;

        // The dictionary is keyed by link text, so that is what must be checked;
        // the first url seen for a given text wins.
        if (!dir.ContainsKey(text))
            dir.Add(text, href);
    }

    // Fill the menu on the left.
    this.Invoke(new Action(() =>
    {
        foreach (var item in dir)
        {
            listBoxMenu.Items.Add(item.Key);
        }
    }));

    // Share the matches and the prefix with the worker threads.
    HttpWebHelper.list = dir;
    HttpWebHelper.prefix = url;

    try
    {
        foreach (var item in dir)
        {
            // Dedicated threads instead of the thread pool so each worker can carry the keyword as its name.
            Thread thread = new Thread(ThreadDownload);
            thread.Name = item.Key;
            thread.Start(item.Key + "," + item.Value);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.StackTrace);
    }
}
這個void ProcessDownload(string url, string keyword)有幾點注意。這個方法是異步調用的。所以在這里給窗體的控件賦值,就屬於跨線程操作UI,因為UI是在主線程中創建和繪制的
有關跨線程問題可以看此篇博文:http://www.cnblogs.com/nsky/p/4436309.html
可以看到里面是有用到線程池的 :ThreadPool,后來被我注釋了。因為我需要給線程命名。但線程池我沒找到此方法。是不是沒有呢?
在ProcessDownload方法里面。當首頁關鍵字匹配后,根據匹配的個數,開啟多線程執行詳細頁面抓取,首頁的關鍵字我保存在了字典里面
Dictionary<string, string> dir = new Dictionary<string, string>(); 分別用關鍵字和關鍵字對應的url來存取key-value。在HttpWebHelper類中。我也定義了static
try
{
    // One dedicated thread per matched keyword: the thread is named after the keyword
    // and receives "keyword,url" as its start argument.
    foreach (var item in dir)
    {
        //ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadHtml), item.Key);
        Thread thread = new Thread(ThreadDownload);
        thread.Name = item.Key;
        thread.Start(item.Key + "," + item.Value);
    }
}
catch (Exception ex)
{
    MessageBox.Show(ex.StackTrace);
}
這里把key-value傳值給ThreadDownload。
了解多線程可以看博文:http://www.cnblogs.com/nsky/p/4425286.html
首頁抓取關鍵字的方法有了。那還缺一個什么方法?還需要一個抓取顯示列表的頁面,這里取名為:ThreadDownload方法
/// <summary>
/// Worker-thread entry point: downloads the list page for one keyword, visits the detail page
/// of every row that has a link, collects the contact data into a table shown on the keyword's
/// tab, and finally exports the table to "&lt;keyword&gt;.xls".
/// </summary>
/// <param name="obj">Text of the form "keyword,url" built by the caller that starts the thread.</param>
private void ThreadDownload(object obj)
{
    // 58 limits the request rate, so the workers are serialized.
    Monitor.Enter(this);
    try
    {
        // Split on the first comma only so a comma inside the url does not truncate it.
        string[] ob = obj.ToString().Split(new char[] { ',' }, 2);
        this.Invoke(
            new Action(() => { richTextBoxInfo.AppendText(string.Format("正在抓取:{0}\n", ob[0])); })
        );
        string prefix = HttpWebHelper.prefix;

        HttpWebHelper client = new HttpWebHelper();
        client.encoding = Encoding.UTF8;

        DataTable dt = new DataTable();
        dt.Columns.Add("公司名字", typeof(string));
        dt.Columns.Add("聯系人", typeof(string));
        dt.Columns.Add("聯系電話", typeof(string));

        // Download the list page for this keyword.
        string fullurl = string.Format("{0}{1}", prefix, ob[1]);
        string html = client.webClient(fullurl);

        IHtmlDocument document = new JumonyParser().Parse(html);
        IEnumerable<IHtmlElement> result = document.Find("table[id=jingzhun]");
        var items = result.Find("tr");

        // The thread was named after the keyword; the name doubles as the tab key.
        string name = Thread.CurrentThread.Name;

        foreach (var o in items)
        {
            if (o.Find("a").Count() == 0)
                continue;

            // First link of the row leads to the detail page.
            string referer = o.FindFirst("a").Attribute("href").Value();

            // Wait 5 seconds between detail requests to stay under the rate limit.
            Thread.Sleep(5000);

            // When requests are too frequent the server answers with the captcha page instead.
            string sonHtml = client.webClient(referer);

            if (sonHtml.Contains("驗證碼"))
            {
                // Request the page again to learn where the server redirected to.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer);
                request.Method = "get";
                string responseUrl = string.Empty;
                string rediect = string.Empty;
                using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
                {
                    // e.g. "http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml"
                    responseUrl = response1.ResponseUri.ToString();

                    // e.g. "/firewall/valid/1032910901.do"
                    string absolutePath = response1.ResponseUri.AbsolutePath;

                    // Endpoint the captcha answer has to be posted to.
                    HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath;

                    string query = response1.ResponseUri.Query;

                    // Everything after the last '=' is the page to load once the captcha passed.
                    rediect = query.Substring(query.LastIndexOf("=") + 1);
                }

                // The captcha dialog has to be shown from the UI thread.
                this.Invoke(new Action(() =>
                {
                    showCode code = new showCode();
                    code.codeHandler = new HttpWebHelper().postVerCode;
                    code.showCodeUrl = responseUrl;
                    if (code.ShowDialog() == DialogResult.OK)
                    {
                        code.Hide();
                        if (HttpWebHelper.isPass)
                        {
                            sonHtml = client.webClient(rediect);

                            getTable(sonHtml, ref dt);
                        }
                    }
                }));
            }
            else
                getTable(sonHtml, ref dt);

            this.Invoke(new Action(() =>
            {
                // Create the tab for this keyword on first use.
                if (!tabControlWarp.TabPages.ContainsKey(name))
                    tabControlWarp.TabPages.Add(name, name);

                TabPage page = tabControlWarp.TabPages[name];

                // Reuse the grid that is already on the tab; creating a new one per row
                // would pile up hidden controls on the page.
                DataGridView view = page.Controls.OfType<DataGridView>().FirstOrDefault();
                if (view == null)
                {
                    view = new DataGridView();
                    view.AllowUserToAddRows = false;
                    view.AllowUserToDeleteRows = false;
                    view.AllowUserToResizeColumns = false;
                    view.AllowUserToResizeRows = false;
                    view.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.Fill;
                    view.ColumnHeadersHeightSizeMode = DataGridViewColumnHeadersHeightSizeMode.AutoSize;
                    view.MultiSelect = false;
                    view.ReadOnly = true;
                    view.RowHeadersVisible = false;
                    view.BackgroundColor = Color.White;
                    view.ScrollBars = ScrollBars.Vertical;
                    view.SelectionMode = DataGridViewSelectionMode.FullRowSelect;
                    view.Dock = DockStyle.Fill;
                    page.Controls.Add(view);
                }

                // Rebind so the rows added since the last pass are displayed.
                view.DataSource = null;
                view.DataSource = dt;

                // Repaint the form so the grid shows the new data.
                this.Refresh();
            }));
        }

        // All rows handled: export what was collected for this keyword.
        ExcelRender.ExcelRender.RenderToExcel(dt, ob[0] + ".xls");
    }
    finally
    {
        // Always release the lock, otherwise one failing worker blocks every other worker forever.
        Monitor.Exit(this);
    }
}
這個地方有一個難點就是,如果你采集的頻率過高,58會跳轉到一個驗證碼登錄頁面。這里本來是用多線程執行異步任務,
但:比如同時在執行采集 "手機維修"和"電腦維修"的時候。只要"手機維修"遇到驗證碼的時候,顯然"電腦維修"也會遇到。會有很多不確定的因素,
因為是多線程異步操作,當我彈窗讓用戶輸入驗證碼的代碼,同樣會執行多次。
所以采取了線程同步 。我用了 Monitor.Enter(this);實現同步。當然你可以用更簡單的lock關鍵字可以實現同樣的效果。
說到驗證碼。58算是下了大功夫,都知道58信息量的巨大。采集的人肯定多。58驗證碼的機制是。當跳轉到驗證碼登錄頁面,
頁面會生成唯一一個uuid,和一個驗證碼post到服務器的url和顯示驗證碼有相關聯的信息,下面會說明
從圖片中可以看出來,顯示驗證碼中的url和post到服務器中的url都包含 1032910901。這是重點,當你提交驗證碼的時候,服務器會驗證 這個 數字 和uuid如果不匹配則驗證錯誤。
你要記住:這個數字和uuid每次都是不同的。
那我這里是怎么顯示驗證碼的呢?
首先我是用最普通也是最大眾的方式。
用HttpWebRequest讀取,其實當HttpWebRequest讀取的時候,服務器的驗證碼已經變了。
當跳轉到驗證碼登錄頁面。服務器就已經記住了uuid,url中的數字 和驗證碼,當你用HttpWebRequest去獲取驗證碼肯定
和之前的驗證碼不同。
除了這種方式,網上也提到了好幾種方式,這里驗證成功后,有一個回調方法
可以通過HttpWebResponse獲取響應請求的url。比如
// Request the detail page again and inspect the url the response finally came from.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer);
request.Method = "get";
string responseUrl = string.Empty;
string rediect = string.Empty;
using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
{
    // e.g. "http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml"
    responseUrl = response1.ResponseUri.ToString();

    // Path part of the response url, e.g. "/firewall/valid/1032910901.do"
    string absolutePath = response1.ResponseUri.AbsolutePath;

    // ResponseUri.Authority is e.g. "support.58.com"; together with the path it forms the url the captcha is posted to.
    HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath;

    // Query string of the response url (the part starting at '?').
    string query = response1.ResponseUri.Query;

    // Text after the last '=': the url to load once the captcha has been accepted.
    rediect = query.Substring(query.LastIndexOf("=") + 1);
}
第一種:頁面在WebBrowser中打開。讀取驗證碼圖片流。保存在剪切板中
/// <summary>
/// Returns the picture shown by an IMG element of a WebBrowser by copying the element to the
/// clipboard and reading the image back.
/// </summary>
/// <remarks>
/// The result is null when the copy produced no image, e.g. because the page script had not
/// yet assigned the picture when this method ran.
/// </remarks>
/// <param name="WebCtl">WebBrowser control that hosts the page.</param>
/// <param name="ImgeTag">IMG element whose picture is wanted.</param>
/// <returns>The copied image, or null when the clipboard held no image after the copy.</returns>
private Image GetWebImage(WebBrowser WebCtl, HtmlElement ImgeTag)
{
    HTMLDocument doc = (HTMLDocument)WebCtl.Document.DomDocument;
    HTMLBody body = (HTMLBody)doc.body;
    IHTMLControlRange rang = (IHTMLControlRange)body.createControlRange();
    IHTMLControlElement Img = (IHTMLControlElement)ImgeTag.DomElement;

    // Remember the image that was on the clipboard (if any) so it can be put back.
    Image oldImage = Clipboard.GetImage();

    rang.add(Img);
    rang.execCommand("Copy", false, null); // copy the element to the clipboard
    Image numImage = Clipboard.GetImage();

    try
    {
        // Clipboard.SetImage rejects null, so only restore when there was an image before.
        if (oldImage != null)
        {
            Clipboard.SetImage(oldImage);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
    return numImage;
}
調用代碼:
// Locate the captcha image element by its id.
HtmlElement ImgeTag = webBrowser1.Document.GetElementById("imgCode");

Image numPic = GetWebImage(webBrowser1, ImgeTag); // captcha picture taken from the page
pictureBox1.Image = numPic; // show it in the picture box
HTMLDocument需要添加引用:F:\Program Files (x86)\Microsoft Visual Studio 12.0\Visual Studio Tools for Office\PIA\Common\Microsoft.mshtml.dll
引入命名空間:using mshtml;
顯然。頁面必須加載完成后才能獲取到圖片。即在事件中webBrowser1_DocumentCompleted獲取。但它卻不能判斷js腳本什么時候完成。
如果是多線程異步任務,還需要webBrowser1_DocumentCompleted執行后,在執行后面的方法,因為webBrowser1_DocumentCompleted本身就是異步的
此時的解決方案是 利用AutoResetEvent阻止線程,等當前線程執行完畢
// Created unsignaled.
AutoResetEvent waitHandle = new AutoResetEvent(false);
// Check the handle every 10 ms; while it is still unsignaled keep processing pending window messages.
while (waitHandle.WaitOne(10, false) == false)
{
    Application.DoEvents();
}
第二種:抓圖。根據圖片的高寬來剪切
首先動態創建WebBrowser,並注冊事件
// Create the browser in code, point it at the captcha page and subscribe to DocumentCompleted.
WebBrowser we = new WebBrowser();
we.Url = new Uri("http://support.58.com/firewall/valid/3071088800.do");
we.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(we_DocumentCompleted);
/// <summary>
/// Handles DocumentCompleted of the code-created browser: moves the element with id "imgCode"
/// to the top-left corner of the page, draws that area of the browser into a bitmap and shows
/// it in <c>pictureBox1</c>.
/// </summary>
/// <param name="sender">Event source (not used; the handler works on the <c>we</c> browser).</param>
/// <param name="e">Event data (not used).</param>
void we_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    foreach (HtmlElement item in we.Document.All)
    {
        if (item.Id != "imgCode")
            continue;

        // Pin the picture to the origin so it occupies the rectangle captured below.
        item.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px";

        Size area = item.ClientRectangle.Size;

        // A Bitmap cannot be created with a zero dimension; skip the capture until the picture has a size.
        if (area.Width > 0 && area.Height > 0)
        {
            var b = new Bitmap(area.Width, area.Height);
            we.DrawToBitmap(b, new Rectangle(new Point(), area));
            pictureBox1.Image = b;
        }
        break;
    }
}
第二種有個注意的地方:WebBrowser必須動態創建但不能依附於窗體上,即不將WebBrowser加載到窗體,否則截取后的圖片是顯示白色的。我也不知道什么原因
第3種:是根據第二種演化而來的,也是我當前用的。感覺有些投機取巧
你可以到顯示驗證碼頁面查看驗證碼圖片的大小,也就是高度和寬度,然后新建一個顯示驗證碼的窗體,我這里取名為showCode
在showCode上放一個webBrowser,高度和寬度設置為驗證碼圖片的高度和寬度。比如:
AllowWebBrowserDrop=false //禁止把文件拖放到控件中打開
ScrollBarsEnabled = false //取消滾動條
size = 120,40 驗證碼圖片的寬度和高度
然后找到webbrowser中的圖片。設置樣式。使其顯示在最左上角
// Absolute position at top 0 / left 0 with a high z-index: the image sits in the top-left corner above other content.
img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px";
窗體布局:
核心代碼
/// <summary>
/// Dialog that shows the captcha page in an embedded browser, lets the user type the code and
/// passes the code plus the page's uuid to <see cref="codeHandler"/>.
/// </summary>
public partial class showCode : Form
{
    /// <summary>Image slot available to the caller.</summary>
    public Image p { get; set; }

    /// <summary>Url of the captcha page to display.</summary>
    public string showCodeUrl { get; set; }

    /// <summary>Signature of the callback that receives the typed code and the page uuid.</summary>
    public delegate void delegateCode(string code, string uuid);

    /// <summary>Callback invoked when the user confirms a code.</summary>
    public delegateCode codeHandler;

    public showCode()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Once the page is loaded, moves the captcha image to the top-left corner of the browser
    /// and replaces its click handler so clicking no longer refreshes the captcha.
    /// </summary>
    void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        WebBrowser bro = (WebBrowser)sender;
        if (bro.Document == null)
            return;

        // The element is absent when the loaded page is not the captcha page.
        HtmlElement img = bro.Document.GetElementById("imgCode");
        if (img == null)
            return;

        img.Style = "position: absolute; z-index: 9999; top: 0px; left: 0px";
        img.SetAttribute("onclick", "javascript:void(0)");
    }

    /// <summary>
    /// Validates the input, reads the uuid from the page and hands both to the callback.
    /// </summary>
    private void btnOk_Click(object sender, EventArgs e)
    {
        string code = textCode.Text;
        if (string.IsNullOrEmpty(code))
        {
            MessageBox.Show("請輸入驗證碼", "驗證碼", MessageBoxButtons.OK, MessageBoxIcon.Information);
            textCode.Focus();
            return;
        }
        if (codeHandler != null)
        {
            // Without the uuid element the submission cannot succeed, so keep the dialog open.
            HtmlElement uuidElement = webBrowser.Document == null
                ? null
                : webBrowser.Document.GetElementById("uuid");
            if (uuidElement == null)
            {
                MessageBox.Show("驗證碼頁面尚未加載完成,請稍后再試", "驗證碼", MessageBoxButtons.OK, MessageBoxIcon.Information);
                return;
            }
            string uuid = uuidElement.GetAttribute("value");

            this.DialogResult = DialogResult.OK;
            codeHandler(code, uuid);
        }
    }

    /// <summary>
    /// Attaches the completion handler and navigates the browser to <see cref="showCodeUrl"/>.
    /// </summary>
    private void showCode_Load(object sender, EventArgs e)
    {
        // Subscribe before navigating so the completion event cannot be missed.
        this.webBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted);
        webBrowser.Url = new Uri(showCodeUrl);
    }
}
我這里定義了一個委托。利用回調機制,把驗證碼和uuid傳給主窗體,這里顯示驗證碼的url由主窗體傳進來。
當遇到驗證碼的時候,就會彈窗,如果能做到自動識別就更好了。
當由列表頁面抓取詳細頁面的時候,返回的html就是驗證碼頁面的源碼,這時候判斷html中是否包含“驗證碼”關鍵字,
包含的話。則實例化窗口。把顯示驗證碼的url傳給顯示驗證碼的窗體,並顯示。
showCode code = new showCode();
code.codeHandler = new HttpWebHelper().postVerCode; // callback the dialog invokes with the typed code and uuid
code.showCodeUrl = responseUrl; // url of the captcha page the dialog displays
// Wait 5 seconds so the scraping frequency stays low; tune the delay to the environment.
Thread.Sleep(5000);

// Download the detail page. When requests are too frequent the captcha page comes back instead.
string sonHtml = client.webClient(referer);

if (sonHtml.Contains("驗證碼"))
{
    // This part could be extracted into its own method.
    /*
     * After hitting the captcha the page is requested once more to collect
     * what is needed afterwards, in particular the url to return to once the
     * captcha has been accepted (the rediect variable below).
     */
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(referer);
    request.Method = "get";
    string responseUrl = string.Empty;
    string rediect = string.Empty;
    using (HttpWebResponse response1 = (HttpWebResponse)request.GetResponse())
    {
        // e.g. "http://support.58.com/firewall/valid/1903444021.do?namespace=infodetailweb&url=http://sz.58.com/shoujiweixiu/21147587557513x.shtml"
        responseUrl = response1.ResponseUri.ToString();

        // Path part of the response url, e.g. "/firewall/valid/1032910901.do"
        string absolutePath = response1.ResponseUri.AbsolutePath;

        // ResponseUri.Authority is e.g. "support.58.com"; combined with the path it is the full url the captcha is posted to.
        HttpWebHelper.verCode = "http://" + response1.ResponseUri.Authority + absolutePath;

        // Query string of the response url.
        string query = response1.ResponseUri.Query;

        // Text after the last '=': the url to load after the captcha has been accepted.
        rediect = query.Substring(query.LastIndexOf("=") + 1);
    }

    // Show the captcha dialog through Invoke on the form.
    this.Invoke(new Action(() =>
    {
        showCode code = new showCode();
        code.codeHandler = new HttpWebHelper().postVerCode; // callback invoked by the dialog
        code.showCodeUrl = responseUrl; // captcha page shown by the dialog
        if (code.ShowDialog() == DialogResult.OK)
        {
            code.Hide();
            if (HttpWebHelper.isPass) // the captcha was accepted
            {
                sonHtml = client.webClient(rediect);
                getTable(sonHtml, ref dt);
            }
        }
    }));
}
好了。現在回到之前的問題上。現在需要抓取詳細頁面的數據,上面說了ThreadDownload只是抓取列表頁面。
現在定義一個方法DataTable getTable(string document, ref DataTable dt),這里的dt是ref類型。是之前需要用的。好像現在已經用不上了。大家可以根據自己的要求修改
getTable方法是接收傳來的詳細頁面。然后匹配信息:比如:用戶名,手機號碼,公司名稱
/// <summary>
/// Parses a detail page and appends one row (company, contact person, phone) to the table.
/// Fields that cannot be located keep the placeholder "未知".
/// </summary>
/// <param name="document">Html of the detail page.</param>
/// <param name="dt">Table with the columns "公司名字", "聯系人" and "聯系電話" that receives the row.</param>
/// <returns>The same table after the row was added, or null when parsing threw an exception (no row is added then).</returns>
private DataTable getTable(string document, ref DataTable dt)
{
    try
    {
        IHtmlDocument hd = new JumonyParser().Parse(document);

        string company = "未知";
        string phone = "未知";
        string linkman = "未知";

        // The contact block; its first title tells whether the poster is a company or an individual.
        var su = hd.Find("ul[class=suUl]");

        // Only look inside the block when it exists, so a page without it still yields a placeholder row.
        if (su.Count() > 0)
        {
            // Re-parse the block on its own so the selectors below are relative to it.
            IHtmlDocument top = new JumonyParser().Parse(hd.FindFirst("ul[class=suUl]").InnerHtml());

            if (top.Find("div[class=su_tit]").Count() > 0)
            {
                string txt = top.FindFirst("div[class=su_tit]").InnerText();
                if (txt.Contains("公司名稱"))
                {
                    if (top.Find("div[class=su_con]").Count() > 0)
                    {
                        // First whitespace-separated token of the content cell; keep the placeholder when the cell is blank.
                        string[] parts = top.FindFirst("div[class=su_con]").InnerText()
                            .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                        if (parts.Length > 0)
                            company = parts[0];
                    }
                    // Guard the same item that is read: the contact person sits in the second list item.
                    if (top.Find("li:nth-child(2)").Count() > 0)
                        linkman = top.FindFirst("li:nth-child(2)").FindFirst("div[class=su_con]").FindFirst("a").InnerText();
                    if (top.Find("span[class=l_phone]").Count() > 0)
                        phone = top.FindFirst("span[class=l_phone]").InnerText();
                }
                else if (txt.Contains("聯系人"))
                {
                    if (top.Find("li:nth-child(1)").Count() > 0)
                        linkman = top.FindFirst("li:nth-child(1)").FindFirst("div[class=su_con]").InnerText();
                    if (top.Find("li:nth-child(2)").Count() > 0)
                        phone = top.FindFirst("li:nth-child(2)").FindFirst("span[id=t_phone]").InnerText();
                }
            }
        }

        DataRow row = dt.NewRow();
        row["公司名字"] = company;
        row["聯系電話"] = phone;
        row["聯系人"] = linkman;

        dt.Rows.Add(row);

        return dt;
    }
    catch (Exception)
    {
        // Best effort: a page that cannot be parsed is skipped and reported to the caller as null.
        return null;
    }
}
來看看入口函數,開啟異步調用。顯然是不讓窗體假死
/// <summary>
/// Starts scraping: validates the url and keyword boxes, then runs
/// <c>ProcessDownload</c> asynchronously so the form stays responsive.
/// </summary>
/// <param name="sender">The start button.</param>
/// <param name="e">Event data (not used).</param>
void btnStart_Click(object sender, EventArgs e)
{
    string url = textBoxUrl.Text;
    string keyword = textBoxKeyword.Text;

    if (string.IsNullOrEmpty(url))
    {
        MessageBox.Show("請輸入要抓取的網址", "網址", MessageBoxButtons.OK, MessageBoxIcon.Information);
        textBoxUrl.Focus();
        return;
    }
    else if (string.IsNullOrEmpty(keyword))
    {
        MessageBox.Show("請輸入要抓取的關鍵字", "關鍵字", MessageBoxButtons.OK, MessageBoxIcon.Information);
        textBoxKeyword.Focus();
        return;
    }

    // Disable only after validation passed; otherwise a rejected input would leave the button dead.
    btnStart.Enabled = false;

    // Delegate that performs the download off the UI thread.
    Action downloadAction = new Action(() =>
    {
        ProcessDownload(url, keyword);
    });

    // Runs when the download delegate has finished.
    AsyncCallback callback = new AsyncCallback((asyncResult) =>
    {
        try
        {
            // EndInvoke completes the asynchronous call and rethrows anything ProcessDownload threw.
            downloadAction.EndInvoke(asyncResult);
        }
        catch (Exception ex)
        {
            this.Invoke(new Action(() =>
            {
                richTextBoxInfo.AppendText("下載失敗:" + ex.Message + "\n");
                // Nothing was started, so let the user try again.
                btnStart.Enabled = true;
            }));
            return;
        }

        this.Invoke(
            new Action(() =>
            {
                richTextBoxInfo.AppendText("首頁關鍵字匹配完成,顯示在左側列表中.....\n");
            })
        );
    });

    downloadAction.BeginInvoke(callback, null);
}
其余代碼
/// <summary>
/// Asks for confirmation when the main window is about to close.
/// Answering "No" cancels the close; any other answer terminates the process.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Closing event data; <c>Cancel</c> is set when the user declines.</param>
void Main_FormClosing(object sender, FormClosingEventArgs e)
{
    DialogResult answer = MessageBox.Show("是否退出當前程序", "關閉", MessageBoxButtons.YesNo, MessageBoxIcon.Question);

    if (answer != DialogResult.No)
    {
        // Ends the whole process, which also stops all running threads.
        Environment.Exit(0);
    }
    else
    {
        e.Cancel = true;
    }
}

/// <summary>
/// Handles a click on the left-hand menu list: if a tab page keyed by the clicked
/// item's text exists, that tab becomes the selected one.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Mouse event data (unused).</param>
void listBoxMenu_MouseClick(object sender, MouseEventArgs e)
{
    string key = listBoxMenu.Text;
    bool pageExists = tabControlWarp.TabPages.ContainsKey(key);

    if (!pageExists || string.IsNullOrEmpty(key))
    {
        return;
    }

    // Select the existing tab page.
    tabControlWarp.SelectedTab = tabControlWarp.TabPages[key];
}
項目中用到了NPOI導出excel,這里附上相關幫助類

/// <summary>
/// Helpers built on NPOI's HSSF classes for converting between Excel workbooks,
/// <see cref="DataTable"/> / <see cref="IDataReader"/> data and a database.
/// </summary>
public class ExcelRender
{
    /// <summary>
    /// Returns the textual value of a cell according to its cell type.
    /// </summary>
    /// <param name="cell">Excel cell; may be <c>null</c>.</param>
    /// <returns>The cell text, or an empty string for a null or blank cell.</returns>
    private static string GetCellValue(ICell cell)
    {
        if (cell == null)
        {
            return string.Empty;
        }

        switch (cell.CellType)
        {
            case CellType.BLANK:
                return string.Empty;
            case CellType.BOOLEAN:
                return cell.BooleanCellValue.ToString();
            case CellType.ERROR:
                return cell.ErrorCellValue.ToString();
            case CellType.STRING:
                return cell.StringCellValue;
            case CellType.FORMULA:
                try
                {
                    // Evaluate the formula in place, then read the cell's text.
                    HSSFFormulaEvaluator e = new HSSFFormulaEvaluator(cell.Sheet.Workbook);
                    e.EvaluateInCell(cell);
                    return cell.ToString();
                }
                catch
                {
                    // Evaluation failed; fall back to the numeric value of the cell.
                    return cell.NumericCellValue.ToString();
                }
            case CellType.NUMERIC:
            case CellType.Unknown:
            default:
                // This is a trick to get the correct value of the cell. NumericCellValue will
                // return a numeric value no matter the cell value is a date or a number.
                return cell.ToString();
        }
    }

    /// <summary>
    /// Auto-sizes every column that appears in the first row of the sheet.
    /// </summary>
    /// <param name="sheet">Excel sheet.</param>
    private static void AutoSizeColumns(ISheet sheet)
    {
        if (sheet.PhysicalNumberOfRows <= 0)
        {
            return;
        }

        IRow headerRow = sheet.GetRow(0);
        if (headerRow == null)
        {
            // Rows exist, but not at index 0; there is no header to size by.
            return;
        }

        for (int i = 0, l = headerRow.LastCellNum; i < l; i++)
        {
            sheet.AutoSizeColumn(i);
        }
    }

    /// <summary>
    /// Saves an Excel document stream to a file, overwriting an existing file.
    /// </summary>
    /// <param name="ms">Excel document stream.</param>
    /// <param name="fileName">Target file name.</param>
    private static void SaveToFile(MemoryStream ms, string fileName)
    {
        using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
        {
            // WriteTo copies the whole stream content without an intermediate array.
            ms.WriteTo(fs);
            fs.Flush();
        }
    }

    /// <summary>
    /// Writes an Excel document stream to the HTTP response as an attachment.
    /// </summary>
    /// <param name="ms">Excel document stream.</param>
    /// <param name="context">HTTP context.</param>
    /// <param name="fileName">File name offered to the client.</param>
    private static void RenderToBrowser(MemoryStream ms, HttpContext context, string fileName)
    {
        // The file name is URL-encoded only when the browser reports itself as "IE".
        if (context.Request.Browser.Browser == "IE")
        {
            fileName = HttpUtility.UrlEncode(fileName);
        }

        context.Response.AddHeader("Content-Disposition", "attachment;fileName=" + fileName);
        context.Response.BinaryWrite(ms.ToArray());
    }

    /// <summary>
    /// Converts a data reader into an Excel document stream. The reader is consumed
    /// and disposed by this call.
    /// </summary>
    /// <param name="reader">Data reader positioned before the first record.</param>
    /// <returns>A stream positioned at 0 that contains the workbook.</returns>
    public static MemoryStream RenderToExcel(IDataReader reader)
    {
        MemoryStream ms = new MemoryStream();

        using (reader)
        {
            using (IWorkbook workbook = new HSSFWorkbook())
            {
                using (ISheet sheet = workbook.CreateSheet())
                {
                    IRow headerRow = sheet.CreateRow(0);
                    int cellCount = reader.FieldCount;

                    // Header row: one cell per field name.
                    for (int i = 0; i < cellCount; i++)
                    {
                        headerRow.CreateCell(i).SetCellValue(reader.GetName(i));
                    }

                    // Data rows.
                    int rowIndex = 1;
                    while (reader.Read())
                    {
                        IRow dataRow = sheet.CreateRow(rowIndex);

                        for (int i = 0; i < cellCount; i++)
                        {
                            dataRow.CreateCell(i).SetCellValue(reader[i].ToString());
                        }

                        rowIndex++;
                    }

                    AutoSizeColumns(sheet);

                    workbook.Write(ms);
                    ms.Flush();
                    ms.Position = 0;
                }
            }
        }
        return ms;
    }

    /// <summary>
    /// Converts a data reader into an Excel document and saves it to a file.
    /// </summary>
    /// <param name="reader">Data reader; consumed and disposed by this call.</param>
    /// <param name="fileName">Path of the file to write.</param>
    public static void RenderToExcel(IDataReader reader, string fileName)
    {
        using (MemoryStream ms = RenderToExcel(reader))
        {
            SaveToFile(ms, fileName);
        }
    }

    /// <summary>
    /// Converts a data reader into an Excel document and sends it to the client.
    /// </summary>
    /// <param name="reader">Data reader; consumed and disposed by this call.</param>
    /// <param name="context">HTTP context.</param>
    /// <param name="fileName">File name offered to the client.</param>
    public static void RenderToExcel(IDataReader reader, HttpContext context, string fileName)
    {
        using (MemoryStream ms = RenderToExcel(reader))
        {
            RenderToBrowser(ms, context, fileName);
        }
    }

    /// <summary>
    /// Converts a <see cref="DataTable"/> into an Excel document stream.
    /// The table is only read; it is not disposed, because the caller owns it.
    /// </summary>
    /// <param name="table">Source table.</param>
    /// <returns>A stream positioned at 0 that contains the workbook.</returns>
    public static MemoryStream RenderToExcel(DataTable table)
    {
        MemoryStream ms = new MemoryStream();

        using (IWorkbook workbook = new HSSFWorkbook())
        {
            using (ISheet sheet = workbook.CreateSheet())
            {
                IRow headerRow = sheet.CreateRow(0);

                // Header row. If Caption is not set, it returns the ColumnName value.
                foreach (DataColumn column in table.Columns)
                {
                    headerRow.CreateCell(column.Ordinal).SetCellValue(column.Caption);
                }

                // Data rows.
                int rowIndex = 1;
                foreach (DataRow row in table.Rows)
                {
                    IRow dataRow = sheet.CreateRow(rowIndex);

                    foreach (DataColumn column in table.Columns)
                    {
                        dataRow.CreateCell(column.Ordinal).SetCellValue(row[column].ToString());
                    }

                    rowIndex++;
                }

                AutoSizeColumns(sheet);

                workbook.Write(ms);
                ms.Flush();
                ms.Position = 0;
            }
        }
        return ms;
    }

    /// <summary>
    /// Converts a <see cref="DataTable"/> into an Excel document and saves it to a file.
    /// </summary>
    /// <param name="table">Source table.</param>
    /// <param name="fileName">Path of the file to write.</param>
    public static void RenderToExcel(DataTable table, string fileName)
    {
        using (MemoryStream ms = RenderToExcel(table))
        {
            SaveToFile(ms, fileName);
        }
    }

    /// <summary>
    /// Converts a <see cref="DataTable"/> into an Excel document and sends it to the client.
    /// </summary>
    /// <param name="table">Source table.</param>
    /// <param name="context">HTTP context.</param>
    /// <param name="fileName">File name offered to the client.</param>
    public static void RenderToExcel(DataTable table, HttpContext context, string fileName)
    {
        using (MemoryStream ms = RenderToExcel(table))
        {
            RenderToBrowser(ms, context, fileName);
        }
    }

    /// <summary>
    /// Tells whether the first sheet of an Excel document stream contains any rows.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <returns><c>true</c> when the sheet has at least one physical row.</returns>
    public static bool HasData(Stream excelFileStream)
    {
        return HasData(excelFileStream, 0);
    }

    /// <summary>
    /// Tells whether the given sheet of an Excel document stream contains any rows.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="sheetIndex">Zero-based sheet index.</param>
    /// <returns><c>true</c> when the sheet exists and has at least one physical row.</returns>
    public static bool HasData(Stream excelFileStream, int sheetIndex)
    {
        using (excelFileStream)
        {
            using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
            {
                if (workbook.NumberOfSheets > 0 && sheetIndex < workbook.NumberOfSheets)
                {
                    using (ISheet sheet = workbook.GetSheetAt(sheetIndex))
                    {
                        return sheet.PhysicalNumberOfRows > 0;
                    }
                }
            }
        }
        return false;
    }

    /// <summary>
    /// Converts a named sheet of an Excel document stream into a <see cref="DataTable"/>.
    /// The first row must be the header row.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="sheetName">Sheet name.</param>
    /// <returns>The sheet content as a table.</returns>
    public static DataTable RenderFromExcel(Stream excelFileStream, string sheetName)
    {
        return RenderFromExcel(excelFileStream, sheetName, 0);
    }

    /// <summary>
    /// Converts a named sheet of an Excel document stream into a <see cref="DataTable"/>.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="sheetName">Sheet name.</param>
    /// <param name="headerRowIndex">Zero-based index of the header row.</param>
    /// <returns>The sheet content as a table.</returns>
    /// <exception cref="ArgumentException">No sheet named <paramref name="sheetName"/> was found.</exception>
    public static DataTable RenderFromExcel(Stream excelFileStream, string sheetName, int headerRowIndex)
    {
        DataTable table = null;

        using (excelFileStream)
        {
            using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
            {
                using (ISheet sheet = workbook.GetSheet(sheetName))
                {
                    if (sheet == null)
                    {
                        // Report the real problem instead of failing later with a NullReferenceException.
                        throw new ArgumentException("Sheet not found: " + sheetName, "sheetName");
                    }

                    table = RenderFromExcel(sheet, headerRowIndex);
                }
            }
        }
        return table;
    }

    /// <summary>
    /// Converts the first sheet of an Excel document stream into a <see cref="DataTable"/>.
    /// The first row must be the header row.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <returns>The sheet content as a table.</returns>
    public static DataTable RenderFromExcel(Stream excelFileStream)
    {
        return RenderFromExcel(excelFileStream, 0, 0);
    }

    /// <summary>
    /// Converts the given sheet of an Excel document stream into a <see cref="DataTable"/>.
    /// The first row must be the header row.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="sheetIndex">Zero-based sheet index.</param>
    /// <returns>The sheet content as a table.</returns>
    public static DataTable RenderFromExcel(Stream excelFileStream, int sheetIndex)
    {
        return RenderFromExcel(excelFileStream, sheetIndex, 0);
    }

    /// <summary>
    /// Converts the given sheet of an Excel document stream into a <see cref="DataTable"/>.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="sheetIndex">Zero-based sheet index.</param>
    /// <param name="headerRowIndex">Zero-based index of the header row.</param>
    /// <returns>The sheet content as a table.</returns>
    public static DataTable RenderFromExcel(Stream excelFileStream, int sheetIndex, int headerRowIndex)
    {
        DataTable table = null;

        using (excelFileStream)
        {
            using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
            {
                using (ISheet sheet = workbook.GetSheetAt(sheetIndex))
                {
                    table = RenderFromExcel(sheet, headerRowIndex);
                }
            }
        }
        return table;
    }

    /// <summary>
    /// Converts an Excel sheet into a <see cref="DataTable"/>. Columns are taken from the
    /// header row; every row after the header row becomes a data row.
    /// </summary>
    /// <param name="sheet">Sheet to read.</param>
    /// <param name="headerRowIndex">Zero-based index of the header row.</param>
    /// <returns>The sheet content; an empty table when the header row does not exist.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="sheet"/> is <c>null</c>.</exception>
    private static DataTable RenderFromExcel(ISheet sheet, int headerRowIndex)
    {
        if (sheet == null)
        {
            throw new ArgumentNullException("sheet");
        }

        DataTable table = new DataTable();

        IRow headerRow = sheet.GetRow(headerRowIndex);
        if (headerRow == null)
        {
            // No header row means there is nothing to map columns from.
            return table;
        }

        int firstCol = headerRow.FirstCellNum;
        int cellCount = headerRow.LastCellNum;
        int rowCount = sheet.LastRowNum;

        // Header: GetCellValue tolerates missing or non-string header cells.
        for (int i = firstCol; i < cellCount; i++)
        {
            DataColumn column = new DataColumn(GetCellValue(headerRow.GetCell(i)));
            table.Columns.Add(column);
        }

        // Data starts right after the header row (not after the sheet's first row,
        // which differs when rows precede the header).
        for (int i = headerRowIndex + 1; i <= rowCount; i++)
        {
            IRow row = sheet.GetRow(i);
            DataRow dataRow = table.NewRow();

            if (row != null)
            {
                for (int j = firstCol; j < cellCount; j++)
                {
                    ICell cell = row.GetCell(j);
                    if (cell != null)
                    {
                        // Table columns start at the header's first cell, so shift the index.
                        dataRow[j - firstCol] = GetCellValue(cell);
                    }
                }
            }

            table.Rows.Add(dataRow);
        }

        return table;
    }

    /// <summary>
    /// Imports an Excel document into a database using the first sheet.
    /// The first row must be the header row.
    /// </summary>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="insertSql">Leading part of the insert statement (everything before " values (").</param>
    /// <param name="dbAction">Callback that executes the generated SQL.</param>
    /// <returns>Sum of the values returned by <paramref name="dbAction"/>.</returns>
    public static int RenderToDb(Stream excelFileStream, string insertSql, DBAction dbAction)
    {
        return RenderToDb(excelFileStream, insertSql, dbAction, 0, 0);
    }

    /// <summary>
    /// Callback that executes a SQL statement and returns an integer result that
    /// <c>RenderToDb</c> accumulates.
    /// </summary>
    /// <param name="sql">SQL text to execute.</param>
    /// <param name="parameters">Optional statement parameters.</param>
    /// <returns>Integer result of the execution.</returns>
    public delegate int DBAction(string sql, params IDataParameter[] parameters);

    /// <summary>
    /// Imports an Excel document into a database.
    /// </summary>
    /// <remarks>
    /// NOTE(review): cell values are inlined into the SQL text with single quotes doubled.
    /// That is weaker than parameterized commands; only use this with trusted workbooks,
    /// or have <paramref name="dbAction"/> target a restricted account.
    /// </remarks>
    /// <param name="excelFileStream">Excel document stream; disposed by this call.</param>
    /// <param name="insertSql">Leading part of the insert statement (everything before " values (").</param>
    /// <param name="dbAction">Callback that executes the generated SQL.</param>
    /// <param name="sheetIndex">Zero-based sheet index.</param>
    /// <param name="headerRowIndex">Zero-based index of the header row.</param>
    /// <returns>Sum of the values returned by <paramref name="dbAction"/>.</returns>
    public static int RenderToDb(Stream excelFileStream, string insertSql, DBAction dbAction, int sheetIndex, int headerRowIndex)
    {
        int rowAffected = 0;
        using (excelFileStream)
        {
            using (IWorkbook workbook = new HSSFWorkbook(excelFileStream))
            {
                using (ISheet sheet = workbook.GetSheetAt(sheetIndex))
                {
                    IRow headerRow = sheet.GetRow(headerRowIndex);
                    if (headerRow == null)
                    {
                        // No header row: nothing to import.
                        return rowAffected;
                    }

                    int firstCol = headerRow.FirstCellNum;
                    int cellCount = headerRow.LastCellNum;
                    int rowCount = sheet.LastRowNum;

                    if (firstCol < 0 || cellCount <= firstCol)
                    {
                        // Header row has no cells, so no value list can be built.
                        return rowAffected;
                    }

                    StringBuilder builder = new StringBuilder();

                    // Data starts right after the header row.
                    for (int i = headerRowIndex + 1; i <= rowCount; i++)
                    {
                        IRow row = sheet.GetRow(i);
                        if (row != null)
                        {
                            builder.Append(insertSql);
                            builder.Append(" values (");

                            // Always walk the header's column range so that every statement
                            // carries the same number of values, even for rows whose leading
                            // cells are empty.
                            for (int j = firstCol; j < cellCount; j++)
                            {
                                builder.AppendFormat("'{0}',", GetCellValue(row.GetCell(j)).Replace("'", "''"));
                            }

                            // Drop the trailing comma.
                            builder.Length = builder.Length - 1;
                            builder.Append(");");
                        }

                        if ((i % 50 == 0 || i == rowCount) && builder.Length > 0)
                        {
                            // Flush the accumulated statements whenever the row index hits a
                            // multiple of 50, and once more at the last row.
                            rowAffected += dbAction(builder.ToString());
                            builder.Length = 0;
                        }
                    }
                }
            }
        }
        return rowAffected;
    }
}
代碼沒什么高級的地方,關鍵是看邏輯是否清晰,我這里需要優化的地方還很多。數據采集無非就是異步委托、多線程同步等等,就看你怎么靈活運用。
看了評論有很多需要源碼的,源碼分享於此:http://pan.baidu.com/s/1HagB8 密碼:g4uw
源碼還有很多不足的地方,可以看出,代碼也有很多冗余的,很多注釋都沒時間去清理,
希望可以在你們的手上做得更好,而不是下載源碼后做一個僵屍放到自己的硬盤里面。