ASP.NET 網頁抓取內容


網頁抓取代碼

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
//
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;

namespace WSYL.Web.Common
{
    /// <summary>
    /// Downloads a web page and extracts steamship information from specific
    /// table cells (<c>&lt;td align="left" bgcolor="#EEEEEE"&gt;</c>) in its HTML.
    /// </summary>
    public static class GetSteamShipInfo
    {
        // Address of the page to crawl. The value is a placeholder carried over from
        // the original snippet and has to be replaced with the real URL before use.
        private const string UrlToCrawl = @"網址";

        // Charset used to decode the response body; a wrong charset yields garbled text.
        private const string HtmlCharset = "utf-8";

        // Upper bound (in milliseconds) for both obtaining the response and reading
        // its body, so that an unresponsive remote site cannot hang the calling page.
        private const int RequestTimeoutMilliseconds = 15000;

        // Matches the table cells that carry the wanted values; group 1 is the cell text.
        private static readonly Regex InfoCellRegex = new Regex(
            "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Matches a single HTML tag, including tags that span several lines.
        private static readonly Regex HtmlTagRegex = new Regex("<(.|\n)+?>");

        // Matches any run of whitespace characters.
        private static readonly Regex WhitespaceRunRegex = new Regex(@"\s+");

        /// <summary>
        /// Fetches the configured page and returns the requested part of the
        /// steamship information found in it.
        /// </summary>
        /// <param name="steamshipname">
        /// Steamship name. It is only validated here; the visible code does not use it
        /// to build the request URL.
        /// NOTE(review): presumably it should become part of the URL - confirm.
        /// </param>
        /// <param name="itype">
        /// Selects the result: 0 = text of the first matching cell, 1 = tag-stripped
        /// text of the second matching cell, 2 = both joined with "/". Any other value
        /// returns the complete downloaded HTML.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="steamshipname"/> is null or blank; an empty
        /// string when the first matching cell is missing or empty; otherwise the value
        /// selected by <paramref name="itype"/>.
        /// </returns>
        /// <exception cref="WebException">
        /// The request failed or exceeded the configured timeout.
        /// </exception>
        public static string GetWebSite(string steamshipname, int itype)
        {
            if (steamshipname == null || steamshipname.Trim().Length == 0)
            {
                return null;
            }

            string respHtml = DownloadHtml(UrlToCrawl);
            return ExtractInfo(respHtml, itype);
        }

        /// <summary>
        /// Performs an HTTP GET against <paramref name="url"/> and returns the decoded body.
        /// </summary>
        /// <param name="url">Absolute address of the page to download.</param>
        /// <returns>The response body decoded with <see cref="HtmlCharset"/>.</returns>
        private static string DownloadHtml(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";
            // Without explicit limits a stalled server keeps the caller blocked for minutes.
            req.Timeout = RequestTimeoutMilliseconds;
            req.ReadWriteTimeout = RequestTimeoutMilliseconds;

            Encoding htmlEncoding = Encoding.GetEncoding(HtmlCharset);

            // Dispose response, stream and reader deterministically; otherwise the
            // underlying connection stays occupied until the garbage collector runs.
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream body = resp.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, htmlEncoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Picks the requested value(s) out of the downloaded HTML.
        /// </summary>
        /// <param name="html">Complete HTML of the crawled page.</param>
        /// <param name="itype">Result selector, see <see cref="GetWebSite"/>.</param>
        /// <returns>The selected text, or an empty string when nothing usable was found.</returns>
        private static string ExtractInfo(string html, int itype)
        {
            Match firstCell = InfoCellRegex.Match(html);
            string firstValue = firstCell.Groups[1].Value;

            // A failed match also yields an empty group value, so this covers both
            // "no such cell" and "cell without text".
            if (firstValue.Length == 0)
            {
                return "";
            }

            switch (itype)
            {
                case 0:
                    return firstValue;
                case 1:
                    return StripHtml(firstCell.NextMatch().Value);
                case 2:
                    return firstValue + "/" + StripHtml(firstCell.NextMatch().Value);
                default:
                    // Kept for backward compatibility: unknown selectors get the raw page.
                    return html;
            }
        }

        /// <summary>
        /// Removes HTML tags and collapses whitespace. Some unusual input may not be
        /// cleaned completely in one pass, so applying the conversion twice is advised.
        /// Long runs of whitespace left behind by removed markup would get in the way
        /// of later string handling, hence they are reduced to a single space.
        /// </summary>
        /// <param name="strHtml">Markup to clean.</param>
        /// <returns>The text without tags, whitespace-normalized and trimmed.</returns>
        private static string StripHtml(string strHtml)
        {
            string strOutput = HtmlTagRegex.Replace(strHtml, "");

            // Escape stray angle brackets that were not part of a complete tag.
            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");

            // Turn every run of whitespace into one space.
            strOutput = WhitespaceRunRegex.Replace(strOutput, " ");
            return strOutput.Trim();
        }
    }
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM