C#將HTML表格轉換成DataTable

1.最近在寫爬蟲的時候,有的數據是以HTML的<table>形式披露的,其中的單元格會帶有rowspan和colspan屬性,無法直接按行列讀取。

下圖是個簡單的例子:

 

對應的HTML代碼如下:

<!-- Sample table: a 6-column header row followed by three data rows.
     Cell "1" spans 3 rows; cell "8" spans 2 rows and 3 columns, so the
     last two rows contain fewer td elements than the header row. -->
<table border="1">
<tr>
  <td>Column1</td>
  <td>Column2</td>
  <td>Column3</td>
  <td>Column4</td>
  <td>Column5</td>
  <td>Column6</td>
</tr>
<tr>
  <td rowspan=3>1</td>
  <td>2</td>
  <td>3</td>
  <td>4</td>
  <td>5</td>
  <td>6</td>
</tr>
<tr>
  <td>7</td>
  <td rowspan=2 colspan=3>8</td>
  <td>9</td>
</tr>
<tr>
  <td>10</td>
  <td>11</td>
</tr>
</table>
HTML Code

2.我們需要的數據應該是下面這個樣子的,才比較方便處理:把跨行、跨列的單元格展開,使每一行都有完整的6列,合併單元格的值在它覆蓋的每個位置重複出現。

3.那麼如何轉換呢?這裡我們需要引用HtmlAgilityPack.dll。

代碼如下:

using System;
using System.Data;
using System.Linq;
using HtmlAgilityPack;

namespace HtmlToDataTable
{
    /// <summary>
    /// Demo program that flattens the first HTML table of a document, including cells
    /// that use rowspan/colspan, into a rectangular <see cref="DataTable"/>.
    /// </summary>
    static class Program
    {
        /// <summary>
        /// Upper bound applied to a single colspan value so that a bogus attribute
        /// (for example colspan=999999) cannot force a huge allocation.
        /// </summary>
        private const int MaxColumnSpan = 1000;

        /// <summary>
        /// Main entry point of the application.
        /// </summary>
        [STAThread]
        static void Main()
        {
            const string html = "<table border=\"1\"><tr><td>Column1</td><td>Column2</td><td>Column3</td><td>Column4</td><td>Column5</td><td>Column6</td></tr><tr><td rowspan=3>1</td><td>2</td><td></td><td>4</td><td>5</td><td>6</td></tr><tr><td></td><td rowspan=2 colspan=3>7</td><td>9</td></tr><tr><td></td><td>8</td></tr></table>";
            var dt = HtmlToDataTable(html);
        }

        /// <summary>
        /// Converts the first table found in <paramref name="hrml"/> into a <see cref="DataTable"/>.
        /// The first table row supplies the column names; every following row becomes a data row.
        /// A cell with rowspan/colspan has its text repeated in every grid position it covers.
        /// </summary>
        /// <param name="hrml">HTML markup to parse. May be null or empty.</param>
        /// <returns>
        /// The populated table, or an empty <see cref="DataTable"/> when the input is null/empty
        /// or contains no table, no rows, or no cells.
        /// </returns>
        public static DataTable HtmlToDataTable(string hrml)
        {
            var dt = new DataTable();
            if (string.IsNullOrEmpty(hrml))
            {
                return dt;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(hrml);

            var tList = doc.DocumentNode.SelectNodes("//table");
            if (tList == null)
            {
                return dt;
            }
            var table = tList[0];

            // ".//tr" is relative to this table; a plain "//tr" would search the whole document.
            var rowNodes = table.SelectNodes(".//tr");
            if (rowNodes == null)
            {
                return dt;
            }

            // Keep only rows whose nearest enclosing table is this one, so rows of nested
            // tables are not mixed into the outer grid.
            var rows = rowNodes
                .Where(tr => object.ReferenceEquals(tr.Ancestors("table").FirstOrDefault(), table))
                .ToArray();
            if (rows.Length == 0)
            {
                return dt;
            }

            var grid = BuildGrid(rows);
            var width = grid.Max(line => line.Length);
            if (width == 0)
            {
                return dt;
            }

            // The first grid row provides the column names.
            for (var j = 0; j < width; j++)
            {
                var header = j < grid[0].Length ? grid[0][j] : null;
                var baseName = string.IsNullOrEmpty(header) ? "Column" + j : header;
                dt.Columns.Add(MakeUniqueColumnName(dt.Columns, baseName));
            }

            // Remaining grid rows become data rows; positions no cell covered stay DBNull.
            for (var i = 1; i < grid.Length; i++)
            {
                var row = dt.NewRow();
                for (var k = 0; k < grid[i].Length; k++)
                {
                    if (grid[i][k] != null)
                    {
                        row[k] = grid[i][k];
                    }
                }
                dt.Rows.Add(row);
            }

            return dt;
        }

        /// <summary>
        /// Lays the cells of <paramref name="rows"/> out on a jagged grid.
        /// A null entry means "no cell covers this position"; an empty string is an empty cell.
        /// </summary>
        private static string[][] BuildGrid(HtmlNode[] rows)
        {
            var grid = new string[rows.Length][];
            for (var r = 0; r < grid.Length; r++)
            {
                grid[r] = new string[0];
            }

            // Fill the data.
            for (var row = 0; row < rows.Length; row++)
            {
                var cursor = 0;
                foreach (var cell in rows[row].ChildNodes.Where(n => IsCell(n)))
                {
                    var colspan = Math.Min(ReadSpan(cell, "colspan"), MaxColumnSpan);
                    // A rowspan may not reach below the last row of the table.
                    var rowspan = Math.Min(ReadSpan(cell, "rowspan"), rows.Length - row);
                    var text = cell.InnerText ?? string.Empty;

                    // Skip positions already claimed by a rowspan coming from a row above.
                    while (cursor < grid[row].Length && grid[row][cursor] != null)
                    {
                        cursor++;
                    }

                    for (var i = 0; i < rowspan; i++)
                    {
                        var line = grid[row + i];
                        if (line.Length < cursor + colspan)
                        {
                            Array.Resize(ref line, cursor + colspan);
                            grid[row + i] = line;
                        }

                        for (var j = 0; j < colspan; j++)
                        {
                            // On overlapping spans the cell that was placed first wins.
                            if (line[cursor + j] == null)
                            {
                                line[cursor + j] = text;
                            }
                        }
                    }

                    cursor += colspan;
                }
            }

            return grid;
        }

        /// <summary>
        /// Returns true when <paramref name="node"/> is a table cell element (td or th).
        /// </summary>
        private static bool IsCell(HtmlNode node)
        {
            return string.Equals(node.Name, "td", StringComparison.OrdinalIgnoreCase)
                || string.Equals(node.Name, "th", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Reads a rowspan/colspan attribute; a missing, malformed or non-positive value counts as 1.
        /// </summary>
        private static int ReadSpan(HtmlNode cell, string attributeName)
        {
            var attr = cell.Attributes[attributeName];
            if (attr == null)
            {
                return 1;
            }

            int span;
            var parsed = int.TryParse(
                attr.Value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out span);
            return parsed && span >= 1 ? span : 1;
        }

        /// <summary>
        /// Returns <paramref name="baseName"/>, or that name with a numeric suffix ("_2", "_3", ...)
        /// when a column with the same name already exists, so adding the column cannot throw
        /// a DuplicateNameException (e.g. for a header cell that spans several columns).
        /// </summary>
        private static string MakeUniqueColumnName(DataColumnCollection columns, string baseName)
        {
            var candidate = baseName;
            var suffix = 2;
            while (columns.Contains(candidate))
            {
                candidate = baseName + "_" + suffix;
                suffix++;
            }
            return candidate;
        }
    }
}
View Code

4.轉換成DataTable入庫就比較方便了。

 代碼未經嚴格測試,如有不當之處,敬請指出!


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM
轉換成DataTable