1.最近在寫爬蟲的時候,有的數據是用HTML的<TABLE>披露的,披露的時候會包含rowspan和colspan,
下圖是個簡單的例子:
對應的HTML代碼如下:

<!-- Sample input: a header row followed by three data rows that use rowspan/colspan. -->
<table border="1">
  <tr>
    <td>Column1</td>
    <td>Column2</td>
    <td>Column3</td>
    <td>Column4</td>
    <td>Column5</td>
    <td>Column6</td>
  </tr>
  <tr>
    <!-- "1" spans this row and the two rows below it. -->
    <td rowspan="3">1</td>
    <td>2</td>
    <td>3</td>
    <td>4</td>
    <td>5</td>
    <td>6</td>
  </tr>
  <tr>
    <td>7</td>
    <!-- "8" covers a 2-row by 3-column rectangle. -->
    <td rowspan="2" colspan="3">8</td>
    <td>9</td>
  </tr>
  <tr>
    <td>10</td>
    <td>11</td>
  </tr>
</table>
2.我們需要的數據應該是下面這個樣子的(也就是把 rowspan、colspan 合併的單元格展開,讓每一行的每一列都有對應的值),才比較方便處理。
3.那麼如何轉換呢?這裡我們需要引用 HtmlAgilityPack.dll。
代碼如下:

using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.Linq;
using HtmlAgilityPack;

namespace HtmlToDataTable
{
    /// <summary>
    /// Converts the first HTML <c>&lt;table&gt;</c> found in a markup string into a
    /// <see cref="DataTable"/>, expanding <c>rowspan</c>/<c>colspan</c> so that every
    /// grid position covered by a merged cell receives that cell's text.
    /// </summary>
    static class Program
    {
        // Upper bounds for span attributes. They mirror the limits the HTML Standard
        // puts on colspan/rowspan and keep hostile markup from forcing huge allocations.
        private const int MaxColSpan = 1000;
        private const int MaxRowSpan = 65534;

        /// <summary>
        /// The main entry point for the application. Runs the converter on a small sample table.
        /// </summary>
        [STAThread]
        static void Main()
        {
            const string hrml = "<table border=\"1\"><tr><td>Column1</td><td>Column2</td><td>Column3</td><td>Column4</td><td>Column5</td><td>Column6</td></tr><tr><td rowspan=3>1</td><td>2</td><td></td><td>4</td><td>5</td><td>6</td></tr><tr><td></td><td rowspan=2 colspan=3>7</td><td>9</td></tr><tr><td></td><td>8</td></tr></table>";
            var dt = HtmlToDataTable(hrml);
        }

        /// <summary>
        /// Parses <paramref name="hrml"/> and converts its first table into a <see cref="DataTable"/>.
        /// </summary>
        /// <remarks>
        /// The first table row supplies the column names; every following row becomes a data row.
        /// Both <c>td</c> and <c>th</c> cells are read. Cell text is entity-decoded and trimmed.
        /// Empty or missing header cells are named <c>"Column" + index</c>, and repeated names
        /// (for example a header cell with a colspan) get a numeric suffix such as <c>"_2"</c>.
        /// Grid positions that no cell covers are left as <see cref="DBNull"/>.
        /// </remarks>
        /// <param name="hrml">HTML markup that may contain a table.</param>
        /// <returns>
        /// The converted table, or an empty <see cref="DataTable"/> when the markup contains
        /// no table or the table has no rows.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="hrml"/> is <c>null</c>.</exception>
        public static DataTable HtmlToDataTable(string hrml)
        {
            if (hrml == null)
            {
                throw new ArgumentNullException("hrml");
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(hrml);

            var table = doc.DocumentNode.SelectSingleNode("//table");
            if (table == null)
            {
                return new DataTable();
            }

            // Relative paths only: an absolute "//tr" would also pick up rows that belong
            // to other tables in the document (or to tables nested inside this one).
            var rows = table.SelectNodes("./tr|./thead/tr|./tbody/tr|./tfoot/tr");
            if (rows == null)
            {
                return new DataTable();
            }

            return ToDataTable(BuildGrid(rows));
        }

        /// <summary>
        /// Lays the cells of <paramref name="rows"/> out on a jagged grid, one list per table row.
        /// A <c>null</c> entry marks a position no cell has claimed yet; an empty cell is stored
        /// as an empty string, so no sentinel text is needed.
        /// </summary>
        private static List<List<string>> BuildGrid(IList<HtmlNode> rows)
        {
            var grid = new List<List<string>>(rows.Count);
            for (var r = 0; r < rows.Count; r++)
            {
                grid.Add(new List<string>());
            }

            for (var r = 0; r < rows.Count; r++)
            {
                var current = grid[r];
                var col = 0;

                foreach (var cell in rows[r].ChildNodes.Where(IsCell))
                {
                    // Skip positions already claimed by a rowspan coming from a row above.
                    while (col < current.Count && current[col] != null)
                    {
                        col++;
                    }

                    var colspan = Math.Max(1, ReadSpan(cell, "colspan", MaxColSpan));
                    var rowspan = ReadSpan(cell, "rowspan", MaxRowSpan);

                    // rowspan="0" means "down to the end"; any span is cut off at the last row.
                    var endRow = rowspan == 0 ? rows.Count : Math.Min(rows.Count, r + rowspan);
                    var text = (HtmlEntity.DeEntitize(cell.InnerText ?? string.Empty) ?? string.Empty).Trim();

                    for (var rr = r; rr < endRow; rr++)
                    {
                        var line = grid[rr];
                        while (line.Count < col + colspan)
                        {
                            line.Add(null);
                        }

                        for (var cc = col; cc < col + colspan; cc++)
                        {
                            // When malformed markup makes two cells overlap, the first one wins.
                            if (line[cc] == null)
                            {
                                line[cc] = text;
                            }
                        }
                    }

                    col += colspan;
                }
            }

            return grid;
        }

        /// <summary>
        /// Builds the <see cref="DataTable"/>: grid row 0 becomes the columns, the rest become data rows.
        /// The column count is the widest grid row, so rows wider than the header are not truncated.
        /// </summary>
        private static DataTable ToDataTable(List<List<string>> grid)
        {
            var dt = new DataTable();
            if (grid.Count == 0)
            {
                return dt;
            }

            var width = grid.Max(line => line.Count);
            var header = grid[0];
            var usedNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

            for (var c = 0; c < width; c++)
            {
                var headerText = c < header.Count ? header[c] : null;
                var baseName = string.IsNullOrEmpty(headerText) ? "Column" + c : headerText;

                // DataColumnCollection rejects duplicate names, so make each name unique.
                var name = baseName;
                for (var n = 2; !usedNames.Add(name); n++)
                {
                    name = baseName + "_" + n;
                }

                dt.Columns.Add(name);
            }

            for (var r = 1; r < grid.Count; r++)
            {
                var line = grid[r];
                var row = dt.NewRow();
                for (var c = 0; c < line.Count; c++)
                {
                    if (line[c] != null)
                    {
                        row[c] = line[c];
                    }
                }

                dt.Rows.Add(row);
            }

            return dt;
        }

        /// <summary>Returns true for table cell elements (<c>td</c> or <c>th</c>).</summary>
        private static bool IsCell(HtmlNode node)
        {
            return string.Equals(node.Name, "td", StringComparison.OrdinalIgnoreCase)
                || string.Equals(node.Name, "th", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Reads a span attribute. Returns 1 when the attribute is missing, malformed or negative;
        /// otherwise the parsed value capped at <paramref name="max"/> (0 is passed through so the
        /// caller can give it its own meaning).
        /// </summary>
        private static int ReadSpan(HtmlNode cell, string attributeName, int max)
        {
            var attr = cell.Attributes[attributeName];
            if (attr == null)
            {
                return 1;
            }

            int value;
            if (!int.TryParse(attr.Value, NumberStyles.Integer, CultureInfo.InvariantCulture, out value) || value < 0)
            {
                return 1;
            }

            return Math.Min(value, max);
        }
    }
}
4.轉換成DataTable入庫就比較方便了。
代碼未經嚴格測試,如有不當之處,敬請指出!