VBA分別使用MSXML的DOM屬性和XPATH進行網頁爬蟲


本文要重點介紹的是VBA中的XmlHttp對象(MSXML2.XMLHTTP或MSXML.XMLHTTP),它可以向http服務器發送請求並使用微軟XML文檔對象模型Microsoft XML Document Object Model (DOM)處理回應。練習抓取的網頁例子是https://www.qppstudio.net/public-holidays-by-date/month1.htm

第一種方法——DOM經典屬性:

參考http://club.excelhome.net/thread-1233167-1-1.htmlhttps://www.jianshu.com/p/1920550cb4a6

Sub Main()
ActiveSheet.Cells.Clear
Url = "https://www.qppstudio.net/public-holidays-by-date/month1.htm"
Set oHttp = CreateObject("MSXML2.XMLHTTP") '創建一個xmlhttp對象
Set odom = CreateObject("htmlfile") '創建一個Dom對象
With oHttp
'open,創建一個新的http請求,並指定此請求的方法、URL以及驗證信息(用戶名/密碼) 'send,發送請求到http服務器並接收回應 .Open "GET", Url, False '使用Open方法,用get請求,False代表非異步加載
    .Open "GET", Url, False '使用Open方法,用get請求,False代表非異步加載
    .send '將open方法的信息發送給網頁服務器
     odom.body.innerHTML = .responseText '將響應網頁的HTML賦值給Dom對象,並只需要body標簽里面的內容
End With
dom (odom)
End Sub
Sub dom(odom As Object) i = 2 For Each Item In odom.all If Item.className = "list-item" Then For Each itemch In Item.Children If itemch.className = "list-item-heading" Then Range("a" & i) = itemch.innerText ElseIf itemch.className = "list-subitem" Then Range("b" & i) = itemch.Children(1).innerText Range("c" & i) = itemch.Children(3).innerText i = i + 1 End If Next Exit For End If Next End Sub

第二種方法——轉換為XML並使用XPATH(比較麻煩):

參考http://club.excelhome.net/thread-1233167-1-1.html

Sub Main()
Url = "https://www.qppstudio.net/public-holidays-by-date/month1.htm"
Set oHttp = CreateObject("MSXML2.XMLHTTP") '創建一個xmlhttp對象
Set odom = CreateObject("htmlfile") '創建一個Dom對象
With oHttp
'open,創建一個新的http請求,並指定此請求的方法、URL以及驗證信息(用戶名/密碼) 'send,發送請求到http服務器並接收回應 .Open "GET", Url, False '使用Open方法,用get請求,False代表非異步加載
    .Open "GET", Url, False '使用Open方法,用get請求,False代表非異步加載
    .send '將open方法的信息發送給網頁服務器
     odom.body.innerHTML = .responseText '將響應網頁的HTML賦值給Dom對象,並只需要body標簽里面的內容
End With

'需要先將html文本進行格式化才能寫入xmldoc,才能使用自帶的xpath,比如節點一定要有開始和結束,節點屬性一定要用雙引號括起來
'例如
'sXML = "<NewDataSet class=""123""><MyTable>"
'sXML = sXML & "     <Active>true</Active>"
'sXML = sXML & "     <SQLServer>APCD03</SQLServer>"
'sXML = sXML & "     <SQLDatabase>OIS</SQLDatabase>"
'sXML = sXML & " </MyTable>"
'sXML = sXML & " <MyTable>"
'sXML = sXML & "     <Active>false</Active>"
'sXML = sXML & "     <SQLServer>APCD04</SQLServer>"
'sXML = sXML & "     <SQLDatabase>OIS</SQLDatabase>"
'sXML = sXML & " </MyTable></NewDataSet>"
'Debug.Print sXML
Dim sXML As String, xDoc, a, nodelist, node
For Each Item In odom.all
  If Item.className = "list-item" Then
    sXML = Item.outerHTML
    Exit For
  End If
Next
sXML = rr(sXML, "<IMG.*?>", "")
sXML = rr(sXML, "class=.*?>", ">")
Set xDoc = CreateObject("MSXML.DOMDocument")
a = xDoc.LoadXML(sXML)
'a為true時代表寫入成功,為false代表寫入失敗
'Debug.Print a
'一旦a為false就可以先寫入txt再看哪些還不符合xml規范
'file = ThisWorkbook.Path & "\test.txt"
'Open file For Output As #1
'Print #1, sXML
'Close #1
Set nodelist = xDoc.SelectNodes("//P")
Set node = xDoc.SelectSingleNode("//P")
'Debug.Print nodelist.Length
For Each Item In nodelist
Debug.Print Item.Text
Next
End Sub

Function rr(str As String, pattern As String, repstr As String)
Set reg = CreateObject("vbscript.regexp")
With reg
.Global = True
.pattern = pattern
End With
rr = reg.Replace(str, repstr)
End Function

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM