這次還是處理與視頻下載相關的問題。
有些網站的頁面內容是在頁面加載後由 JS 動態生成的,那麼其原始的 HTML 便不能直接使用;我們需要的是頁面渲染後的代碼。
在 C# 中,我用 WebBrowser 這個控件來處理:將項目類型設置為控制台程序,並加一個 Form 來承載 WebBrowser。
記錄代碼以做備忘:
using System;
using System.IO;
using System.Net;
using System.Runtime.InteropServices;
using System.Text;
using System.Windows.Forms;
using Microsoft.Win32;

namespace crpj
{
    /// <summary>
    /// Invisible host window for the <see cref="WebBrowser"/> control.
    /// An instance is also assigned to <c>WebBrowser.ObjectForScripting</c>,
    /// which is why the class is COM-visible and exposes a public helper.
    /// </summary>
    [ComVisible(true)]
    public class Form : System.Windows.Forms.Form
    {
        /// <summary>
        /// Ignores the requested visibility and always keeps the window hidden.
        /// </summary>
        protected override void SetVisibleCore(bool value)
        {
            base.SetVisibleCore(false);
        }

        /// <summary>
        /// Downloads the raw (un-rendered) content of <paramref name="url"/> as UTF-8 text.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <returns>The response body decoded as UTF-8.</returns>
        public string GetHtmlCode(string url)
        {
            using (var wc = new WebClient())
            {
                wc.Encoding = Encoding.UTF8;
                return wc.DownloadString(url);
            }
        }
    }

    /// <summary>
    /// Console entry point: loads a URL in a hidden WebBrowser, optionally injects
    /// JavaScript and/or waits, then writes the rendered &lt;html&gt; element to stdout.
    /// </summary>
    class Program
    {
        // Fires once, `delay` ms after the document completed, to dump the page.
        private static Timer tmrGet = new Timer();
        // Watchdog: dumps whatever has been rendered if the page never completes.
        private static Timer tmrExit = new Timer();
        private static WebBrowser browser = new WebBrowser();

        // Delay (ms) between document completion and dumping; 0 = dump immediately.
        private static int delay = 0;
        // JavaScript to inject after the document completed (optional).
        private static string jsCode;
        // Guards OutputHtml so the page is written (and exit requested) only once,
        // even if both timers end up firing.
        private static bool outputDone = false;

        // Watchdog timeout in milliseconds (90 seconds).
        const int ExitTimeoutMs = 90000;

        // Disables the navigation "click" sound for this process.
        const int FEATURE_DISABLE_NAVIGATION_SOUNDS = 21;
        const int SET_FEATURE_ON_PROCESS = 0x00000002;

        [DllImport("urlmon.dll")]
        [PreserveSig]
        [return: MarshalAs(UnmanagedType.Error)]
        static extern int CoInternetSetFeatureEnabled(
            int FeatureEntry,
            [MarshalAs(UnmanagedType.U4)] int dwFlags,
            bool fEnable);

        /// <summary>
        /// Main entry point of the application.
        /// </summary>
        /// <param name="args">
        /// Argument list: <c>url [delay [jscode]]</c>. <c>delay</c> is a non-negative
        /// number of milliseconds; <c>jscode</c> is the body of a script function.
        /// </param>
        [STAThread]
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("error: You must provide at least one URL.");
                return;
            }

            // Validate the optional arguments before any navigation starts, so a
            // malformed delay yields a readable message instead of an unhandled exception.
            if (args.Length > 1)
            {
                int parsedDelay;
                if (!int.TryParse(args[1], out parsedDelay) || parsedDelay < 0)
                {
                    Console.WriteLine("error: delay must be a non-negative integer (milliseconds).");
                    return;
                }
                delay = parsedDelay;
            }
            if (args.Length > 2)
            {
                jsCode = args[2];
            }

            CoInternetSetFeatureEnabled(
                FEATURE_DISABLE_NAVIGATION_SOUNDS,
                SET_FEATURE_ON_PROCESS,
                true);
            CheckAndSetBrowserEmulation();

            var form = new Form();
            form.Controls.Add(browser);
            browser.ObjectForScripting = form;
            browser.ScriptErrorsSuppressed = true;
            browser.DocumentCompleted += browser_DocumentCompleted;

            // Pages sometimes need time for their scripts to initialise, so the
            // content can be fetched `delay` ms after the document completed.
            tmrGet.Tick += new EventHandler(tmrGet_Tick);
            if (delay > 0)
            {
                tmrGet.Interval = delay;
            }

            // Some pages never raise DocumentCompleted, or take very long to do so;
            // after ExitTimeoutMs the watchdog dumps the current state and exits.
            tmrExit.Tick += new EventHandler(tmrExit_Tick);
            tmrExit.Interval = ExitTimeoutMs;
            tmrExit.Start();

            browser.Navigate(args[0]);
            Application.Run(form);
        }

        /// <summary>Watchdog tick: give up waiting and dump what is there.</summary>
        static void tmrExit_Tick(object sender, EventArgs e)
        {
            OutputHtml();
        }

        /// <summary>
        /// Makes the WebBrowser control render with the IE11 engine by registering
        /// this executable under FEATURE_BROWSER_EMULATION for the current user.
        /// Best effort: a failure is reported on stderr and otherwise ignored.
        /// </summary>
        static void CheckAndSetBrowserEmulation()
        {
            try
            {
                string keyName = @"SOFTWARE\Microsoft\Internet Explorer\MAIN\FeatureControl\FEATURE_BROWSER_EMULATION";
                // CreateSubKey opens the key for writing and creates it when missing;
                // OpenSubKey would return null on machines where the key does not exist yet.
                using (var key = Registry.CurrentUser.CreateSubKey(keyName))
                {
                    if (key == null)
                    {
                        return;
                    }

                    string valueName = Path.GetFileName(Application.ExecutablePath);
                    if (key.GetValue(valueName) == null)
                    {
                        key.SetValue(valueName, 11001, RegistryValueKind.DWord);
                    }
                }
            }
            catch (Exception ex)
            {
                // Rendering still works (with the default engine), so do not abort;
                // stderr keeps the diagnostic out of the HTML written to stdout.
                Console.Error.WriteLine("warning: could not set browser emulation: " + ex.Message);
            }
        }

        /// <summary>Delay tick: the post-load wait is over, dump the page.</summary>
        static void tmrGet_Tick(object sender, EventArgs e)
        {
            tmrGet.Stop();
            OutputHtml();
        }

        /// <summary>
        /// Writes the rendered &lt;html&gt; element to stdout as UTF-8 and asks the
        /// application to exit. Runs at most once; writes nothing if no document
        /// is available yet.
        /// </summary>
        static void OutputHtml()
        {
            if (outputDone)
            {
                return;
            }
            outputDone = true;

            tmrExit.Stop();
            tmrGet.Stop();

            // UTF-8 avoids garbled output for e.g. Korean text.
            Console.OutputEncoding = Encoding.UTF8;

            // browser.DocumentText does not reflect the body after scripts ran,
            // so the live DOM is serialised instead.
            string html = string.Empty;
            HtmlDocument document = browser.Document;
            if (document != null)
            {
                HtmlElementCollection roots = document.GetElementsByTagName("html");
                if (roots.Count > 0)
                {
                    html = roots[0].OuterHtml;
                }
            }

            Console.Write(html);
            Application.Exit();
        }

        /// <summary>
        /// Injects <paramref name="jsCode"/> into the page as the body of a function
        /// named <c>_func</c> and invokes it. Does nothing if the page has no body.
        /// </summary>
        /// <param name="jsCode">JavaScript statements to run inside the page.</param>
        static void ExecJS(string jsCode)
        {
            HtmlDocument document = browser.Document;
            if (document == null || document.Body == null)
            {
                return;
            }

            HtmlElement script = document.CreateElement("script");
            script.SetAttribute("type", "text/javascript");
            script.SetAttribute("text", "function _func() {" + jsCode + "}");
            document.Body.AppendChild(script);
            document.InvokeScript("_func");
        }

        /// <summary>
        /// Runs when a document finished loading; acts only once the browser as a
        /// whole is complete and the event belongs to the top-level URL.
        /// </summary>
        static void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            if (browser.ReadyState != WebBrowserReadyState.Complete || e.Url != browser.Url)
            {
                return;
            }

            // Optional script injection.
            if (!string.IsNullOrEmpty(jsCode))
            {
                ExecJS(jsCode);
                // NOTE(review): this blocks the UI thread, so asynchronous page work
                // presumably cannot progress during the pause; use the delay argument
                // when the injected script needs time to take effect.
                System.Threading.Thread.Sleep(500);
            }

            if (delay == 0)
            {
                OutputHtml();
            }
            else
            {
                tmrGet.Start();
            }
        }
    }
}
如此處理,可能得到所需要的html代碼。
其在控制台輸出圖示效果:
並基於此思路,設計進程輸出管理器:
/// <summary>
/// Runs a hidden child process with redirected stdout/stderr, forwards each
/// received line to subscribers, and raises <see cref="AllDataReceived"/> once
/// with the complete stdout text after the process has exited and its
/// stdout stream has reached end-of-file.
/// </summary>
internal class ProcessOutputMgr : IDisposable
{
    // Per-instance lock guarding the buffer and the completion flags below.
    private readonly object syncObj = new object();
    private readonly Process process = new Process();
    private readonly StringBuilder allData = new StringBuilder();

    // Set when the stdout reader delivered its end-of-stream marker (null Data).
    private bool outputClosed = false;
    // Set when the Exited event was received.
    private bool processExited = false;
    // Set once AllDataReceived has been (or is about to be) raised.
    private bool exitedCalled = false;

    /// <summary>
    /// Prepares (but does not start) the child process.
    /// </summary>
    /// <param name="fileName">Executable to launch.</param>
    /// <param name="args">Command-line arguments passed to the executable.</param>
    public ProcessOutputMgr(string fileName, string args)
    {
        var startInfo = new ProcessStartInfo(fileName);
        startInfo.WindowStyle = ProcessWindowStyle.Hidden;
        startInfo.Arguments = args;
        startInfo.UseShellExecute = false;
        startInfo.CreateNoWindow = true;
        // crpj always writes UTF-8; decode both streams accordingly to avoid garbled text.
        startInfo.StandardOutputEncoding = Encoding.UTF8;
        startInfo.StandardErrorEncoding = Encoding.UTF8;
        startInfo.RedirectStandardOutput = true;
        startInfo.RedirectStandardError = true;

        process.StartInfo = startInfo;
        // Required, otherwise the Exited event is never raised.
        process.EnableRaisingEvents = true;
        process.Exited += process_Exited;
        process.OutputDataReceived += process_OutputDataReceived;
        process.ErrorDataReceived += process_ErrorDataReceived;
    }

    /// <summary>Raised for every stdout line (and once with null Data at end-of-stream).</summary>
    public event DataReceivedEventHandler OutputDataReceived;

    /// <summary>Raised for every stderr line (and once with null Data at end-of-stream).</summary>
    public event DataReceivedEventHandler ErrorDataReceived;

    /// <summary>
    /// Raised exactly once with all stdout lines (each terminated by a newline)
    /// after the process exited and stdout was fully read.
    /// </summary>
    public event Action<string> AllDataReceived;

    /// <summary>
    /// Starts the process and begins asynchronous reading of stdout and stderr.
    /// </summary>
    /// <returns>The value returned by <c>Process.Start()</c>.</returns>
    public bool Start()
    {
        bool result = process.Start();
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();
        return result;
    }

    /// <summary>Blocks until the process has exited.</summary>
    public void WaitForExit()
    {
        process.WaitForExit();
    }

    /// <summary>Blocks until the process has exited or the timeout elapsed.</summary>
    /// <param name="milliseconds">Maximum time to wait.</param>
    /// <returns>true if the process exited within the timeout.</returns>
    public bool WaitForExit(int milliseconds)
    {
        return process.WaitForExit(milliseconds);
    }

    /// <summary>Releases the underlying <see cref="Process"/> resources.</summary>
    public void Dispose()
    {
        process.Dispose();
    }

    // Exited may arrive before the asynchronous stdout reader has delivered its
    // last lines, so completion is only signalled once stdout hit end-of-stream too.
    private void process_Exited(object sender, EventArgs e)
    {
        string text = null;
        lock (syncObj)
        {
            processExited = true;
            if (outputClosed)
            {
                text = TakeAllData();
            }
        }
        RaiseAllDataReceived(text);
    }

    private void process_OutputDataReceived(object sender, DataReceivedEventArgs e)
    {
        string text = null;
        lock (syncObj)
        {
            var handler = OutputDataReceived;
            if (handler != null)
            {
                handler(sender, e);
            }

            if (e.Data != null)
            {
                allData.AppendLine(e.Data);
            }
            else
            {
                // Null Data marks the end of the stdout stream.
                outputClosed = true;
                if (processExited)
                {
                    text = TakeAllData();
                }
            }
        }
        RaiseAllDataReceived(text);
    }

    private void process_ErrorDataReceived(object sender, DataReceivedEventArgs e)
    {
        lock (syncObj)
        {
            var handler = ErrorDataReceived;
            if (handler != null)
            {
                handler(sender, e);
            }
        }
    }

    // Returns the buffered text the first time it is called, null afterwards.
    // Must be called while holding syncObj.
    private string TakeAllData()
    {
        if (exitedCalled)
        {
            return null;
        }
        exitedCalled = true;
        return allData.ToString();
    }

    // Raises AllDataReceived outside the lock; a null text means "nothing to raise".
    private void RaiseAllDataReceived(string text)
    {
        if (text == null)
        {
            return;
        }
        var handler = AllDataReceived;
        if (handler != null)
        {
            handler(text);
        }
    }
}