C#使用PhantomJS,爬取AJAX加載完成之後的頁面


1、開發思路:根據入參(站點ID、類型ID)讀取apiSetting配置文件,確定靜態文件的存儲地址,從而可以為不同站點生成靜態頁。靜態頁由無頭瀏覽器(PhantomJS)渲染生成,生成之後的HTML字符串再用正則把資源路徑替換為固定地址,以便在本地正常訪問。

2、已發現問題:如果js在載入頁面時進行某些重寫dom操作,已用正則替換掉的動態路徑代碼,會被覆蓋,導致本地訪問無效。 這一點只能是站點開發那邊重新對頁面進行優化,從而避免這種情況。 但是這僅影響本地情況,如果靜態頁面部署到服務器,使用相對路徑其實也不會影響。

 

using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;

namespace QuartZNetService.Controllers
{
    public class BuildStaticController : Controller
    {
        /// <summary>
        /// Full path of the apiSetting.json configuration file, located in the
        /// application's base directory. Built with <see cref="Path.Combine(string, string)"/>
        /// so the result is valid whether or not the base directory ends with a separator.
        /// </summary>
        public static string jsonUrl = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "apiSetting.json");
        
        /// <summary>
        /// Per-site HTTP request options. A new instance starts with the
        /// browser-like defaults assigned in the constructor.
        /// </summary>
        public class HttpConfig
        {
            /// <summary>
            /// Creates a configuration pre-filled with default values.
            /// </summary>
            public HttpConfig()
            {
                // Identity / negotiation headers.
                UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36";
                Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                AcceptEncoding = "gzip,deflate";

                // Content description.
                ContentType = "text/html; charset=" + Encoding.UTF8.WebName;
                CharacterSet = "UTF-8";

                // Connection behaviour.
                Timeout = 100000;
                KeepAlive = true;
                GZipCompress = false;
            }

            /// <summary>
            /// Cookie information for the site.
            /// </summary>
            public string Cookie { get; set; }

            /// <summary>
            /// Referer of the page.
            /// </summary>
            public string Referer { get; set; }

            /// <summary>
            /// Content type; defaults to text/html with a UTF-8 charset.
            /// </summary>
            public string ContentType { get; set; }

            /// <summary>
            /// Value of the Accept header.
            /// </summary>
            public string Accept { get; set; }

            /// <summary>
            /// Value of the Accept-Encoding header.
            /// </summary>
            public string AcceptEncoding { get; set; }

            /// <summary>
            /// Timeout in milliseconds; defaults to 100000.
            /// </summary>
            public int Timeout { get; set; }

            /// <summary>
            /// Value of the User-Agent header.
            /// </summary>
            public string UserAgent { get; set; }

            /// <summary>
            /// Whether the data of a POST request is gzip-compressed.
            /// </summary>
            public bool GZipCompress { get; set; }

            /// <summary>
            /// Keep-alive flag; defaults to true.
            /// </summary>
            public bool KeepAlive { get; set; }

            /// <summary>
            /// Character set name; defaults to UTF-8.
            /// </summary>
            public string CharacterSet { get; set; }
        }

        /// <summary>
        /// Origin inserted in front of script sources that are not already absolute.
        /// </summary>
        private const string SiteOrigin = "https://www.xxx.cn";

        /// <summary>
        /// Matches a script tag that carries a double-quoted src attribute;
        /// group 1 is the attribute value.
        /// </summary>
        private static readonly Regex ScriptSrcRegex =
            new Regex("<script[^>]*?src=\"([^>]*?)\"[^>]*?>", RegexOptions.IgnoreCase);

        /// <summary>
        /// Renders a page with PhantomJS (so that AJAX-loaded content is present),
        /// rewrites resource URLs to absolute ones and saves the result as a static file.
        /// The script waits <paramref name="interval"/> milliseconds after the page has
        /// loaded before capturing it, to give AJAX requests time to finish.
        /// </summary>
        /// <param name="url">Address of the page to render.</param>
        /// <param name="sitId">Site ID; selects the storage root from apiSetting.json.</param>
        /// <param name="typeId">Type ID; selects the storage folder from apiSetting.json (the folder may be empty).</param>
        /// <param name="fileName">Name of the file to write; must resolve inside the configured folder.</param>
        /// <param name="config">Request options; only UserAgent, Accept and Referer are forwarded to the script. Defaults are used when null.</param>
        /// <param name="interval">Delay in milliseconds handed to the script (default 3000).</param>
        /// <returns>JSON <c>true</c> on success, otherwise the JSON-encoded error message.</returns>
        public JsonResult Do(string url, string sitId, string typeId, string fileName, HttpConfig config, int interval = 3000)
        {
            try
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    throw new ArgumentException("url is required.", "url");
                }
                if (string.IsNullOrWhiteSpace(fileName))
                {
                    throw new ArgumentException("fileName is required.", "fileName");
                }
                config = config ?? new HttpConfig();

                JObject location = (JObject)JsonConvert.DeserializeObject(Readjson(sitId, typeId));
                string sitUrl = location["url"].ToString();
                string folder = location["folder"].ToString();
                if (sitUrl.Length == 0)
                {
                    // Without a configured root the file would end up relative to the
                    // process working directory, which is never what the caller wants.
                    throw new InvalidOperationException("No storage location is configured for sitId '" + sitId + "'.");
                }

                // Validate the destination before spending time on rendering.
                string targetPath = ResolveTargetPath(sitUrl, folder, fileName);

                string html = RewriteResourceUrls(RenderPage(url, interval, config));

                using (StreamWriter writer = new StreamWriter(targetPath, false, Encoding.UTF8))
                {
                    writer.Write(html);
                }
                return Json(true, JsonRequestBehavior.AllowGet);
            }
            catch (Exception ex)
            {
                return Json(ex.Message, JsonRequestBehavior.AllowGet);
            }
        }

        /// <summary>
        /// Runs webTools\phantomjs.exe with webTools\codes.js and returns everything the
        /// script wrote to standard output.
        /// </summary>
        private static string RenderPage(string url, int interval, HttpConfig config)
        {
            string toolsDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "webTools");

            ProcessStartInfo startInfo = new ProcessStartInfo(Path.Combine(toolsDir, "phantomjs.exe"));
            startInfo.WorkingDirectory = toolsDir;
            // Every value is quoted individually: the user agent contains spaces and
            // url/referer come from the request, so unquoted concatenation would shift
            // or inject script arguments.
            startInfo.Arguments = string.Join(" ", new[]
            {
                QuoteArgument(Path.Combine(toolsDir, "codes.js")),
                QuoteArgument(url),
                interval.ToString(System.Globalization.CultureInfo.InvariantCulture),
                QuoteArgument(config.UserAgent),
                QuoteArgument(config.Accept),
                QuoteArgument(config.Referer)
            });
            startInfo.CreateNoWindow = true;            // no console window
            startInfo.UseShellExecute = false;          // required for stream redirection
            startInfo.RedirectStandardOutput = true;
            startInfo.RedirectStandardInput = true;
            startInfo.StandardOutputEncoding = Encoding.UTF8;

            using (Process process = Process.Start(startInfo))
            {
                // Drain stdout before waiting, otherwise a full pipe blocks the child.
                string output = process.StandardOutput.ReadToEnd();
                process.WaitForExit();

                // codes.js prints this marker when the page could not be loaded;
                // it must not be stored as if it were the rendered page.
                if (process.ExitCode != 0 || output.Trim() == "Unable to post!")
                {
                    throw new InvalidOperationException("PhantomJS could not load '" + url + "'.");
                }
                return output;
            }
        }

        /// <summary>
        /// Prefixes non-absolute script sources with <see cref="SiteOrigin"/> and turns
        /// protocol-relative references into https ones.
        /// </summary>
        private static string RewriteResourceUrls(string html)
        {
            string prefixed = ScriptSrcRegex.Replace(html, tag =>
            {
                Group src = tag.Groups[1];
                bool alreadyOnSite = tag.Value.IndexOf("xxx.cn", StringComparison.Ordinal) != -1;
                if (alreadyOnSite || IsAbsoluteUrl(src.Value))
                {
                    // Absolute and protocol-relative URLs must stay intact; gluing the
                    // site origin in front of them would produce an invalid address.
                    return tag.Value;
                }
                // Insert right where the attribute value starts. Using the group index
                // keeps this correct even when the attribute is written as SRC="...".
                return tag.Value.Insert(src.Index - tag.Index, SiteOrigin);
            });

            StringBuilder result = new StringBuilder(prefixed);
            result.Replace("src=\"//", "src=\"https://");
            result.Replace("href=\"//", "href=\"https://");
            result.Replace("\"//images", "\"https://images");
            return result.ToString();
        }

        /// <summary>
        /// Tells whether a src value already names a host (http, https or protocol-relative).
        /// </summary>
        private static bool IsAbsoluteUrl(string src)
        {
            return src.StartsWith("//", StringComparison.Ordinal)
                || src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || src.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Combines storage root, folder and file name and rejects any file name that
        /// would escape the configured folder (for example via "..\" or a rooted path).
        /// </summary>
        private static string ResolveTargetPath(string sitUrl, string folder, string fileName)
        {
            string root = Path.GetFullPath(Path.Combine(sitUrl, folder));
            string target = Path.GetFullPath(Path.Combine(root, fileName));

            string rootWithSeparator =
                root.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar) + Path.DirectorySeparatorChar;
            if (!target.StartsWith(rootWithSeparator, StringComparison.OrdinalIgnoreCase))
            {
                throw new ArgumentException("fileName must resolve to a path inside the configured output folder.", "fileName");
            }
            return target;
        }

        /// <summary>
        /// Quotes one value for a Windows command line: the value is wrapped in double
        /// quotes, embedded quotes are escaped, and backslashes that precede a quote are
        /// doubled. Null or empty becomes an explicit empty argument so that the
        /// following arguments keep their positions.
        /// </summary>
        private static string QuoteArgument(string value)
        {
            if (string.IsNullOrEmpty(value))
            {
                return "\"\"";
            }

            StringBuilder quoted = new StringBuilder(value.Length + 2);
            quoted.Append('"');
            int pendingBackslashes = 0;
            foreach (char c in value)
            {
                if (c == '\\')
                {
                    pendingBackslashes++;
                    continue;
                }
                if (c == '"')
                {
                    // Backslashes directly before a quote are doubled, plus one to escape the quote.
                    quoted.Append('\\', pendingBackslashes * 2 + 1);
                }
                else
                {
                    quoted.Append('\\', pendingBackslashes);
                }
                quoted.Append(c);
                pendingBackslashes = 0;
            }
            // Trailing backslashes precede the closing quote, so they are doubled as well.
            quoted.Append('\\', pendingBackslashes * 2);
            quoted.Append('"');
            return quoted.ToString();
        }

        /// <summary>
        /// Reads the configuration file at <see cref="jsonUrl"/> and looks up the storage
        /// root for a site and the folder name for a type.
        /// </summary>
        /// <param name="sitId">Value compared with the "sitId" of each entry under "sit".</param>
        /// <param name="typeId">Value compared with the "typeId" of each entry under "type".</param>
        /// <returns>
        /// A JSON object string with the properties "url" and "folder"; a property is an
        /// empty string when no entry matched.
        /// </returns>
        public static string Readjson(string sitId, string typeId)
        {
            string siteRoot;
            string folderName;

            using (System.IO.StreamReader configFile = System.IO.File.OpenText(jsonUrl))
            using (JsonTextReader jsonReader = new JsonTextReader(configFile))
            {
                JObject root = (JObject)JToken.ReadFrom(jsonReader);

                // Storage root of the site.
                siteRoot = LastMatchingValue(root["sit"], "sitId", sitId, "sitUrl");
                // Folder name of the type; stays empty when nothing matches.
                folderName = LastMatchingValue(root["type"], "typeId", typeId, "folder");
            }

            return JsonConvert.SerializeObject(new
            {
                url = siteRoot,
                folder = folderName
            });
        }

        /// <summary>
        /// Walks all entries and returns the <paramref name="valueKey"/> of the last one
        /// whose <paramref name="idKey"/> equals <paramref name="idValue"/>, or an empty
        /// string when none does.
        /// </summary>
        private static string LastMatchingValue(JToken entries, string idKey, string idValue, string valueKey)
        {
            string result = "";
            foreach (JObject entry in entries)
            {
                if (entry[idKey].ToString() != idValue)
                {
                    continue;
                }
                // No early exit: a later duplicate overrides an earlier one.
                result = entry[valueKey].ToString();
            }
            return result;
        }
    }
}

 

codes.js 配置 

// PhantomJS script: loads a page, waits for its AJAX content, then prints the
// rendered HTML to standard output.
//
// Arguments (system.args):
//   [1] url        page to load
//   [2] interval   milliseconds to wait after the load finished before capturing
//   [3] userAgent  optional User-Agent value
//   [4] accept     optional Accept header value
//   [5] referer    optional Referer header value
//
// On failure the script prints 'Unable to post!' and exits with code 1.
var page = require('webpage').create(),
    system = require('system');

// Set the output encoding up front so every console.log below uses it.
phantom.outputEncoding = "UTF-8";

var url = system.args[1];

// Command-line arguments arrive as strings; fall back to 3 seconds when the
// value is missing or not a usable number.
var interval = parseInt(system.args[2], 10);
if (isNaN(interval) || interval < 0) {
    interval = 3000;
}

var userAgent = system.args[3];
var accept = system.args[4];
var referer = system.args[5];

var headers = {
    "Accept-Language": "zh-CN,en;q=0.7,en-US;q=0.3",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
};
// Optional headers are only sent when a value was actually supplied.
if (userAgent) {
    headers["User-Agent"] = userAgent;
    // page.settings is the documented place for the user agent.
    page.settings.userAgent = userAgent;
}
if (accept) {
    headers["Accept"] = accept;
}
if (referer) {
    headers["Referer"] = referer;
}

// Request options are handed to page.open() itself; page.settings is left
// with its defaults apart from the user agent set above.
var requestSettings = {
    operation: "GET",
    encoding: "UTF-8",
    headers: headers
};

page.open(url, requestSettings, function (status) {
    if (status !== 'success') {
        console.log('Unable to post!');
        // Non-zero exit code lets the caller tell a failed load from a real page.
        phantom.exit(1);
        return;
    }
    // Give pending AJAX requests time to complete before taking the snapshot.
    setTimeout(function () {
        console.log(page.content);
        phantom.exit();
    }, interval);
});

 

apiSetting.json 配置

{
    "sit": [
        {
            "sitId": "1",
            "sitUrl": "D://"
        },
        {
            "sitId": "60",
            "sitUrl": "D://"
        }
    ],
    "type": [
    {
        "typeId": "1",
        "folder": "zmPC"
    },
    {
        "typeId": "60",
        "folder": "zmCP"    
    }
]
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM