/// <summary>
/// Static helpers that download pages from zhaopin.com and extract
/// "link text -> URL" tables from them using NSoup.
/// </summary>
class Crawler_Method
{
    /// <summary>Page requested by <see cref="GETCity"/>.</summary>
    private const string CityMapUrl = "https://www.zhaopin.com/citymap.html";

    /// <summary>
    /// Browser identification sent with every request so the request is
    /// less likely to be rejected by anti-crawler checks.
    /// </summary>
    private const string BrowserUserAgent =
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0";

    /// <summary>
    /// Downloads the city map page and parses it with <see cref="Parse1"/>.
    /// </summary>
    /// <returns>Map from anchor text to absolute https URL.</returns>
    public static Dictionary<String, String> GETCity()
    {
        String html = GET(CityMapUrl);
        return Parse1(html);
    }

    /// <summary>
    /// Collects the anchors inside the first element with class "col1".
    /// Only anchors whose href, after dropping its first two characters,
    /// starts with "www" are kept; the stored URL is "https://" plus that
    /// remainder.
    /// </summary>
    /// <param name="html">Page source to parse.</param>
    /// <returns>
    /// Map from anchor text to URL. Empty when the input is null/empty or
    /// the page has no "col1" element. When the same text occurs more than
    /// once, the first occurrence is kept.
    /// </returns>
    public static Dictionary<String, String> Parse1(String html)
    {
        Dictionary<String, String> map = new Dictionary<String, String>();
        if (String.IsNullOrEmpty(html))
        {
            return map;
        }

        Document doc = NSoup.NSoupClient.Parse(html);
        Elements containers = doc.GetElementsByClass("col1");
        if (containers.Count == 0)
        {
            // Page layout differs from what is expected; nothing to extract.
            return map;
        }

        foreach (Element anchor in containers[0].Select("a"))
        {
            String href = anchor.Attr("href");
            if (href == null || href.Length <= 2)
            {
                // Too short to carry a two-character prefix plus a host.
                continue;
            }

            // Drop the two leading characters (presumably the "//" of a
            // protocol-relative link -- confirm against the live page).
            String hostAndPath = href.Substring(2);
            if (hostAndPath.StartsWith("www", StringComparison.Ordinal))
            {
                AddIfAbsent(map, anchor.Text(), "https://" + hostAndPath);
            }
        }

        return map;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and parses it with <see cref="Parse2"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>Map from anchor text to the anchor's href value.</returns>
    public static Dictionary<String, String> GetCompany(String url)
    {
        String html = GET(url);
        return Parse2(html);
    }

    /// <summary>
    /// Collects the first anchor of every "li" inside the first element with
    /// class "nctt". The href is stored exactly as it appears in the page.
    /// </summary>
    /// <param name="html">Page source to parse.</param>
    /// <returns>
    /// Map from anchor text to href. Empty when the input is null/empty or
    /// the page has no "nctt" element. List items without an anchor are
    /// skipped; when the same text occurs more than once, the first
    /// occurrence is kept.
    /// </returns>
    public static Dictionary<String, String> Parse2(String html)
    {
        Dictionary<String, String> map = new Dictionary<String, String>();
        if (String.IsNullOrEmpty(html))
        {
            return map;
        }

        Document doc = NSoup.NSoupClient.Parse(html);
        Elements containers = doc.GetElementsByClass("nctt");
        if (containers.Count == 0)
        {
            return map;
        }

        foreach (Element item in containers[0].Select("li"))
        {
            Elements anchors = item.Select("a");
            if (anchors.Count == 0)
            {
                continue;
            }

            Element link = anchors[0];
            AddIfAbsent(map, link.Text(), link.Attr("href"));
        }

        return map;
    }

    /// <summary>
    /// Issues an HTTP GET (redirects allowed, browser-like User-Agent) and
    /// returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="share_url">Address to request.</param>
    /// <returns>The full response body, line breaks included.</returns>
    public static String GET(String share_url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(share_url);
        request.Method = "GET";
        request.AllowAutoRedirect = true;
        request.UserAgent = BrowserUserAgent;

        // Dispose response, stream and reader even when reading throws, so
        // the underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
        {
            // Read the body in one call: gluing lines together without a
            // separator can fuse markup tokens that were split across lines.
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Adds the pair unless the key is null or already present, so repeated
    /// link texts do not abort the scrape with an ArgumentException.
    /// </summary>
    private static void AddIfAbsent(Dictionary<String, String> map, String key, String value)
    {
        if (key == null || map.ContainsKey(key))
        {
            return;
        }

        map.Add(key, value);
    }
}
