爬一下国家统计局行政区划代码C#


目前NBS上有2015-2018四个年度的代码信息,写一个控制台程序爬一下县级行政区下的代码。

使用HttpWebRequest+HttpWebResponse获取html,使用HtmlAgilityPack类库解析HTML。

使用POST请求,请求头带Cookie信息,否则会被反爬机制挡死,返回“请开启JavaScript并刷新该页”。

县级URL Request获取数据的同时记录Response的Cookie信息,在请求镇级数据时,请求头发送此cookie。

“省-地-县-乡”与“省-县(地)-乡”两种层级的URL长度不同,根据长度判断URL正确性时需注意;也许还有其他可能,暂未发现。

主方法

  /// <summary>
  /// Console entry point of the crawler. It asks for a data year, opens the matching
  /// start page in the browser, reads the URL of a county-level page from the user and
  /// prints the town rows and village rows found beneath that page.
  /// </summary>
  class Program
  {
      // Cookie header sent with every request. It is passed by ref to
      // HttpGetHelper.GetHtml, which may replace it while loading the county-level page.
      // NOTE(review): hard-coded session value; presumably it expires - confirm before relying on it.
      static string cookies = "AD_RS_COOKIE=20082854; wzws_cid=453a2d88181321410de83ba7eedaba3a141eb61ee7488027b6ab07a66054605e99e886827afa72708ce170398ea2fdfeec55455a7c0be8e779694026255f2166";

      /// <summary>
      /// Runs the interactive session: year selection, URL input, crawling and output.
      /// </summary>
      /// <param name="args">Command-line arguments (unused).</param>
      static void Main(string[] args)
      {
          Console.ForegroundColor = ConsoleColor.Magenta;
          Console.WriteLine("\r\n----获取县级行政区乡、村二级区划代码");
          Console.WriteLine("----数据年份有:");
          Console.ResetColor();
          Cursor.WriteAt("A、2018", 2, 0);
          Cursor.WriteAt("B、2017", 12, 0);
          Cursor.WriteAt("C、2016", 2, 1);
          Cursor.WriteAt("D、2015", 12, 1);

          string year = PromptForYear();
          if (year == null)
          {
              // Standard input was closed; nothing more can be asked.
              return;
          }

          System.Diagnostics.Process.Start($"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{year}");
          Console.ForegroundColor = ConsoleColor.Magenta;
          Console.WriteLine("浏览器已加载区划代码起始页,请进入县级行政单位页面,复制url,粘贴到下面(回车提交):");
          Console.ResetColor();

          string cityurl = ReadTrimmedLine();
          // The length test is only a soft check with a single retry: the two known URL
          // layouts have these lengths, but other layouts may exist.
          if (cityurl.Length != 66 && cityurl.Length != 71)
          {
              Console.ForegroundColor = ConsoleColor.Magenta;
              Console.WriteLine("url有误,请确认是县级行政单位页面,重新复制链接,粘贴到下面:");
              Console.ResetColor();
              cityurl = ReadTrimmedLine();
          }

          try
          {
              Console.ForegroundColor = ConsoleColor.Magenta;
              List<TownInfo> towns = GetTownInfos(cityurl);
              if (towns.Count > 0)
              {
                  List<VillageInfo> rows = new List<VillageInfo>();
                  foreach (TownInfo town in towns)
                  {
                      // The town itself is stored as a village-shaped row so that it is
                      // printed in the same table, directly above its villages.
                      rows.Add(new VillageInfo(town.Code, "", town.Name));
                      if (!string.IsNullOrEmpty(town.Href))
                      {
                          rows.AddRange(GetVillageInfos(town.Href));
                      }
                  }

                  foreach (VillageInfo row in rows)
                  {
                      Console.WriteLine($"{row.Name.Trim()}\t{row.Cls.Trim()}\t{row.Code.Trim()}");
                  }
              }
              else
              {
                  Console.WriteLine("乡镇列表获取失败!");
              }
          }
          catch (Exception ex)
          {
              // Report the failure instead of replacing it with an empty exception,
              // so the user can see which request went wrong.
              Console.WriteLine(ex.Message);
              if (ex.InnerException != null)
              {
                  Console.WriteLine(ex.InnerException.Message);
              }
          }
          finally
          {
              Console.ResetColor();
          }

          Console.ReadKey();
      }

      /// <summary>
      /// Asks for a year code until the first character of the answer is a, b, c or d
      /// (case-insensitive).
      /// </summary>
      /// <returns>The selected year as text, or null when standard input is closed.</returns>
      static string PromptForYear()
      {
          while (true)
          {
              Console.ForegroundColor = ConsoleColor.Magenta;
              Console.WriteLine();
              Console.WriteLine("----请输入一个年份代码(回车提交):");
              Console.ResetColor();

              string line = Console.ReadLine();
              if (line == null)
              {
                  return null;
              }

              if (line.Length == 0)
              {
                  // Empty answer: ask again instead of indexing into an empty string.
                  continue;
              }

              switch (char.ToLowerInvariant(line[0]))
              {
                  case 'a':
                      return "2018";
                  case 'b':
                      return "2017";
                  case 'c':
                      return "2016";
                  case 'd':
                      return "2015";
              }
          }
      }

      /// <summary>
      /// Reads one line from the console with surrounding whitespace removed.
      /// </summary>
      /// <returns>The trimmed line; an empty string when standard input is closed.</returns>
      static string ReadTrimmedLine()
      {
          return (Console.ReadLine() ?? string.Empty).Trim();
      }

      /// <summary>
      /// Downloads a page with the crawler's fixed request settings (POST, code page 936)
      /// and parses it with HtmlAgilityPack.
      /// </summary>
      /// <param name="pageUrl">Address of the page to load.</param>
      /// <param name="cls">Page class forwarded to HttpGetHelper.GetHtml (1 or 2).</param>
      /// <returns>The parsed document.</returns>
      static HtmlDocument LoadDocument(string pageUrl, int cls)
      {
          HttpGetHelper httpGetHelper = new HttpGetHelper()
          {
              Url = pageUrl,
              ContentType = "text/html; charset=gb2312",
              Encode = Encoding.GetEncoding(936),
              RequestMethod = "post"
          };
          HtmlDocument document = new HtmlDocument();
          document.LoadHtml(httpGetHelper.GetHtml(cls, ref cookies));
          return document;
      }

      /// <summary>
      /// Gets the town rows listed on a county-level page.
      /// </summary>
      /// <param name="cityurl">The county-level page URL, as a string.</param>
      /// <returns>
      /// One entry per usable <c>tr.towntr</c> row; an empty list when the URL is empty or
      /// the page contains no such rows. A row without a link gets an empty Href.
      /// </returns>
      static List<TownInfo> GetTownInfos(object cityurl)
      {
          List<TownInfo> townInfos = new List<TownInfo>();
          string pageUrl = (string)cityurl;
          if (string.IsNullOrEmpty(pageUrl))
          {
              return townInfos;
          }

          HtmlDocument document = LoadDocument(pageUrl, 1);
          // XPath: a leading "//" matches descendants at any depth of the document,
          // "/" matches direct children only, and "./" starts at the current node.
          HtmlNodeCollection rows = document.DocumentNode.SelectNodes("//tr[@class='towntr']");
          if (rows == null)
          {
              // SelectNodes yields null (not an empty collection) when nothing matches.
              return townInfos;
          }

          // Links on the page are relative to the directory of the county-level page.
          string baseUrl = pageUrl.Substring(0, pageUrl.LastIndexOf('/') + 1);
          foreach (HtmlNode row in rows)
          {
              HtmlNodeCollection cells = row.SelectNodes("./td");
              if (cells == null || cells.Count < 2)
              {
                  continue;
              }

              HtmlNode link = row.SelectSingleNode(".//a[@href]");
              string href = link == null ? string.Empty : baseUrl + link.Attributes["href"].Value;
              townInfos.Add(new TownInfo(cells[0].InnerText, cells[1].InnerText, href));
          }
          return townInfos;
      }

      /// <summary>
      /// Gets the village rows listed on a town-level page.
      /// </summary>
      /// <param name="townurl">The town-level page URL, as a string.</param>
      /// <returns>
      /// One entry per usable <c>tr.villagetr</c> row (code, class, name); an empty list
      /// when the URL is empty or the page contains no such rows.
      /// </returns>
      static List<VillageInfo> GetVillageInfos(object townurl)
      {
          List<VillageInfo> villageInfos = new List<VillageInfo>();
          string pageUrl = (string)townurl;
          if (string.IsNullOrEmpty(pageUrl))
          {
              return villageInfos;
          }

          HtmlDocument document = LoadDocument(pageUrl, 2);
          HtmlNodeCollection rows = document.DocumentNode.SelectNodes("//tr[@class='villagetr']");
          if (rows == null)
          {
              return villageInfos;
          }

          foreach (HtmlNode row in rows)
          {
              HtmlNodeCollection cells = row.SelectNodes(".//td");
              if (cells == null || cells.Count < 3)
              {
                  continue;
              }

              villageInfos.Add(new VillageInfo(cells[0].InnerText, cells[1].InnerText, cells[2].InnerText));
          }
          return villageInfos;
      }
  }

 

辅助类/结构

 

 /// <summary>
 /// Console helper that writes text at a position given relative to a fixed origin
 /// (column 0, row 3).
 /// </summary>
 internal class Cursor
 {
     const int BaseLeft = 0;
     const int BaseTop = 3;

     /// <summary>
     /// Moves the console cursor to the offset position and writes the text there.
     /// </summary>
     /// <param name="s">Text to write.</param>
     /// <param name="c">Column offset from the origin column.</param>
     /// <param name="r">Row offset from the origin row.</param>
     public static void WriteAt(string s, int c, int r)
     {
         int left = BaseLeft + c;
         int top = BaseTop + r;
         Console.SetCursorPosition(left, top);
         Console.Write(s);
     }
 }
/// <summary>
/// Town record: code, name and hyperlink.
/// </summary>
struct TownInfo
{
    /// <summary>Code passed to the constructor.</summary>
    public string Code { get; }

    /// <summary>Name passed to the constructor.</summary>
    public string Name { get; }

    /// <summary>Hyperlink passed to the constructor.</summary>
    public string Href { get; }

    /// <summary>Creates a town record from its three values.</summary>
    public TownInfo(string code, string name, string href)
    {
        Code = code;
        Name = name;
        Href = href;
    }
}
/// <summary>
/// Village record: code, urban-rural classification and name.
/// </summary>
struct VillageInfo
{
    /// <summary>Code passed to the constructor.</summary>
    public string Code { get; }

    /// <summary>Urban-rural classification passed to the constructor.</summary>
    public string Cls { get; }

    /// <summary>Name passed to the constructor.</summary>
    public string Name { get; }

    /// <summary>Creates a village record from its three values.</summary>
    public VillageInfo(string code, string cls, string name)
    {
        Code = code;
        Cls = cls;
        Name = name;
    }
}

 

获取HTML

 /// <summary>
 /// Small wrapper around HttpWebRequest that downloads a page as text.
 /// All settings are write-only properties; defaults are a 10 second timeout,
 /// UTF-8 decoding, a desktop Chrome user agent and the "get" method.
 /// </summary>
 public class HttpGetHelper
 {
     string url = string.Empty;
     /// <summary>Address to request. When null or empty, GetHtml returns an empty string.</summary>
     public string Url
     {
         set { url = value; }
     }

     int timeOut = 10 * 1000;
     /// <summary>Request timeout in milliseconds.</summary>
     public int Timeout
     {
         set { timeOut = value; }
     }

     string contentType = "text/html;charset=utf-8";
     /// <summary>Value of the Content-Type request header.</summary>
     public string ContentType
     {
         set { contentType = value; }
     }

     string userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 ";
     /// <summary>Value of the User-Agent request header.</summary>
     public string UserAgent
     {
         set { userAgent = value; }
     }

     Encoding encode = Encoding.UTF8;
     /// <summary>Encoding used to decode the response body.</summary>
     public Encoding Encode
     {
         set { encode = value; }
     }

     string request_Method = "get";
     /// <summary>HTTP method assigned to the request.</summary>
     public string RequestMethod
     {
         set { request_Method = value; }
     }

     /// <summary>
     /// Formats every cookie of a collection as a Cookie request-header value
     /// ("name=value; name=value").
     /// </summary>
     /// <param name="responseCookies">Cookies to format; may be null.</param>
     /// <returns>The header value, or an empty string when there are no cookies.</returns>
     public static string BuildCookieHeader(CookieCollection responseCookies)
     {
         StringBuilder header = new StringBuilder();
         if (responseCookies != null)
         {
             foreach (Cookie item in responseCookies)
             {
                 if (header.Length > 0)
                 {
                     header.Append("; ");
                 }
                 header.Append(item.Name).Append('=').Append(item.Value);
             }
         }
         return header.ToString();
     }

     /// <summary>
     /// Requests the configured URL and returns the response body as text.
     /// </summary>
     /// <param name="cls">
     /// Page class: 1 for the page whose response cookies should be recorded, 2 otherwise.
     /// </param>
     /// <param name="cookies">
     /// Cookie header value sent with the request. When <paramref name="cls"/> is 1 and the
     /// response carries cookies, it is replaced by all of those cookies.
     /// </param>
     /// <returns>
     /// The decoded body; an empty string when no URL is configured or the status is not 200 OK.
     /// </returns>
     /// <exception cref="Exception">
     /// Any failure while requesting or reading; the original error is the inner exception.
     /// </exception>
     public string GetHtml(int cls, ref string cookies)
     {
         if (string.IsNullOrEmpty(url))
         {
             return string.Empty;
         }

         try
         {
             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
             request.Timeout = this.timeOut;
             request.ContentType = this.contentType;
             request.UserAgent = this.userAgent;
             if (!string.IsNullOrEmpty(cookies))
             {
                 request.Headers.Add(HttpRequestHeader.Cookie, cookies);
             }
             request.Method = request_Method;

             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
             {
                 if (response.StatusCode != HttpStatusCode.OK)
                 {
                     return string.Empty;
                 }

                 if (cls == 1)
                 {
                     // Keep ALL response cookies (the previous loop kept only the last one)
                     // and leave the caller's value untouched when none were returned.
                     // NOTE(review): per the HttpWebResponse.Cookies documentation the
                     // collection is only filled when request.CookieContainer is assigned,
                     // which this method does not do - confirm whether this ever fires.
                     string received = BuildCookieHeader(response.Cookies);
                     if (received.Length > 0)
                     {
                         cookies = received;
                     }
                 }

                 using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), encode))
                 {
                     return streamReader.ReadToEnd();
                 }
             }
         }
         catch (Exception ex)
         {
             // Same exception type and message as before, but the cause is preserved.
             throw new Exception($"GetHtml失败,url:{url}", ex);
         }
     }
 }

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM