阿里雲提供的地理信息接口
https://datav.aliyun.com/tools/atlas/
有兩個接口, 一個是[行政編碼].json, 一個是[行政編碼]_full.json, 從接口中可以提取到區縣一級的行政區劃信息. 提取的過程中遇到的一些問題:
- 從[行政編碼].json中讀取的信息中, 可能parent = null, 出現這種情況的大都是一些撤縣改區的節點, 要將其設為上一級節點的行政編碼
- 從[行政編碼].json中讀到的parent的adcode, 可能與[父節點行政編碼]_full.json中讀到的parent的adcode不一致, 例如從110000_full.json中得到的節點列表, 其parent都是110000, 但是在取其子節點110101.json時會發現, parent變成了110100, 這時候要使用110100這個行政編碼
- 因為從上至下遍歷時, 是不會遇到110100這個節點的, 所以在遍歷的過程中, 要檢查是否出現了未知的行政編碼, 如果有, 需要額外讀取並入庫
- 有部分節點, 其json無法讀取(不存在), 例如密雲110118.json, 延慶110119.json, 這時候要用前一步得到的信息入庫
使用生成的行政區劃數據時, 對於香港澳門的數據, 因為沒有level=city的這一級, 所以需要特殊處理一下, 例如在讀取province這一級的子節點時, 如果發現沒有level=city的節點, 那麼就返回一個虛擬的節點, 這個節點各字段值和自己一樣, 但是level=city.
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import json
import traceback
import rbcommon
# Base URL of the boundary endpoints, e.g.
# https://geo.datav.aliyun.com/areas/bound/140000.json
_BOUND_URL = 'https://geo.datav.aliyun.com/areas/bound/'

# Maps the API's textual level to the numeric level stored with each region.
_LEVEL_CODES = {'country': 0, 'province': 1, 'city': 2, 'district': 3}


def _fetchJson(url, verbose=True):
    """Download *url* and return the parsed JSON, or None if it is unavailable."""
    if verbose:
        print(url)
    echo = rbcommon.requestGet(url, 'UTF-8', 20, 10)
    if echo is None:
        print('URL request failed: ' + url)
        return None
    if echo.startswith('<?'):
        # A body starting with '<?' is not JSON; it is reported as "not found".
        print('Not found: ' + url)
        return None
    return json.loads(echo)


def _parentAdcode(props):
    """Return the parent adcode from a feature's properties as str, or None."""
    parent = props.get('parent')
    if parent is None:
        return None
    code = parent.get('adcode')
    return None if code is None else str(code)


def _setLevel(target, level):
    """Store the numeric level on *target*; unknown levels leave it unset."""
    if level in _LEVEL_CODES:
        target['level'] = _LEVEL_CODES[level]


def readRegion(adcode, parent_code = None):
    """Read one region and the list of its direct sub regions.

    Two endpoints are queried: ``<adcode>.json`` for the region itself and
    ``<adcode>_full.json`` for its sub regions.

    Args:
        adcode: administrative code of the region to read.
        parent_code: adcode used as the parent when the region's own JSON
            has no (or a null) ``parent`` entry.

    Returns:
        None when ``<adcode>.json`` cannot be read. Otherwise a dict with the
        keys ``name``, ``adcode``, ``telecode``, ``level`` (only for known
        levels), ``parent`` and ``children``. ``children`` is a list of dicts
        with the same keys except ``children``, whose ``telecode`` is always
        None; it is empty when ``<adcode>_full.json`` cannot be read.
        All adcodes in the result are strings.
    """
    # Normalise once so URL building and every comparison below use strings.
    adcode = str(adcode)

    json_obj = _fetchJson(_BOUND_URL + adcode + '.json')
    if json_obj is None:
        return None

    props = json_obj['features'][0]['properties']
    region = {}
    region['name'] = props['name']
    # Cast like the sub regions below; otherwise the self-reference check in
    # the loop compares a str against whatever type the API returned.
    region['adcode'] = str(props['adcode'])
    region['telecode'] = props['telecode']
    _setLevel(region, props['level'])
    own_parent = _parentAdcode(props)
    region['parent'] = parent_code if own_parent is None else own_parent

    # read sub regions
    sub_regions = []
    region['children'] = sub_regions
    json_obj = _fetchJson(_BOUND_URL + adcode + '_full.json')
    if json_obj is None:
        return region

    for sub_obj in json_obj['features']:
        sub_props = sub_obj['properties']
        sub_region = {}
        sub_region['adcode'] = str(sub_props['adcode'])
        if sub_region['adcode'] == region['adcode']:
            # The listing may contain the region itself; it is not a child.
            continue
        sub_region['name'] = sub_props['name']
        sub_region['telecode'] = None
        _setLevel(sub_region, sub_props['level'])
        sub_region['parent'] = adcode
        sub_regions.append(sub_region)

    # further check if the parent adcode is correct: the first child's own
    # JSON may name a different parent than the region that listed it.
    if sub_regions:
        json_obj = _fetchJson(
            _BOUND_URL + sub_regions[0]['adcode'] + '.json', verbose=False)
        if json_obj is not None:
            dummy_parent = _parentAdcode(json_obj['features'][0]['properties'])
            if dummy_parent is not None and dummy_parent != sub_regions[0]['parent']:
                print('Update parent from {} to {}'.format(
                    sub_regions[0]['parent'], dummy_parent))
                for sub_region in sub_regions:
                    sub_region['parent'] = dummy_parent
    return region
def readAllRegion(parent_region):
    """Recursively read a region and all its descendants and store them.

    Args:
        parent_region: dict with at least ``adcode`` and ``parent``. If the
            region's own JSON cannot be read, this dict itself is stored, so
            in that case it must also carry the fields ``insert`` needs.

    Every stored adcode is recorded in the module-level ``regions`` set.
    Adcodes are normalised to ``str`` before they are added to or looked up
    in that set, and before being passed on to ``readRegion``, so values the
    API returned as numbers cannot cause a false "unknown parent" result or
    break URL building.
    """
    region = readRegion(parent_region['adcode'], parent_region['parent'])
    if region is None:
        # The node has no readable JSON of its own: fall back to the data
        # obtained from the parent's listing.
        regions.add(str(parent_region['adcode']))
        insert(parent_region)
        return

    parent_code = region['parent']
    if parent_code is not None and str(parent_code) not in regions:
        # The reported parent was never visited by the top-down traversal,
        # so read and store it separately.
        new_region = readRegion(str(parent_code), parent_region['parent'])
        if new_region is not None:
            regions.add(str(new_region['adcode']))
            insert(new_region)

    regions.add(str(region['adcode']))
    insert(region)
    for sub_region in region['children']:
        readAllRegion(sub_region)
def insert(region):
    """Store one region as a row of the `s_region` table.

    Args:
        region: dict providing ``adcode``, ``level``, ``name`` and
            ``telecode``; ``parent`` is optional and stored as NULL when
            the key is absent.

    The statement is INSERT IGNORE and is committed right away. Any exception
    is caught: the region is printed as JSON together with the traceback and
    the function returns normally.
    """
    statement = (
        'INSERT IGNORE INTO `s_region` (`id`, `parent_id`, `level`, `name`, `tele_code`, `short_name`, '
        '`full_name`) VALUES (%s, %s, %s, %s, %s, %s, %s)'
    )
    try:
        with rbcommon.mysqlclient.cursor() as db_cursor:
            row = (
                region['adcode'],
                region.get('parent'),
                region['level'],
                region['name'],
                region['telecode'],
                # short_name is filled with the plain name as well
                region['name'],
                # full_name is stored as the literal placeholder '{}'
                '{}',
            )
            db_cursor.execute(statement, row)
        rbcommon.mysqlclient.commit()
    except Exception:
        print(json.dumps(region))
        traceback.print_exc()
### MAIN ###
# Adcodes that have already been stored; shared with readAllRegion().
regions = set()
# Start the traversal at adcode 100000.
region = readRegion('100000')
if region is None:
    # readRegion() returns None when the request fails; passing that on to
    # readAllRegion() would raise a TypeError instead of reporting the cause.
    print('Failed to read root region 100000, nothing imported')
else:
    readAllRegion(region)
其中rbcommon.mysqlclient的初始化方法如下:
# Shared MySQL connection, exposed as rbcommon.mysqlclient and used by insert().
# All connection settings are read from the 'mysql' section of cfg
# (cfg is defined outside this snippet).
# cursorclass=DictCursor selects pymysql's dict-returning cursor class.
mysqlclient = pymysql.connect(
host=cfg['mysql']['host'],
port=cfg['mysql']['port'],
user=cfg['mysql']['user'],
password=cfg['mysql']['password'],
db=cfg['mysql']['db'],
charset=cfg['mysql']['charset'],
cursorclass=pymysql.cursors.DictCursor)
