之前做地址的命名實體識別時,因為麗江很多地名比較奇怪,不能直接用 pyltp 提取,所以準備添加自定義字典,以提高地址提取的準確率。
地址數據源:
http://poi.mapbar.com/lijiang/
這裡以麗江為例,其他地方的地名爬取原理是一樣的。
獲取地址分類:
// Current page: http://poi.mapbar.com/lijiang/901/
// CSS selector: every <a> inside an element with class "sortBox" (the category links)
$(".sortBox a")
// init(211) [a#520, a#530, a#541, a…]   (console output, truncated: 211 matched anchors)
$(".sortBox a")[0].innerHTML
// "超市"   (console output: text of the first matched link)
$(".sortBox a")[0].href
// "http://poi.mapbar.com/lijiang/520/"   (console output: href of the first matched link)
提取地名:
// Select all place-name anchors listed under one category
// CSS selector: every <a> inside an element with class "sortC"
$(".sortC a")
// init(328) [a, a, a, …]   (console output, abbreviated: 328 matched anchors)
// Read the text of a matched anchor
$(".sortC a")[0].innerHTML
// "愛尚里"
$(".sortC a")[1].innerHTML
// "八河"
$(".sortC a")[1].href
// "http://poi.mapbar.com/lijiang/MAPIJPHRCNHOFNHIJNTRC"
示例代碼:
import requests
from bs4 import BeautifulSoup
class AddressType:
    """One address category taken from the category link list.

    Attributes:
        type: Category display name; filled from the anchor text by
            get_address_type_url. Empty string until assigned.
        url: Link to the category's listing page; filled from the anchor's
            href by get_address_type_url. Empty string until assigned.
    """

    def __init__(self):
        self.type = ''
        self.url = ''

    def __repr__(self):
        # Readable form for debugging / printing lists of categories.
        return "AddressType(type=%r, url=%r)" % (self.type, self.url)
class Address:
    """Plain record for a single address entry.

    All fields start as empty strings. Nothing in the visible script fills
    this class in yet, so the field meanings below are inferred from their
    names only -- NOTE(review): confirm against the code that populates it.

    Attributes:
        name: Presumably the place name.
        type: Presumably the category the place belongs to.
        url: Presumably the link to the place's detail page.
        location: Presumably the place's location / street address.
        phone: Presumably a contact phone number.
    """

    def __init__(self):
        self.name = ''
        self.type = ''
        self.url = ''
        self.location = ''
        self.phone = ''

    def __repr__(self):
        # Readable form for debugging.
        return "Address(name=%r, type=%r, url=%r, location=%r, phone=%r)" % (
            self.name, self.type, self.url, self.location, self.phone)
# Get the address categories
def get_address_type_url(url):
    """Return the address categories linked from the page at *url*.

    Downloads the page, parses it with BeautifulSoup ("lxml" parser) and
    builds one AddressType per anchor matched by the CSS selector
    ".sortBox a": ``type`` is the anchor text, ``url`` is its ``href``
    attribute (None when the anchor has no href).

    Args:
        url: Address of a page that contains the ".sortBox" category list.

    Returns:
        A list of AddressType objects. Fetching/parsing is best-effort: on
        any Exception the error is printed and whatever was collected so
        far (possibly an empty list) is returned.
    """
    address_type_list = []
    try:
        # A timeout keeps the crawl from hanging forever on a stalled
        # connection; requests applies no timeout by default.
        data = requests.get(url, timeout=10).text
        xml_data = BeautifulSoup(data, "lxml")
        for item in xml_data.select(".sortBox a"):
            address_type = AddressType()
            address_type.type = item.get_text()
            address_type.url = item.get('href')
            address_type_list.append(address_type)
    except Exception as ex:
        # Deliberate best-effort: report the problem and fall through.
        print(ex)
    # Returned after the try block rather than from a ``finally`` clause: a
    # ``return`` inside ``finally`` would also discard KeyboardInterrupt and
    # SystemExit, making the crawl impossible to interrupt cleanly.
    return address_type_list
# Get every place name under one category
def get_address_name(url):
    """Print and return the place names listed on the category page *url*.

    Downloads the page, parses it with BeautifulSoup ("lxml" parser) and
    prints the text of every anchor matched by the CSS selector ".sortC a",
    one name per line.

    Args:
        url: Address of a category listing page.

    Returns:
        A list with the printed names, in page order. Fetching/parsing is
        best-effort: on any Exception the error is printed and the names
        collected so far (possibly none) are returned.
    """
    names = []
    try:
        # A timeout keeps the crawl from hanging forever on a stalled
        # connection; requests applies no timeout by default.
        data = requests.get(url, timeout=10).text
        xml_data = BeautifulSoup(data, "lxml")
        for item in xml_data.select(".sortC a"):
            name = item.get_text()
            print(name)
            names.append(name)
    except Exception as ex:
        # Deliberate best-effort: report the problem and carry on.
        print(ex)
    return names
# Driver: read the category list from one Lijiang page, then print each
# category (name and URL) followed by every place name listed under it.
address_type = get_address_type_url('http://poi.mapbar.com/lijiang/980/')
for item in address_type:
    print(item.type, item.url)
    get_address_name(item.url)
Github地址:
https://github.com/haibincoder/AddressCrawer