python爬蟲
1 利用pip引入相關庫
from bs4 import BeautifulSoup
import requests
2 構建請求
以某網站為例,此為post請求,根據目標網站而定
# Build the POST request. Header/payload values depend on the target site
# (placeholders shown here).
headers = {
    'Host': 'xxxxx',
    'Content-Type': 'xxxx',
    # NOTE: the original dict listed 'Origin' twice; duplicate keys in a dict
    # literal silently collapse to the last one, so keep a single entry.
    'Origin': 'xxxx',
    # Fixed: the original 'Referer' value was an unterminated string literal.
    'Referer': 'xxxx',
    'User-Agent': 'xxxxxxx',
    'X-Requested-With': 'xxxxx',
}
# Form payload expected by the target endpoint.
content = {
    'm': 'xflist',
    'city': 'wf',
    'district': '',
}
url = 'xxxxx'
# Send the form-encoded POST; `res.text` holds the returned HTML.
res = requests.post(url, data=content, headers=headers)
3 解析網頁結果
# Parse the response HTML and collect (createTime, content) records.
# Fixed: the original parsed an undefined name `html`; the response body
# lives in `res.text` from the requests.post call above.
soup = BeautifulSoup(res.text, "html.parser")
# Fixed: `jsonArray` was appended to without ever being defined.
jsonArray = []
# Renamed from `list` to avoid shadowing the builtin.
ul_list = soup.select('body > div.main > section.mBox.mb8.dtbk > div > ul ')
for ul in ul_list:
    for li in ul.find_all("li"):
        create_time = li.select("div.time")[0].getText()
        print(create_time)
        # Renamed from `content` to avoid clobbering the POST payload dict.
        text = li.select("h4")[0].getText()
        print(text)
        jsonArray.append({"createTime": create_time, "content": text})
經python爬蟲爬出數據,包含時間格式截取部分如下
json={
"floor_area": "57535 ㎡",
"building_area": "250000 ㎡",
"volume_rate": "3.78",
"greening_rate": "30%",
"parking_rate": "項目規划車位數量為1889個",
"record": [
{
"createTime": "2018-11-20 11:03:33",
"content": "新瑞都"
},
{
"createTime": "2020-12-31",
"content": "3號樓"
},
{
"createTime": "2018-11-03",
"content": "3號樓"
}
]
}
4 引入elasticsearch庫,確保elasticsearch為啟動狀態
# Index one scraped document into a locally running Elasticsearch instance.
# Fixed: the original `from e import Elasticsearch` is a broken import.
from elasticsearch import Elasticsearch
from fang import make_request1

es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
# Renamed from `json` to avoid shadowing the stdlib module name.
doc = make_request1()
# NOTE(review): doc_type is deprecated in Elasticsearch 7+; kept here to
# match the cluster version this tutorial targets — confirm against your ES.
es.index(index="house_test", doc_type="fang", body=doc)
運行后報錯信息如下

顯然,雖然爬取到的 createTime 是字符串,但上傳到 elasticsearch 時,被其動態映射的日期自動檢測(date detection)識別為日期格式,導致格式不一致的值寫入報錯
於是elasticsearch創建索引時即規定type.
5 elasticsearch創建索引,具體實現代碼如下
from elasticsearch import Elasticsearch

es = Elasticsearch('192.168.1.1:9200')

# Explicit mapping so the scraped strings are stored as non-indexed keywords
# and values such as "2020-12-31" are NOT auto-detected as dates.
# Fixed: in the original, the field definitions were siblings of an EMPTY
# "properties": {} object, so Elasticsearch ignored them (and dynamic date
# detection still kicked in). Field definitions must live INSIDE "properties".
mappings = {
    "mappings": {
        "fang": {
            "properties": {
                "open_time": {"type": "keyword", "index": False},
                "volume_rate": {"type": "keyword", "index": False},
                "greening_rate": {"type": "keyword", "index": False},
                "parking_rate": {"type": "keyword", "index": False},
                "house_type": {"type": "keyword", "index": False},
                "property_company": {"type": "keyword", "index": False},
                # Nested object: records are accessed as projectComment.content
                "projectComment": {
                    "type": "object",
                    "properties": {
                        "createTime": {"type": "keyword", "index": False},
                        "content": {"type": "keyword", "index": False},
                    },
                },
            },
        },
    },
}

res = es.indices.create(index='index_test', body=mappings)
將數據連續插入
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('192.168.1.1:9200')

# Fixed: both documents below were syntactically invalid in the original —
# the "record" list was never closed with `]`, and json2 carried an extra `}`.
json1 = {
    "floor_area": "57535 ㎡",
    "building_area": "250000 ㎡",
    "volume_rate": "3.78",
    "greening_rate": "30%",
    "parking_rate": "項目規划車位數量為1889個",
    "record": [
        {"createTime": "2018-11-20 11:03:33", "content": "新瑞都"},
        {"createTime": "2020-12-31", "content": "3號樓"},
        {"createTime": "2018-11-03", "content": "3號樓"},
    ],
}
json2 = {
    "floor_area": "354345 ㎡",
    "building_area": "234500 ㎡",
    "volume_rate": "453",
    "greening_rate": "43%",
    "parking_rate": "項目規划車位數量為1889個",
    "record": [
        {"createTime": "2018-11-20 11:03:33", "content": "新瑞都"},
        {"createTime": "2020-12-31", "content": "3號樓"},
        {"createTime": "2018-11-03", "content": "3號樓"},
    ],
}

ACTIONS = [json1, json2]
# Fixed: index name was misspelled "indes_test"; it must match the index
# created earlier ("index_test") or the custom mapping is never applied.
res, _ = bulk(es, ACTIONS, index="index_test", raise_on_error=True)
6 查詢索引
通過postman發送請求

查詢到結果

