python 爬蟲 上傳elasticSearch (包括日期)


python爬蟲

1 利用pip引入相關庫

from bs4 import BeautifulSoup
import requests

2 構建請求

以某網站為例,此為post請求,根據目標網站而定


      # HTTP headers for the POST request. The 'xxxx' values are placeholders:
      # replace them with the values the target site expects.
      # Fixes: 'Origin' was listed twice (only the last value took effect) and
      # the 'Referer' value was an unterminated string literal.
      headers = {
          'Host': 'xxxxx',
          'Origin': 'xxxx',
          'Content-Type': 'xxxx',
          'Referer': 'xxxxx',
          'User-Agent': 'xxxxxxx',
          'X-Requested-With': 'xxxxx',
      }

      # Form fields sent as the POST body.
      content = {
          'm': 'xflist',
          'city': 'wf',
          'district': '',
      }

      url = 'xxxxx'

      # Send the form-encoded POST; the response is parsed in the next step.
      res = requests.post(url, data=content, headers=headers)

3 解析網頁結果


      # Parse the HTML returned by the POST request (`res`) and collect one
      # {"createTime", "content"} record per <li> entry.
      soup = BeautifulSoup(res.text, "html.parser")
      ul_nodes = soup.select('body > div.main > section.mBox.mb8.dtbk > div > ul ')
      jsonArray = []
      for ul_node in ul_nodes:
          for li_node in ul_node.find_all("li"):
              # The first match is used; an entry without a div.time or h4
              # element raises IndexError.
              create_time = li_node.select("div.time")[0].getText()
              print(create_time)
              content = li_node.select("h4")[0].getText()
              print(content)
              jsonArray.append({"createTime": create_time, "content": content})

經python爬蟲爬出的數據(其中包含時間格式的字段),截取部分如下


# Excerpt of one scraped document. Note that the "createTime" strings come in
# two formats: with and without a time-of-day part.
json = {
    "floor_area": "57535 ㎡",
    "building_area": "250000 ㎡",
    "volume_rate": "3.78",
    "greening_rate": "30%",
    "parking_rate": "項目規划車位數量為1889個",
    "record": [
        {
            "createTime": "2018-11-20 11:03:33",
            "content": "新瑞都",
        },
        {
            "createTime": "2020-12-31",
            "content": "3號樓",
        },
        {
            "createTime": "2018-11-03",
            "content": "3號樓",
        },
    ],  # this closing bracket was missing in the original snippet
}

4 引入elasticsearch庫,確保elasticsearch為啟動狀態

# The client class lives in the `elasticsearch` package (the original
# `from e import ...` named a module that is not the client library).
from elasticsearch import Elasticsearch
from fang import make_request1


# Connect to the Elasticsearch node on localhost:9200.
es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
# presumably make_request1() returns the scraped document as a dict — verify
# against the `fang` module.
json = make_request1()

# Index the document into "house_test" under the type "fang".
es.index(index="house_test", doc_type="fang", body=json)

運行后報錯信息如下

顯然,雖然爬取到的時間在json中是字符串,但是上傳到elasticsearch后,被自動識別為日期格式

於是在elasticsearch創建索引時,即顯式規定各字段的type(此處均設為keyword)。

5 elasticsearch創建索引,具體實現代碼如下


from elasticsearch import Elasticsearch
# Client for the Elasticsearch node addressed as 192.168.1.1:9200.
es = Elasticsearch('192.168.1.1:9200')

# Explicit mapping for the type "fang": each listed field is declared as a
# non-indexed keyword, so string values (including time-like strings) are
# stored as-is instead of being auto-detected.
# Fixes: the "properties" object was closed before any field was declared
# (unbalanced braces), and "index" mixed the string "false" with False.
mappings = {
    "mappings": {
        "fang": {
            "properties": {
                "open_time": {"type": "keyword", "index": False},
                "volume_rate": {"type": "keyword", "index": False},
                "greening_rate": {"type": "keyword", "index": False},
                "parking_rate": {"type": "keyword", "index": False},
                "house_type": {"type": "keyword", "index": False},
                "property_company": {"type": "keyword", "index": False},
                # An object field can hold nested JSON; its sub-fields are
                # addressed with dot notation, e.g. projectComment.content.
                "projectComment": {
                    "type": "object",
                    "properties": {
                        "createTime": {"type": "keyword", "index": False},
                        "content": {"type": "keyword", "index": False},
                    },
                },
                # The sample documents in this article keep these entries
                # under "record", so that field gets the same mapping.
                # NOTE(review): confirm which field name the real data uses.
                "record": {
                    "type": "object",
                    "properties": {
                        "createTime": {"type": "keyword", "index": False},
                        "content": {"type": "keyword", "index": False},
                    },
                },
            },
        },
    },
}



# Create the index "index_test" with the explicit mapping defined above.
res = es.indices.create(
    body=mappings,
    index='index_test',
)

將數據批量插入(使用bulk)

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Client for the Elasticsearch node addressed as 192.168.1.1:9200.
es = Elasticsearch('192.168.1.1:9200')

# Documents queued for the bulk request issued below.
ACTIONS = []

# First sample document for the bulk insert.
json1 = {
    "floor_area": "57535 ㎡",
    "building_area": "250000 ㎡",
    "volume_rate": "3.78",
    "greening_rate": "30%",
    "parking_rate": "項目規划車位數量為1889個",
    "record": [
        {
            "createTime": "2018-11-20 11:03:33",
            "content": "新瑞都",
        },
        {
            "createTime": "2020-12-31",
            "content": "3號樓",
        },
        {
            "createTime": "2018-11-03",
            "content": "3號樓",
        },
    ],  # this closing bracket was missing in the original snippet
}
# Second sample document for the bulk insert.
# Fixes: the "record" list was never closed and a stray "}" followed it.
json2 = {
    "floor_area": "354345 ㎡",
    "building_area": "234500 ㎡",
    "volume_rate": "453",
    "greening_rate": "43%",
    "parking_rate": "項目規划車位數量為1889個",
    "record": [
        {
            "createTime": "2018-11-20 11:03:33",
            "content": "新瑞都",
        },
        {
            "createTime": "2020-12-31",
            "content": "3號樓",
        },
        {
            "createTime": "2018-11-03",
            "content": "3號樓",
        },
    ],
}

# Queue both documents and send them in a single bulk request.
ACTIONS.append(json1)
ACTIONS.append(json2)

# Target the index created earlier ("index_test"; the original "indes_test"
# was a typo that would have written to a different, auto-created index).
# doc_type matches the "fang" type declared in the mapping.
# NOTE(review): doc_type only applies to typed (pre-7.x) clusters — confirm
# the Elasticsearch version in use.
res, _ = bulk(es, ACTIONS, index="index_test", doc_type="fang", raise_on_error=True)


6 查詢索引

通過postman發送請求

查詢到結果


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM