# 直接上腳本
# -*- coding: utf-8 -*-
"""Export de-duplicated "<other6> <source>" records from Elasticsearch.

The script runs a scroll search against the ``guoyan_index_v1`` index for
documents whose ``site`` field matches the configured term, joins each hit's
``other6`` and ``source`` fields with a single space, removes duplicates and
writes every remaining record through a rotating file handler.
"""
from elasticsearch import Elasticsearch

# Logging environment configuration.
import platform
import logging.handlers
import time

sys_platform = platform.system()
if sys_platform == "Windows":
    LOG_FILE_check = './app_cic.txt'
else:
    LOG_FILE_check = '/cic1.log'

# Rotating output file: up to 1200 MiB per file, at most ten backups.
# NOTE(review): the original comment said "200M" while the value is
# 1200 * 1024 * 1024 -- confirm which size was intended.
# The encoding is pinned so that non-ASCII records do not depend on the
# platform's default locale encoding.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE_check,
    maxBytes=1200 * 1024 * 1024,
    backupCount=10,
    encoding='utf-8',
)
fmt = '\n' + '%(message)s'  # each record is written as a newline plus the bare message
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)
logger = logging.getLogger('check')  # dedicated logger used as the export sink
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)


def _total_hits(hits_section):
    """Return the reported hit count as an int.

    ``hits.total`` is a plain integer in older Elasticsearch responses and an
    object carrying a ``value`` key in newer ones; both shapes are accepted.
    """
    total = hits_section['total']
    if isinstance(total, dict):
        return total['value']
    return total


def _record_from_hit(hit):
    """Return ``"<other6> <source>"`` for one hit, or None if it is unusable.

    A hit is unusable when ``_source`` is missing or when either field is
    absent or not a string; such hits are skipped rather than aborting the
    export.
    """
    doc = hit.get('_source') or {}
    source = doc.get("source")
    other6 = doc.get("other6")
    if not isinstance(source, str) or not isinstance(other6, str):
        return None
    return other6 + " " + source


es = Elasticsearch(
    ["20.0.0.11:9200"],
    sniff_on_start=True,
    sniff_on_connection_fail=True,
    sniff_timeout=60,
)

query_json = {
    "query": {
        "terms": {
            "site": [
                "百度搜索"
            ]
        }
    }
}

page_num = 1000  # number of hits requested per page
query = es.search(index='guoyan_index_v1', body=query_json, scroll='5m', size=page_num)
results = query['hits']['hits']  # first page of hits
total = _total_hits(query['hits'])  # hit count reported by the search
scroll_id = query['_scroll_id']  # cursor used to fetch the following pages
every_num = total // page_num  # informational only; the loop below runs until a page is empty

print("total", total)
print("scroll_id", scroll_id)
print("every_num", every_num)
print("----------", every_num + 1)

# A dict is used as an insertion-ordered set so duplicates are dropped while
# records keep their first-seen order.
unique_records = {}
skipped = 0
page_index = 0
hits = results  # start with the page returned by the search itself
try:
    while hits:
        print("正在讀取的位置是:", page_index)
        for hit in hits:
            record = _record_from_hit(hit)
            if record is None:
                skipped += 1
            else:
                unique_records[record] = None
        response = es.scroll(scroll_id=scroll_id, scroll='5m')
        # Always continue with the scroll id from the latest response.
        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']
        page_index += 1
finally:
    # Best-effort release of the server-side scroll context; a failure here
    # must not hide an error raised by the export itself.
    try:
        es.clear_scroll(scroll_id=scroll_id)
    except Exception as exc:
        print("clear_scroll failed:", exc)

if skipped:
    print("skipped hits without usable source/other6:", skipped)

end_data_list = list(unique_records)
print("去重以后的數據是條數是:", len(end_data_list))
for end_data in end_data_list:
    logger.info(end_data)