一般来说,ES 的 search 接口最多返回 10000 条数据(受 index.max_result_window 限制)。当数据量很大、又不方便修改该参数时,可以使用 scroll 的方式分批获取全部数据。
import json

import requests
from easydict import EasyDict
from elasticsearch import Elasticsearch, helpers
# Elasticsearch node(s) to connect to: a single node on 127.0.0.1:1234.
ES_SERVER = [
    {'host': '127.0.0.1', 'port': 1234},
]
# Client instance created once at import time; get_data() passes it to
# helpers.scan().
ES_CLIENT = Elasticsearch(hosts=ES_SERVER)
def get_data(data_id):
    """Approach 1: collect matching documents through the Python ES client.

    Runs a ``match`` query on the ``id`` field against the ``index`` index
    via ``helpers.scan`` (scroll keep-alive ``5m``) and drains the iterator
    it returns.

    Args:
        data_id: Value matched against the ``id`` field.

    Returns:
        list: The ``_source`` of every hit yielded by the scan.
    """
    match_query = {"query": {"match": {"id": data_id}}}
    hits = helpers.scan(
        client=ES_CLIENT,
        query=match_query,
        scroll=u'5m',
        index='index',
        # NOTE(review): timeout and doc_type are presumably forwarded by
        # scan() to the underlying search request -- confirm against the
        # installed elasticsearch client version.
        timeout='1m',
        doc_type='index',
    )
    sources = []
    for hit in hits:
        sources.append(hit['_source'])
    return sources
def _post_to_es(post_data):
    """POST ``post_data`` to ``ES_SERVER_API`` and return the decoded reply.

    Args:
        post_data: Form fields sent as the request body.

    Returns:
        EasyDict: The JSON reply, wrapped for attribute-style access.

    Raises:
        EnvironmentError: If a ``ValueError`` is raised while sending the
            request or decoding the reply as JSON.
    """
    try:
        return EasyDict(requests.post(ES_SERVER_API, data=post_data).json())
    except ValueError:
        raise EnvironmentError('ES 未能成功返回数据!')


def get_data_by_api(data_id):
    """Approach 2: collect matching documents through the ES HTTP interface.

    Opens a scroll with a ``match`` query on the ``id`` field (page size
    10000, keep-alive ``1m``), then keeps requesting ``/_search/scroll``
    with the latest ``_scroll_id`` until a page comes back with no hits.

    NOTE(review): ``ES_SERVER_API`` must be defined at module level; it is
    not defined in the part of the file visible here. The
    ``password``/``method``/``url``/``json`` fields presumably address a
    proxy that forwards the request to Elasticsearch -- confirm.

    Args:
        data_id: Value matched against the ``id`` field.

    Returns:
        list: The ``_source`` of every hit from all scroll pages.

    Raises:
        EnvironmentError: If any reply (first page or a later scroll page)
            cannot be decoded as JSON.
    """
    search_body = {
        # str() keeps the id a JSON string, as the original template did.
        "query": {"match": {"id": str(data_id)}},
        "size": 10000,
    }
    post_data = {
        "password": "******",
        "method": "GET",
        "url": "index/_search?scroll=1m",
        # json.dumps escapes quotes/backslashes that "%s" templating would not.
        "json": json.dumps(search_body),
    }
    rsp = _post_to_es(post_data)
    # Every later request goes to the scroll endpoint instead of _search.
    post_data.update(url="/_search/scroll")

    result = []
    while True:
        page_hits = rsp.hits.hits
        if not page_hits:
            # An empty page means the scroll is exhausted.
            break
        result.extend(page_hits)
        # Use the scroll id from the most recent reply for the next page.
        scroll_body = {"scroll_id": rsp.get('_scroll_id'), "scroll": "1m"}
        post_data.update(json=json.dumps(scroll_body))
        rsp = _post_to_es(post_data)
    return [hit['_source'] for hit in result]