es 6.2.4版本
logstash跑了一阵子之后不再同步数据了,日志信息如下:
[2019-06-19T10:30:28,379][INFO ][logstash.outputs.elasticsearch] retrying failed action with response code: 403 ({"type"=>"cluster_block_exception", "reason"=>"blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];"})
[2019-06-19T10:30:28,379][INFO ][logstash.outputs.elasticsearch] Retrying individual bulk actions that failed or were rejected by the previous bulk request. {:count=>125}
检查elasticsearch日志如下:(日志没贴全,应该有超过flood_stage阈值的告警,因为logstash日志里已经提示索引只读了...可是日志被我删掉了...)
[2019-06-19T10:30:26,746][WARN ][o.e.c.r.a.DiskThresholdMonitor] [node-1] high disk watermark [90%] exceeded on [sPv40vq_RKanFIUuBgJuUQ][node-2][/usr/elasticsearch/data/nodes/0] free: 657.4mb[8%], shards will be relocated away from this node
[2019-06-19T10:30:26,746][INFO ][o.e.c.r.a.DiskThresholdMonitor] [node-1] low disk watermark [85%] exceeded on [DJl3qtK5Twmpi-MGNuujog][node-3][/usr/elasticsearch/data/nodes/0] free: 1gb[13.4%], replicas will not be assigned to this node
[2019-06-19T10:30:56,752][WARN ][o.e.c.r.a.DiskThresholdMonitor] [node-1] high disk watermark [90%] exceeded on [sPv40vq_RKanFIUuBgJuUQ][node-2][/usr/elasticsearch/data/nodes/0] free: 657.4mb[8%], shards will be relocated away from this node
[2019-06-19T10:30:56,752][INFO ][o.e.c.r.a.DiskThresholdMonitor] [node-1] low disk watermark [85%] exceeded on [DJl3qtK5Twmpi-MGNuujog][node-3][/usr/elasticsearch/data/nodes/0] free: 1gb[13.4%], replicas will not be assigned to this node
[2019-06-19T10:30:56,752][INFO ][o.e.c.r.a.DiskThresholdMonitor] [node-1] rerouting shards: [high disk watermark exceeded on one or more nodes]
查看代码 org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor#onNewInfo,结合官网文档 Disk-based Shard Allocation,可以知道 es 会对磁盘空间进行监控,当磁盘空间使用量达到一定的阈值时就会做不同的处理。
这里其实是节点的磁盘使用量超过了 flood_stage 阈值,导致 Elasticsearch 给在该节点上有分片的每个索引都强制加上了只读索引块(index.blocks.read_only_allow_delete),所以 logstash 在做数据同步的时候就报错了。
部分源码如下:
/**
 * Evaluates the least-available disk usage reported for each node in {@code info} and reacts to
 * the watermark it falls under.
 *
 * <p>For every node the checks are made in order of severity (flood stage, high, low), each on
 * both the absolute free bytes and the free percentage:
 * <ul>
 *   <li>flood stage: the index of every shard routed to the node is collected so it can be
 *       marked read-only;</li>
 *   <li>high: a reroute is requested if the reroute interval has elapsed since the last one,
 *       and the node is remembered in {@code nodeHasPassedWatermark};</li>
 *   <li>low: the node is only remembered in {@code nodeHasPassedWatermark};</li>
 *   <li>none: if the node was remembered earlier, a reroute is requested (again subject to
 *       the reroute interval) and the node is forgotten.</li>
 * </ul>
 * Does nothing when {@code info.getNodeLeastAvailableDiskUsages()} returns {@code null}.
 *
 * @param info the cluster info whose per-node least-available disk usages are inspected
 */
public void onNewInfo(ClusterInfo info) {
    ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
    if (usages != null) {
        boolean reroute = false;
        String explanation = "";
        // Garbage collect nodes that have been removed from the cluster
        // from the map that tracks watermark crossing
        // NOTE(review): this removes from nodeHasPassedWatermark while iterating over it; that is
        // only safe if the set is a concurrent implementation -- its declaration is not shown
        // here, confirm.
        ObjectLookupContainer<String> nodes = usages.keys();
        for (String node : nodeHasPassedWatermark) {
            if (nodes.contains(node) == false) {
                nodeHasPassedWatermark.remove(node);
            }
        }
        ClusterState state = clusterStateSupplier.get();
        Set<String> indicesToMarkReadOnly = new HashSet<>();
        for (ObjectObjectCursor<String, DiskUsage> entry : usages) {
            String node = entry.key;
            DiskUsage usage = entry.value;
            // Check the disk usage and log a warning or info message for the watermark it has crossed
            warnAboutDiskIfNeeded(usage);
            // Flood stage watermark crossed: collect every index that has a shard on this node so
            // that it can be marked read-only after the loop
            if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() ||
                usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
                RoutingNode routingNode = state.getRoutingNodes().node(node);
                if (routingNode != null) { // this might happen if we haven't got the full cluster-state yet?!
                    for (ShardRouting routing : routingNode) {
                        indicesToMarkReadOnly.add(routing.index().getName());
                    }
                }
            }
            // High watermark crossed: request a reroute, but only if more than the reroute
            // interval has passed since the last one
            else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() ||
                usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
                if ((System.nanoTime() - lastRunNS) > diskThresholdSettings.getRerouteInterval().nanos()) {
                    lastRunNS = System.nanoTime();
                    reroute = true;
                    explanation = "high disk watermark exceeded on one or more nodes";
                } else {
                    logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred " +
                        "in the last [{}], skipping reroute",
                        node, diskThresholdSettings.getRerouteInterval());
                }
                // Remembered regardless of whether the reroute was requested or skipped
                nodeHasPassedWatermark.add(node);
            }
            // Low watermark crossed: only remember the node, no reroute is requested
            else if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() ||
                usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
                nodeHasPassedWatermark.add(node);
            }
            // No watermark crossed: if the node had crossed the high or low watermark before,
            // reroute so that any unassigned shards can be allocated
            else {
                if (nodeHasPassedWatermark.contains(node)) {
                    // The node has previously been over the high or
                    // low watermark, but is no longer, so we should
                    // reroute so any unassigned shards can be allocated
                    // if they are able to be
                    if ((System.nanoTime() - lastRunNS) > diskThresholdSettings.getRerouteInterval().nanos()) {
                        lastRunNS = System.nanoTime();
                        reroute = true;
                        explanation = "one or more nodes has gone under the high or low watermark";
                        // The node is forgotten only when the reroute is actually requested; if
                        // it is skipped below, the node stays tracked for a later call
                        nodeHasPassedWatermark.remove(node);
                    } else {
                        logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred " +
                            "in the last [{}], skipping reroute",
                            node, diskThresholdSettings.getRerouteInterval());
                    }
                }
            }
        }
        // At most one reroute per call; explanation holds the reason set by the last node that
        // requested it
        if (reroute) {
            logger.info("rerouting shards: [{}]", explanation);
            reroute();
        }
        // Skip indices that are already blocked at the WRITE level in the current cluster state
        indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
        if (indicesToMarkReadOnly.isEmpty() == false) {
            markIndicesReadOnly(indicesToMarkReadOnly);
        }
    }
}
/**
 * Logs a message for the most severe disk watermark the given usage has crossed.
 *
 * <p>The absolute (free bytes) thresholds and the percentage thresholds are checked
 * independently of each other, so one call can log up to two messages: one per kind of
 * threshold. Flood stage and high watermark are logged at WARN, low watermark at INFO.
 *
 * @param usage the disk usage to compare against the configured watermarks
 */
private void warnAboutDiskIfNeeded(DiskUsage usage) {
    logCrossedBytesWatermark(usage);
    logCrossedPercentageWatermark(usage);
}

/**
 * Compares the free bytes of {@code usage} with the absolute thresholds, most severe first,
 * and logs only the first one that is crossed.
 */
private void logCrossedBytesWatermark(DiskUsage usage) {
    // free bytes < cluster.routing.allocation.disk.watermark.flood_stage
    if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes()) {
        logger.warn("flood stage disk watermark [{}] exceeded on {}, all indices on this node will marked read-only",
            diskThresholdSettings.getFreeBytesThresholdFloodStage(), usage);
        return;
    }
    // free bytes < cluster.routing.allocation.disk.watermark.high
    if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
        logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
            diskThresholdSettings.getFreeBytesThresholdHigh(), usage);
        return;
    }
    // free bytes < cluster.routing.allocation.disk.watermark.low
    if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes()) {
        logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
            diskThresholdSettings.getFreeBytesThresholdLow(), usage);
    }
}

/**
 * Compares the free disk percentage of {@code usage} with the percentage thresholds, most
 * severe first, and logs only the first one that is crossed. The logged watermark is
 * {@code 100.0 - threshold}, formatted with one decimal and a trailing "%".
 */
private void logCrossedPercentageWatermark(DiskUsage usage) {
    // free percentage < 100 - cluster.routing.allocation.disk.watermark.flood_stage
    if (usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
        logger.warn("flood stage disk watermark [{}] exceeded on {}, all indices on this node will marked read-only",
            Strings.format1Decimals(100.0 - diskThresholdSettings.getFreeDiskThresholdFloodStage(), "%"), usage);
        return;
    }
    // free percentage < 100 - cluster.routing.allocation.disk.watermark.high
    if (usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
        logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
            Strings.format1Decimals(100.0 - diskThresholdSettings.getFreeDiskThresholdHigh(), "%"), usage);
        return;
    }
    // free percentage < 100 - cluster.routing.allocation.disk.watermark.low
    if (usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
        logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
            Strings.format1Decimals(100.0 - diskThresholdSettings.getFreeDiskThresholdLow(), "%"), usage);
    }
}
这里涉及几个配置:
-
cluster.routing.allocation.disk.threshold_enabled
是否开启基于磁盘的分片分配,默认true
-
cluster.routing.allocation.disk.watermark.low
控制磁盘空间使用的低水位线,默认85%,es不会再将分片分配给磁盘使用超过这个配置的节点。
这个设置不会影响新创建的索引的主分片,或者是之前从未分配过的任何分片
-
cluster.routing.allocation.disk.watermark.high
控制磁盘空间使用的高水位线,默认90%,es会将磁盘使用超过这个配置的节点中的分片重新进行分配。
这个设置将影响所有分片的分配,不管分片之前是否已经被分配过
-
cluster.routing.allocation.disk.watermark.flood_stage
控制磁盘空间使用的洪水水位线,默认95%,es会将磁盘使用超过这个配置的节点上有分片的所有索引都标记为只读。
这是防止节点耗尽磁盘空间的最后手段。一旦有足够的磁盘空间允许继续索引操作,需要手动释放索引块
-
cluster.routing.allocation.disk.include_relocations
当计算一个节点的磁盘使用量时,是否把正在迁移到该节点的分片大小也计算在内,默认true。
这可能导致错误地高估节点的磁盘使用率。例如某个分片的迁移可能已经完成了90%,此时检索到的磁盘使用率既包含了这个正在迁移的分片的总大小,又包含了已经迁移完成的那90%所占用的空间。
-
cluster.routing.allocation.disk.reroute_interval
分片重分配间隔,默认60秒
-
cluster.info.update.interval
磁盘使用率检查间隔,默认30秒
关于配置的几点说明:
- 上面的 low、high、flood_stage 三个水位线配置要么都设置为百分比,要么都设置为具体的字节值,不能混用。
- 这些配置可以写在配置文件 elasticsearch.yml 中,也可以通过 cluster-update-settings API 在运行中的集群上动态更新。直接参考官网文档即可。
测试
-- 添加文档,自动创建索引
curl http://172.16.22.51:9200/idx_luoluocaihong/_doc/1 -X PUT -H 'Content-Type:application/json' -d '{"user":"luoluocaihong","age":"20"}'
{"_index":"idx_luoluocaihong","_type":"_doc","_id":"1","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
-- 设置只读索引块
curl http://172.16.22.51:9200/idx_luoluocaihong/_settings -X PUT -H 'Content-Type:application/json' -d '{"index.blocks.read_only_allow_delete": true}'
{"acknowledged":true}
-- 查看索引的设置
curl http://172.16.22.51:9200/idx_luoluocaihong/_settings
{"idx_luoluocaihong":{"settings":{"index":{"number_of_shards":"5","blocks":{"read_only_allow_delete":"true"},"provided_name":"idx_luoluocaihong","creation_date":"1561107195032","number_of_replicas":"1","uuid":"3iS68s1nQMudxhyL-zNnRg","version":{"created":"6020499"}}}}}
-- 添加文档
curl http://172.16.22.51:9200/idx_luoluocaihong/_doc/2 -X PUT -H 'Content-Type:application/json' -d '{"user":"user_2","age":"20"}'
{"error":{"root_cause":[{"type":"cluster_block_exception","reason":"blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];"}],"type":"cluster_block_exception","reason":"blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];"},"status":403}
-- 重置只读索引块
curl http://172.16.22.51:9200/idx_luoluocaihong/_settings -X PUT -H 'Content-Type:application/json' -d '{"index.blocks.read_only_allow_delete": null}'
{"acknowledged":true}
-- 查看索引的设置
curl http://172.16.22.51:9200/idx_luoluocaihong/_settings
{"idx_luoluocaihong":{"settings":{"index":{"creation_date":"1561107195032","number_of_shards":"5","number_of_replicas":"1","uuid":"3iS68s1nQMudxhyL-zNnRg","version":{"created":"6020499"},"provided_name":"idx_luoluocaihong"}}}}
-- 添加文档
curl http://172.16.22.51:9200/idx_luoluocaihong/_doc/2 -X PUT -H 'Content-Type:application/json' -d '{"user":"user_2","age":"20"}'
{"_index":"idx_luoluocaihong","_type":"_doc","_id":"2","_version":1,"result":"created","_shards":{"total":2,"successful":2,"failed":0},"_seq_no":0,"_primary_term":1}
-- 搜索
curl http://172.16.22.51:9200/idx_luoluocaihong/_search?q=age:20
{"took":19,"timed_out":false,"_shards":{"total":5,"successful":5,"skipped":0,"failed":0},"hits":{"total":2,"max_score":0.2876821,"hits":[{"_index":"idx_luoluocaihong","_type":"_doc","_id":"2","_score":0.2876821,"_source":{"user":"user_2","age":"20"}},{"_index":"idx_luoluocaihong","_type":"_doc","_id":"1","_score":0.2876821,"_source":{"user":"luoluocaihong","age":"20"}}]}}
-- 查看集群设置
curl 172.16.22.51:9200/_cluster/settings
-- 修改集群设置
curl 172.16.22.51:9200/_cluster/settings -X PUT -H 'Content-Type:application/json' -d '{"transient":{"cluster.routing.allocation.disk.watermark.low":"80%","cluster.routing.allocation.disk.watermark.high":"85%","cluster.routing.allocation.disk.watermark.flood_stage":"90%"}}'
{"acknowledged":true,"persistent":{},"transient":{"cluster":{"routing":{"allocation":{"disk":{"watermark":{"low":"80%","flood_stage":"90%","high":"85%"}}}}}}}
-- 检查索引状况
curl http://172.16.22.51:9200/_cat/indices
green open idx_luoluocaihong 3iS68s1nQMudxhyL-zNnRg 5 1 2 0 17.4kb 8.7kb
-- 检查es集群健康状况
curl 172.16.22.51:9200/_cluster/health?pretty
{
"cluster_name" : "iot-es",
"status" : "green",
"timed_out" : false,
"number_of_nodes" : 3,
"number_of_data_nodes" : 3,
"active_primary_shards" : 15,
"active_shards" : 30,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 0,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 100.0
}
-- 删除索引
curl http://172.16.22.51:9200/idx_luoluocaihong -X DELETE