前言
在ceph的实际应用中,我们常常发现存在数据分布不均衡的情况,这个情况在存储池水位达到百分之八十以上后尤为明显。
在ceph出现数据分布不均衡的时候,我们可以选择人工干预,通过调整pg分布来达到数据(大致)均衡。
从Luminous v12.2.z开始,OSDMap中有一个新的pg-upmap异常表,允许集群将特定的PG显式映射到特定的OSD。本文记录一次该方案的实践。
实践
1.问题描述
单机ceph环境,正常存储数据,当数据存储达到86%时,发现集群状态error,告警有1个osd满了。具体信息如下:
- 环境ceph版本
[root@node1 ~]# ceph version
ceph version 12.2.8 (ae699615bac534ea496ee965ac6192cb7e0e07c0) luminous (stable)
[root@node1 ~]#
- 环境ceph状态
[root@node1 ~]# ceph -s
cluster:
id: 3825da34-808e-48e3-865d-15e14ab46d66
health: HEALTH_ERR
1 full osd(s)
7 pool(s) full
services:
mon: 1 daemons, quorum node1
mgr: node1(active)
mds: infinityfs1-1/1/1 up {0=node1=up:active}
osd: 14 osds: 14 up, 14 in
rgw: 1 daemon active
data:
pools: 9 pools, 680 pgs
objects: 2.23M objects, 12.3TiB
usage: 18.5TiB used, 3.64TiB / 22.1TiB avail
pgs: 680 active+clean
[root@node1 ~]#
- osd的容量信息如下,发现osd.5使用率达到99%,而同一存储池使用的其他osd中,使用率最低的才70%多(如osd.8为72.73%),所以采用人工干预,使其均衡。
[root@node1 ~]# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR PGS
3 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 128
10 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 129
0 hdd 1.81898 1.00000 1.82TiB 1.59TiB 236GiB 87.32 1.05 149
1 hdd 1.81898 1.00000 1.82TiB 1.54TiB 286GiB 84.65 1.01 130
2 hdd 1.81898 1.00000 1.82TiB 1.59TiB 237GiB 87.26 1.04 127
4 hdd 1.81898 1.00000 1.82TiB 1.56TiB 267GiB 85.65 1.03 135
5 hdd 1.81898 1.00000 1.82TiB 1.80TiB 18.6GiB 99.00 1.18 149
6 hdd 1.81898 1.00000 1.82TiB 1.59TiB 239GiB 87.19 1.04 151
7 hdd 1.81898 1.00000 1.82TiB 1.54TiB 287GiB 84.59 1.01 129
8 hdd 1.81898 1.00000 1.82TiB 1.32TiB 508GiB 72.73 0.87 121
9 hdd 1.81898 1.00000 1.82TiB 1.44TiB 387GiB 79.24 0.95 135
11 hdd 1.81898 1.00000 1.82TiB 1.37TiB 457GiB 75.46 0.90 131
12 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.01 1.03 152
13 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.94 1.03 146
0 hdd 1.81898 1.00000 1.82TiB 1.59TiB 236GiB 87.32 1.05 149
1 hdd 1.81898 1.00000 1.82TiB 1.54TiB 286GiB 84.65 1.01 130
2 hdd 1.81898 1.00000 1.82TiB 1.59TiB 237GiB 87.26 1.04 127
4 hdd 1.81898 1.00000 1.82TiB 1.56TiB 267GiB 85.65 1.03 135
5 hdd 1.81898 1.00000 1.82TiB 1.80TiB 18.6GiB 99.00 1.18 149
6 hdd 1.81898 1.00000 1.82TiB 1.59TiB 239GiB 87.19 1.04 151
7 hdd 1.81898 1.00000 1.82TiB 1.54TiB 287GiB 84.59 1.01 129
8 hdd 1.81898 1.00000 1.82TiB 1.32TiB 508GiB 72.73 0.87 121
9 hdd 1.81898 1.00000 1.82TiB 1.44TiB 387GiB 79.24 0.95 135
11 hdd 1.81898 1.00000 1.82TiB 1.37TiB 457GiB 75.46 0.90 131
12 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.01 1.03 152
13 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.94 1.03 146
3 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 128
10 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 129
TOTAL 22.1TiB 18.5TiB 3.64TiB 83.55
MIN/MAX VAR: 0.01/1.18 STDDEV: 31.87
[root@node1 ~]#
2.操作步骤
2.1设置集群仅支持 Luminous(或者L之后的)客户端
具体命令:
ceph osd set-require-min-compat-client luminous
或
ceph osd set-require-min-compat-client luminous --yes-i-really-mean-it
形如:
[root@node1 h]# ceph osd set-require-min-compat-client luminous
set require_min_compat_client to luminous
[root@node1 h]#
[root@node1 h]#
[root@node1 h]# ceph features
{
"mon": {
"group": {
"features": "0x3ffddff8eea4fffb",
"release": "luminous",
"num": 1
}
},
"mds": {
"group": {
"features": "0x3ffddff8eea4fffb",
"release": "luminous",
"num": 1
}
},
"osd": {
"group": {
"features": "0x3ffddff8eea4fffb",
"release": "luminous",
"num": 14
}
},
"client": {
"group": {
"features": "0x3ffddff8eea4fffb",
"release": "luminous",
"num": 11
}
}
}
[root@node1 h]#
2.2获取集群当前的osdmap信息
命令
ceph osd getmap -o {osdmap_filename}
形如
[root@node1 h]# ceph osd getmap -o osd.map
got osdmap epoch 130
[root@node1 h]#
[root@node1 h]# ls
osd.map
[root@node1 h]#
2.3获取当前集群数据均衡后的优化信息
命令
osdmaptool {osdmap_filename} --upmap out.txt [--upmap-pool <pool>] [--upmap-max <max-count>] [--upmap-deviation <max-deviation>]
其中
upmap-pool :指定需要优化均衡的存储池名
upmap-max: 指定一次优化的数据条目,默认100,可根据环境业务情况调整该值,一次调整的条目越多,数据迁移会越多,可能对环境业务造成影响。
upmap-deviation:最大偏差值,默认为0.01(即1%)。如果OSD利用率与平均值之间的差异小于此值,则将被视为完美。
假设只优化2条,即:
[root@node1 h]#
[root@node1 h]# osdmaptool osd.map --upmap out.txt --upmap-pool filepool --upmap-max=2
osdmaptool: osdmap file 'osd.map'
writing upmap command output to: out.txt
checking for upmap cleanups
upmap, max-count 2, max deviation 0.01
limiting to pools filepool (1)
//查看需要优化的信息
[root@node1 h]# cat out.txt
ceph osd pg-upmap-items 1.1 5 8
ceph osd pg-upmap-items 1.2 0 8 5 11
[root@node1 h]#
以默认100条输出如下:
[root@node1 h]#
[root@node1 h]# osdmaptool osd.map --upmap out.txt --upmap-pool filepool
osdmaptool: osdmap file 'osd.map'
writing upmap command output to: out.txt
checking for upmap cleanups
upmap, max-count 100, max deviation 0.01
limiting to pools filepool (1)
[root@node1 h]#
[root@node1 h]# cat out.txt
ceph osd pg-upmap-items 1.1 5 8
ceph osd pg-upmap-items 1.2 0 8 5 11
ceph osd pg-upmap-items 1.8 5 8
ceph osd pg-upmap-items 1.10 5 8 6 11
ceph osd pg-upmap-items 1.15 5 11
ceph osd pg-upmap-items 1.18 5 8 2 9
ceph osd pg-upmap-items 1.1f 5 8
ceph osd pg-upmap-items 1.20 5 11
ceph osd pg-upmap-items 1.21 5 9
ceph osd pg-upmap-items 1.22 5 8
[root@node1 h]#
[root@node1 h]#
从上述数据结果中
ceph osd pg-upmap-items 1.1 5 8
表明pg 1.1需从osd.5重新映射到osd.8
ceph osd pg-upmap-items 1.2 0 8 5 11
表明pg 1.2的osd集合中osd.0重映射到osd.8 ,osd.5重映射到osd.11
2.4均衡
运行数据分布调整操作,这一步将会使集群开始进行PG重新映射,同时集群数据开始迁移均衡
命令:
source out.txt
形如:
[root@node1 h]#
[root@node1 h]# source out.txt
set 1.1 pg_upmap_items mapping to [5->8]
set 1.2 pg_upmap_items mapping to [0->8,5->11]
set 1.8 pg_upmap_items mapping to [5->8]
set 1.10 pg_upmap_items mapping to [5->8,6->11]
set 1.15 pg_upmap_items mapping to [5->11]
set 1.18 pg_upmap_items mapping to [5->8,2->9]
set 1.1f pg_upmap_items mapping to [5->8]
set 1.20 pg_upmap_items mapping to [5->11]
set 1.21 pg_upmap_items mapping to [5->9]
set 1.22 pg_upmap_items mapping to [5->8]
[root@node1 h]#
[root@node1 h]# ceph -s
cluster:
id: 3825da34-808e-48e3-865d-15e14ab46d66
health: HEALTH_ERR
1 full osd(s)
7 pool(s) full
113103/6686299 objects misplaced (1.692%)
services:
mon: 1 daemons, quorum node1
mgr: node1(active)
mds: infinityfs1-1/1/1 up {0=node1=up:active}
osd: 14 osds: 14 up, 14 in; 10 remapped pgs
rgw: 1 daemon active
data:
pools: 9 pools, 680 pgs
objects: 2.23M objects, 12.3TiB
usage: 18.5TiB used, 3.64TiB / 22.1TiB avail
pgs: 113103/6686299 objects misplaced (1.692%)
670 active+clean
8 active+remapped+backfill_wait
2 active+remapped+backfilling
io:
client: 14.6GiB/s rd, 8.17GiB/s wr, 5.41kop/s rd, 13.11kop/s wr
recovery: 57.7MiB/s, 9objects/s
[root@node1 h]#
等到数据迁移完成后,再来观察数据的使用率,发现hdd类osd的使用率都落在80%~86%之间(平均约83%),其中osd.5从99.00%降到了85.91%,较调整前更均衡。
调整后:
[root@node1 ~]#
[root@node1 ~]# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR PGS
3 ssd 0.13599 1.00000 140GiB 1.01GiB 139GiB 0.72 0.01 128
10 ssd 0.13599 1.00000 140GiB 1.01GiB 139GiB 0.72 0.01 129
0 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.00 1.03 148
1 hdd 1.81898 1.00000 1.82TiB 1.54TiB 286GiB 84.65 1.01 130
2 hdd 1.81898 1.00000 1.82TiB 1.57TiB 259GiB 86.07 1.03 126
4 hdd 1.81898 1.00000 1.82TiB 1.56TiB 267GiB 85.65 1.02 135
5 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.91 1.03 139
6 hdd 1.81898 1.00000 1.82TiB 1.56TiB 263GiB 85.87 1.03 150
7 hdd 1.81898 1.00000 1.82TiB 1.54TiB 289GiB 84.49 1.01 129
8 hdd 1.81898 1.00000 1.82TiB 1.49TiB 333GiB 82.13 0.98 128
9 hdd 1.81898 1.00000 1.82TiB 1.49TiB 336GiB 81.97 0.98 137
11 hdd 1.81898 1.00000 1.82TiB 1.47TiB 361GiB 80.61 0.96 135
12 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.01 1.03 152
13 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.94 1.03 146
0 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.00 1.03 148
1 hdd 1.81898 1.00000 1.82TiB 1.54TiB 286GiB 84.65 1.01 130
2 hdd 1.81898 1.00000 1.82TiB 1.57TiB 259GiB 86.07 1.03 126
4 hdd 1.81898 1.00000 1.82TiB 1.56TiB 267GiB 85.65 1.02 135
5 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.91 1.03 139
6 hdd 1.81898 1.00000 1.82TiB 1.56TiB 263GiB 85.87 1.03 150
7 hdd 1.81898 1.00000 1.82TiB 1.54TiB 289GiB 84.49 1.01 129
8 hdd 1.81898 1.00000 1.82TiB 1.49TiB 333GiB 82.13 0.98 128
9 hdd 1.81898 1.00000 1.82TiB 1.49TiB 336GiB 81.97 0.98 137
11 hdd 1.81898 1.00000 1.82TiB 1.47TiB 361GiB 80.61 0.96 135
12 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.01 1.03 152
13 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.94 1.03 146
3 ssd 0.13599 1.00000 140GiB 1.01GiB 139GiB 0.72 0.01 128
10 ssd 0.13599 1.00000 140GiB 1.01GiB 139GiB 0.72 0.01 129
TOTAL 22.1TiB 18.5TiB 3.63TiB 83.57
MIN/MAX VAR: 0.01/1.03 STDDEV: 31.38
[root@node1 ~]#
调整前:
[root@node1 ~]# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR PGS
3 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 128
10 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 129
0 hdd 1.81898 1.00000 1.82TiB 1.59TiB 236GiB 87.32 1.05 149
1 hdd 1.81898 1.00000 1.82TiB 1.54TiB 286GiB 84.65 1.01 130
2 hdd 1.81898 1.00000 1.82TiB 1.59TiB 237GiB 87.26 1.04 127
4 hdd 1.81898 1.00000 1.82TiB 1.56TiB 267GiB 85.65 1.03 135
5 hdd 1.81898 1.00000 1.82TiB 1.80TiB 18.6GiB 99.00 1.18 149
6 hdd 1.81898 1.00000 1.82TiB 1.59TiB 239GiB 87.19 1.04 151
7 hdd 1.81898 1.00000 1.82TiB 1.54TiB 287GiB 84.59 1.01 129
8 hdd 1.81898 1.00000 1.82TiB 1.32TiB 508GiB 72.73 0.87 121
9 hdd 1.81898 1.00000 1.82TiB 1.44TiB 387GiB 79.24 0.95 135
11 hdd 1.81898 1.00000 1.82TiB 1.37TiB 457GiB 75.46 0.90 131
12 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.01 1.03 152
13 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.94 1.03 146
0 hdd 1.81898 1.00000 1.82TiB 1.59TiB 236GiB 87.32 1.05 149
1 hdd 1.81898 1.00000 1.82TiB 1.54TiB 286GiB 84.65 1.01 130
2 hdd 1.81898 1.00000 1.82TiB 1.59TiB 237GiB 87.26 1.04 127
4 hdd 1.81898 1.00000 1.82TiB 1.56TiB 267GiB 85.65 1.03 135
5 hdd 1.81898 1.00000 1.82TiB 1.80TiB 18.6GiB 99.00 1.18 149
6 hdd 1.81898 1.00000 1.82TiB 1.59TiB 239GiB 87.19 1.04 151
7 hdd 1.81898 1.00000 1.82TiB 1.54TiB 287GiB 84.59 1.01 129
8 hdd 1.81898 1.00000 1.82TiB 1.32TiB 508GiB 72.73 0.87 121
9 hdd 1.81898 1.00000 1.82TiB 1.44TiB 387GiB 79.24 0.95 135
11 hdd 1.81898 1.00000 1.82TiB 1.37TiB 457GiB 75.46 0.90 131
12 hdd 1.81898 1.00000 1.82TiB 1.56TiB 261GiB 86.01 1.03 152
13 hdd 1.81898 1.00000 1.82TiB 1.56TiB 262GiB 85.94 1.03 146
3 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 128
10 ssd 0.13599 1.00000 140GiB 1.00GiB 139GiB 0.72 0.01 129
TOTAL 22.1TiB 18.5TiB 3.64TiB 83.55
MIN/MAX VAR: 0.01/1.18 STDDEV: 31.87
[root@node1 ~]#
3.结束语
更多详情参见
ceph官网