1. SkyWalking: collecting Dubbo microservice traces in a virtual-machine environment
Prepare three servers:
172.20.20.212 dubbo-server1
172.20.20.213 zookeeper1
172.20.20.214 dubbo-client
1.1 Deploy zookeeper1
#Install the Java runtime
apt install openjdk-11-jdk
#Install ZooKeeper
wget https://archive.apache.org/dist/zookeeper/zookeeper-3.6.4/apache-zookeeper-3.6.4-bin.tar.gz
tar xvf apache-zookeeper-3.6.4-bin.tar.gz -C /apps/
#Create the configuration file
cp /apps/apache-zookeeper-3.6.4-bin/conf/zoo_sample.cfg /apps/apache-zookeeper-3.6.4-bin/conf/zoo.cfg
mkdir -p /data/zookeeper
# cat /apps/apache-zookeeper-3.6.4-bin/conf/zoo.cfg |grep -v "#"
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/data/zookeeper
clientPort=2181
#Start ZooKeeper
/apps/apache-zookeeper-3.6.4-bin/bin/zkServer.sh start
#Check the listening ports
# ss -nptl
State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
...
LISTEN 0 50 *:8080 *:* users:(("java",pid=83126,fd=46))
LISTEN 0 50 *:36353 *:* users:(("java",pid=83126,fd=44))
LISTEN 0 50 *:2181 *:* users:(("java",pid=83126,fd=54))
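ZooKeeper's own status command gives a second confirmation that the instance is serving (this is a single node, so it runs in standalone mode):
/apps/apache-zookeeper-3.6.4-bin/bin/zkServer.sh status
#Should report the client port 2181 and Mode: standalone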
1.2 Deploy dubbo-server1
#Install the Java runtime
apt install openjdk-8-jdk
#Install the SkyWalking agent
wget https://archive.apache.org/dist/skywalking/java-agent/8.8.0/apache-skywalking-java-agent-8.8.0.tgz
tar -xf apache-skywalking-java-agent-8.8.0.tgz -C /apps/
vim /apps/skywalking-agent/config/agent.config
agent.namespace=${SW_AGENT_NAMESPACE:dubbo} #custom namespace, similar to a project name
agent.service_name=${SW_AGENT_NAME:dubbo-server1} #custom service name, i.e. the name of this specific microservice
collector.backend_service=${SW_AGENT_COLLECTOR_BACKEND_SERVICES:172.20.20.211:11800} #SkyWalking OAP address and gRPC data port
#Upload dubbo-server.jar
mv dubbo-server.jar /apps/
#Edit the hosts file (alternatively, change the ZooKeeper address inside the jar's configuration file from hostname to IP address, in which case no hosts entry is needed)
echo "172.20.20.213 zookeeper1" >> /etc/hosts
#Start the service
java -javaagent:/apps/skywalking-agent/skywalking-agent.jar -jar /apps/dubbo-server.jar
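Because the values in agent.config are written as ${SW_AGENT_...:default} placeholders, they can also be overridden at startup through environment variables instead of editing the file; a sketch of an equivalent launch:
export SW_AGENT_NAME=dubbo-server1
export SW_AGENT_COLLECTOR_BACKEND_SERVICES=172.20.20.211:11800
java -javaagent:/apps/skywalking-agent/skywalking-agent.jar -jar /apps/dubbo-server.jar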
1.3 Deploy dubbo-client1
#Install the Java runtime
apt install openjdk-8-jdk
#Install the SkyWalking agent
wget https://archive.apache.org/dist/skywalking/java-agent/8.8.0/apache-skywalking-java-agent-8.8.0.tgz
tar -xf apache-skywalking-java-agent-8.8.0.tgz -C /apps/
vim /apps/skywalking-agent/config/agent.config
agent.namespace=${SW_AGENT_NAMESPACE:dubbo} #custom namespace, similar to a project name
agent.service_name=${SW_AGENT_NAME:dubbo-client1} #custom service name, i.e. the name of this specific microservice
collector.backend_service=${SW_AGENT_COLLECTOR_BACKEND_SERVICES:172.20.20.211:11800} #SkyWalking OAP address and gRPC data port
#Upload dubbo-client.jar
mv dubbo-client.jar /apps/
#Edit the hosts file (alternatively, change the ZooKeeper addresses inside the jar's configuration file from hostnames to IP addresses, in which case no hosts entries are needed)
echo "172.20.20.213 zookeeper1" >> /etc/hosts
echo "172.20.20.213 zookeeper2" >> /etc/hosts
echo "172.20.20.213 zookeeper3" >> /etc/hosts
#Start the service
java -javaagent:/apps/skywalking-agent/skywalking-agent.jar -jar /apps/dubbo-client.jar
1.4 Check the registration data in ZooKeeper
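One way to inspect what dubbo-server1 registered is the ZooKeeper CLI shipped with the installation above (/dubbo is Dubbo's default registry root; adjust the address if connecting from another host):
/apps/apache-zookeeper-3.6.4-bin/bin/zkCli.sh -server 172.20.20.213:2181
ls /dubbo
#Each registered service interface appears as a child node, with providers/consumers underneath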
1.5 Verify the dubbo-client page
Open http://172.20.20.214:8080/hello
Pass a parameter via the URL: http://172.20.20.214:8080/hello?name=Tom
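The same request can be issued from the command line, which is handy for generating a batch of traces for SkyWalking:
curl "http://172.20.20.214:8080/hello?name=Tom"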
1.6 Verify in the SkyWalking UI
2. SkyWalking: collecting Dubbo microservice traces in a Kubernetes environment
2.1 Deploy ZooKeeper
#Prepare the PVs
# cat zookeeper-persistentvolume.yaml
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: zookeeper-datadir-pv-1
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  nfs:
    server: 172.20.20.81
    path: /data/k8sdata/web/zookeeper-datadir-1
  mountOptions:
    - nfsvers=3
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: zookeeper-datadir-pv-2
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  nfs:
    server: 172.20.20.81
    path: /data/k8sdata/web/zookeeper-datadir-2
  mountOptions:
    - nfsvers=3
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: zookeeper-datadir-pv-3
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  nfs:
    server: 172.20.20.81
    path: /data/k8sdata/web/zookeeper-datadir-3
  mountOptions:
    - nfsvers=3
#Apply
# kubectl apply -f zookeeper-persistentvolume.yaml
#Check
# kubectl get pv
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
zookeeper-datadir-pv-1 10Gi RWO Retain Bound web/zookeeper-datadir-pvc-1 5d22h
zookeeper-datadir-pv-2 10Gi RWO Retain Bound web/zookeeper-datadir-pvc-2 5d22h
zookeeper-datadir-pv-3 10Gi RWO Retain Bound web/zookeeper-datadir-pvc-3 5d22h
#Prepare the PVCs
# cat zookeeper-persistentvolumeclaim.yaml
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: zookeeper-datadir-pvc-1
  namespace: web
spec:
  accessModes:
    - ReadWriteOnce
  volumeName: zookeeper-datadir-pv-1
  resources:
    requests:
      storage: 10Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: zookeeper-datadir-pvc-2
  namespace: web
spec:
  accessModes:
    - ReadWriteOnce
  volumeName: zookeeper-datadir-pv-2
  resources:
    requests:
      storage: 10Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: zookeeper-datadir-pvc-3
  namespace: web
spec:
  accessModes:
    - ReadWriteOnce
  volumeName: zookeeper-datadir-pv-3
  resources:
    requests:
      storage: 10Gi
#Apply
# kubectl apply -f zookeeper-persistentvolumeclaim.yaml
#Check
# kubectl get pvc -n web
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
zookeeper-datadir-pvc-1 Bound zookeeper-datadir-pv-1 10Gi RWO 5d22h
zookeeper-datadir-pvc-2 Bound zookeeper-datadir-pv-2 10Gi RWO 5d22h
zookeeper-datadir-pvc-3 Bound zookeeper-datadir-pv-3 10Gi RWO 5d22h
#Prepare the ZooKeeper Services and Deployments
# cat zookeeper.yaml
apiVersion: v1
kind: Service
metadata:
  name: zookeeper
  namespace: web
spec:
  ports:
    - name: client
      port: 2181
  selector:
    app: zookeeper
---
apiVersion: v1
kind: Service
metadata:
  name: zookeeper1
  namespace: web
spec:
  type: NodePort
  ports:
    - name: client
      port: 2181
      nodePort: 32181
    - name: followers
      port: 2888
    - name: election
      port: 3888
  selector:
    app: zookeeper
    server-id: "1"
---
apiVersion: v1
kind: Service
metadata:
  name: zookeeper2
  namespace: web
spec:
  type: NodePort
  ports:
    - name: client
      port: 2181
      nodePort: 32182
    - name: followers
      port: 2888
    - name: election
      port: 3888
  selector:
    app: zookeeper
    server-id: "2"
---
apiVersion: v1
kind: Service
metadata:
  name: zookeeper3
  namespace: web
spec:
  type: NodePort
  ports:
    - name: client
      port: 2181
      nodePort: 32183
    - name: followers
      port: 2888
    - name: election
      port: 3888
  selector:
    app: zookeeper
    server-id: "3"
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: zookeeper1
  namespace: web
spec:
  replicas: 1
  selector:
    matchLabels:
      app: zookeeper
  template:
    metadata:
      labels:
        app: zookeeper
        server-id: "1"
    spec:
      volumes:
        - name: data
          emptyDir: {}
        - name: wal
          emptyDir: {}
        - name: zookeeper-datadir-pvc-1
          persistentVolumeClaim:
            claimName: zookeeper-datadir-pvc-1
      containers:
        - name: server
          image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/zookeeper:v3.4.14
          imagePullPolicy: Always
          env:
            - name: MYID
              value: "1"
            - name: SERVERS
              value: "zookeeper1,zookeeper2,zookeeper3"
            - name: JVMFLAGS
              value: "-Xmx2G"
          ports:
            - containerPort: 2181
            - containerPort: 2888
            - containerPort: 3888
          volumeMounts:
            - mountPath: "/zookeeper/data"
              name: zookeeper-datadir-pvc-1
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: zookeeper2
  namespace: web
spec:
  replicas: 1
  selector:
    matchLabels:
      app: zookeeper
  template:
    metadata:
      labels:
        app: zookeeper
        server-id: "2"
    spec:
      volumes:
        - name: data
          emptyDir: {}
        - name: wal
          emptyDir: {}
        - name: zookeeper-datadir-pvc-2
          persistentVolumeClaim:
            claimName: zookeeper-datadir-pvc-2
      containers:
        - name: server
          image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/zookeeper:v3.4.14
          imagePullPolicy: Always
          env:
            - name: MYID
              value: "2"
            - name: SERVERS
              value: "zookeeper1,zookeeper2,zookeeper3"
            - name: JVMFLAGS
              value: "-Xmx2G"
          ports:
            - containerPort: 2181
            - containerPort: 2888
            - containerPort: 3888
          volumeMounts:
            - mountPath: "/zookeeper/data"
              name: zookeeper-datadir-pvc-2
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: zookeeper3
  namespace: web
spec:
  replicas: 1
  selector:
    matchLabels:
      app: zookeeper
  template:
    metadata:
      labels:
        app: zookeeper
        server-id: "3"
    spec:
      volumes:
        - name: data
          emptyDir: {}
        - name: wal
          emptyDir: {}
        - name: zookeeper-datadir-pvc-3
          persistentVolumeClaim:
            claimName: zookeeper-datadir-pvc-3
      containers:
        - name: server
          image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/zookeeper:v3.4.14
          imagePullPolicy: Always
          env:
            - name: MYID
              value: "3"
            - name: SERVERS
              value: "zookeeper1,zookeeper2,zookeeper3"
            - name: JVMFLAGS
              value: "-Xmx2G"
          ports:
            - containerPort: 2181
            - containerPort: 2888
            - containerPort: 3888
          volumeMounts:
            - mountPath: "/zookeeper/data"
              name: zookeeper-datadir-pvc-3
#Apply
# kubectl apply -f zookeeper.yaml
#Check
# kubectl get pod -n web
NAME READY STATUS RESTARTS AGE
zookeeper1-79ff6d995b-kstn7 1/1 Running 0 5d22h
zookeeper2-56dcd47bd5-sdn64 1/1 Running 0 5d22h
zookeeper3-6d9cd4dc6b-n2xhw 1/1 Running 0 5d22h
#Verify the cluster status
# kubectl exec -it zookeeper1-79ff6d995b-kstn7 bash -n web
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
bash-4.3# /zookeeper/bin/zkServer.sh status
ZooKeeper JMX enabled by default
ZooKeeper remote JMX Port set to 9010
ZooKeeper remote JMX authenticate set to false
ZooKeeper remote JMX ssl set to false
ZooKeeper remote JMX log4j set to true
Using config: /zookeeper/bin/../conf/zoo.cfg
Mode: follower
bash-4.3# exit
exit
# kubectl exec -it zookeeper2-56dcd47bd5-sdn64 bash -n web
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
bash-4.3# /zookeeper/bin/zkServer.sh status
ZooKeeper JMX enabled by default
ZooKeeper remote JMX Port set to 9010
ZooKeeper remote JMX authenticate set to false
ZooKeeper remote JMX ssl set to false
ZooKeeper remote JMX log4j set to true
Using config: /zookeeper/bin/../conf/zoo.cfg
Mode: leader
bash-4.3# exit
exit
# kubectl exec -it zookeeper3-6d9cd4dc6b-n2xhw bash -n web
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
bash-4.3# /zookeeper/bin/zkServer.sh status
ZooKeeper JMX enabled by default
ZooKeeper remote JMX Port set to 9010
ZooKeeper remote JMX authenticate set to false
ZooKeeper remote JMX ssl set to false
ZooKeeper remote JMX log4j set to true
Using config: /zookeeper/bin/../conf/zoo.cfg
Mode: follower
bash-4.3# exit
exit
The ZooKeeper cluster is healthy: one leader and two followers.
2.2 Deploy dubbo-provider
2.2.1 Modify the dubbo-provider configuration
#Edit config.properties inside dubbo-server.jar so it points at the ZooKeeper cluster running in Kubernetes; the new addresses such as zookeeper1.web.svc.zhao.local are resolvable and reachable from inside the cluster
# vim config.properties
dubbo.registry=zookeeper://zookeeper1.web.svc.zhao.local:2181?backup=zookeeper2.web.svc.zhao.local:2181,zookeeper3.web.svc.zhao.local:2181
dubbo.port=20880
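Before baking the image it is worth confirming that these names resolve inside the cluster; a quick check from a throwaway pod in the web namespace (busybox:1.28 is just a convenient image for nslookup, not something used elsewhere in this environment):
kubectl run -n web dns-test --rm -it --image=busybox:1.28 --restart=Never -- nslookup zookeeper1.web.svc.zhao.local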
2.2.2 Configure the SkyWalking agent
#Download and configure the SkyWalking agent
wget https://archive.apache.org/dist/skywalking/java-agent/8.8.0/apache-skywalking-java-agent-8.8.0.tgz
tar -xf apache-skywalking-java-agent-8.8.0.tgz
vim skywalking-agent/config/agent.config
agent.namespace=${SW_AGENT_NAMESPACE:myweb} #custom namespace, similar to a project name
agent.service_name=${SW_AGENT_NAME:application-provider} #custom service name, i.e. the name of this specific microservice
collector.backend_service=${SW_AGENT_COLLECTOR_BACKEND_SERVICES:172.20.20.211:11800} #SkyWalking OAP address and gRPC data port
2.2.3 Prepare the startup script
#Startup script
# cat run_java.sh
#!/bin/bash
su - user1 -c "java -javaagent:/skywalking-agent/skywalking-agent.jar -jar /apps/dubbo/provider/dubbo-server.jar"
2.2.4 Prepare the Dockerfile and build the image
#Dockerfile
# cat Dockerfile
#Dubbo provider
FROM harbor.zhao.net/baseimages/jdk-base:v8.212
RUN yum install file nc -y && useradd user1 -u 2000
RUN mkdir -p /apps/dubbo/provider
ADD dubbo-server.jar /apps/dubbo/provider/
ADD run_java.sh /apps/dubbo/provider/bin/
ADD skywalking-agent/ /skywalking-agent/
RUN chown user1.user1 /apps /skywalking-agent -R
RUN chmod a+x /apps/dubbo/provider/bin/*.sh
CMD ["/apps/dubbo/provider/bin/run_java.sh"]
#Build the image and push it to Harbor
# cat build-command.sh
#!/bin/bash
nerdctl build -t harbor.zhao.net/zhao/dubbo-provider:v1-2023-zookeeper1 .
sleep 1
nerdctl push harbor.zhao.net/zhao/dubbo-provider:v1-2023-zookeeper1
# ./build-command.sh
2.2.5 Prepare the YAML manifest and apply it
# cat provider.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
  labels:
    app: myweb-provider
  name: myweb-provider-deployment
  namespace: web
spec:
  replicas: 3
  selector:
    matchLabels:
      app: myweb-provider
  template:
    metadata:
      labels:
        app: myweb-provider
    spec:
      containers:
        - name: myweb-provider-container
          #image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/dubboadmin:v2.5.3-2022092301-zookeeper1
          #image: harbor.magedu.net/magedu/dubbo-provider:v1-2022092301-zookeeper1
          image: harbor.zhao.net/zhao/dubbo-provider:v1-2023-zookeeper1
          #imagePullPolicy: IfNotPresent
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
              protocol: TCP
              name: http
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: myweb-provider
  name: myweb-provider-spec
  namespace: web
spec:
  type: NodePort
  ports:
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
      nodePort: 38800
  selector:
    app: myweb-provider
#Apply
# kubectl apply -f provider.yaml
#Check
# kubectl get pod -n web |grep provider
myweb-provider-deployment-546f468d48-7bdbz 1/1 Running 0 154m
myweb-provider-deployment-546f468d48-7w9g7 1/1 Running 0 163m
myweb-provider-deployment-546f468d48-zhtkh 1/1 Running 0 163m
# kubectl logs -f -n web myweb-provider-deployment-546f468d48-7bdbz
...
Dubbo server started
Dubbo 服务端已经启动
2.2.6 Verify via the ZooKeeper client
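For example, the client shipped inside one of the ZooKeeper pods can be used to confirm the provider registered itself (the pod name is taken from the output above; the /zookeeper/bin/zkCli.sh path is an assumption based on this image's layout):
kubectl exec -it -n web zookeeper1-79ff6d995b-kstn7 -- /zookeeper/bin/zkCli.sh -server 127.0.0.1:2181
ls /dubbo
#The provider's service interface should appear under /dubbo with a providers child node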
2.3 Deploy dubbo-consumer
2.3.1 Modify the dubbo-consumer configuration
Modify it the same way as for dubbo-provider.
2.3.2 Configure the SkyWalking agent
The procedure is the same as for dubbo-provider.
vim skywalking-agent/config/agent.config
agent.namespace=${SW_AGENT_NAMESPACE:myweb} #custom namespace, similar to a project name
agent.service_name=${SW_AGENT_NAME:application-consumer} #custom service name, i.e. the name of this specific microservice
collector.backend_service=${SW_AGENT_COLLECTOR_BACKEND_SERVICES:172.20.20.211:11800} #SkyWalking OAP address and gRPC data port
2.3.3 Prepare the startup script
# cat run_java.sh
#!/bin/bash
su - user1 -c "java -javaagent:/skywalking-agent/skywalking-agent.jar -jar /apps/dubbo/consumer/dubbo-client.jar"
2.3.4 Prepare the Dockerfile and build the image
# cat Dockerfile
#Dubbo consumer
FROM harbor.zhao.net/baseimages/jdk-base:v8.212
RUN yum install file -y
RUN mkdir -p /apps/dubbo/consumer && useradd user1 -u 2000
ADD run_java.sh /apps/dubbo/consumer/bin/
ADD skywalking-agent/ /skywalking-agent/
ADD dubbo-client.jar /apps/dubbo/consumer/dubbo-client.jar
RUN chown user1.user1 /apps /skywalking-agent -R
RUN chmod a+x /apps/dubbo/consumer/bin/*.sh
CMD ["/apps/dubbo/consumer/bin/run_java.sh"]
#Build the image and push it to Harbor
# cat build-command.sh
#!/bin/bash
TAG=$1
nerdctl build -t harbor.zhao.net/zhao/dubbo-consumer:${TAG} .
nerdctl push harbor.zhao.net/zhao/dubbo-consumer:${TAG}
# ./build-command.sh v1-2023-zookeeper1
2.3.5 Prepare the YAML manifest and apply it
# cat consumer.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
  labels:
    app: myweb-consumer
  name: myweb-consumer-deployment
  namespace: web
spec:
  replicas: 1
  selector:
    matchLabels:
      app: myweb-consumer
  template:
    metadata:
      labels:
        app: myweb-consumer
    spec:
      #imagePullSecrets: (empty in the original manifest; add a registry pull secret here if Harbor requires authentication)
      containers:
        - name: myweb-consumer-container
          image: harbor.zhao.net/zhao/dubbo-consumer:v1-2023-zookeeper1
          #imagePullPolicy: IfNotPresent
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
              protocol: TCP
              name: http
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: myweb-consumer
  name: myweb-consumer-server
  namespace: web
spec:
  type: NodePort
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 8080
      nodePort: 30001
  selector:
    app: myweb-consumer
#Apply
# kubectl apply -f consumer.yaml
#Check
# kubectl get pod -n web |grep consumer
myweb-consumer-deployment-5b474fc9bc-dtw4f 1/1 Running 0 142m
# kubectl logs -f -n web myweb-consumer-deployment-5b474fc9bc-dtw4f
...
Dubbo client started
Dubbo 消费者端启动
2.3.6 Verify via the ZooKeeper client
2.4 Deploy the dubboadmin container
This step is optional; it only provides a web UI for inspection.
2.4.1 Create the Tomcat container startup script
# cat run_tomcat.sh
#!/bin/bash
su - user1 -c "/apps/tomcat/bin/catalina.sh start"
su - user1 -c "tail -f /etc/hosts"
2.4.2 Modify the dubboadmin configuration
#Set the ZooKeeper service address and the passwords of the root and guest users
# unzip dubboadmin.war
# vim dubboadmin/WEB-INF/dubbo.properties
dubbo.registry.address=zookeeper://zookeeper1.web.svc.zhao.local:2181
dubbo.admin.root.password=root
dubbo.admin.guest.password=guest
2.4.3 Prepare the Dockerfile and build the image
# cat Dockerfile
#Dubbo dubboadmin
#FROM harbor.magedu.local/pub-images/tomcat-base:v8.5.43
FROM harbor.zhao.net/pub-images/tomcat-base:v8.5.43
MAINTAINER zhao
ADD server.xml /apps/tomcat/conf/server.xml
ADD logging.properties /apps/tomcat/conf/logging.properties
ADD catalina.sh /apps/tomcat/bin/catalina.sh
ADD run_tomcat.sh /apps/tomcat/bin/run_tomcat.sh
ADD dubboadmin/ /data/tomcat/webapps/dubboadmin
RUN useradd user1 && chown -R user1.user1 /data /apps
EXPOSE 8080 8443
CMD ["/apps/tomcat/bin/run_tomcat.sh"]
#Build the image and push it to Harbor
# cat build-command.sh
#!/bin/bash
TAG=$1
nerdctl build -t harbor.zhao.net/zhao/dubboadmin:${TAG} .
nerdctl push harbor.zhao.net/zhao/dubboadmin:${TAG}
# ./build-command.sh v1-2023
2.4.4 Prepare the YAML manifest and apply it
# cat dubboadmin.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
  labels:
    app: myweb-dubboadmin
  name: myweb-dubboadmin-deployment
  namespace: web
spec:
  replicas: 1
  selector:
    matchLabels:
      app: myweb-dubboadmin
  template:
    metadata:
      labels:
        app: myweb-dubboadmin
    spec:
      containers:
        - name: myweb-dubboadmin-container
          image: harbor.zhao.net/zhao/dubboadmin:v1-2023
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
              protocol: TCP
              name: http
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: myweb-dubboadmin
  name: myweb-dubboadmin-service
  namespace: web
spec:
  type: NodePort
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 8080
      nodePort: 30080
  selector:
    app: myweb-dubboadmin
#Apply
# kubectl apply -f dubboadmin.yaml
#Check
# kubectl get pod -n web |grep dubboadmin
myweb-dubboadmin-deployment-5664b55669-g4ktp 1/1 Running 0 88m
2.4.5 Access the web UI
Both the username and the password are root.
2.5 Verify in the SkyWalking UI
3. SkyWalking alerting to DingTalk
3.1 Relevant alerting metrics
- service_resp_time #service response time
- service_sla #service HTTP request success rate (SLA), e.g. 99%
- service_cpm #service throughput in calls per minute
- service_apdex #application performance index (Apdex), a value between 0 and 1, e.g. 0.8
- service_percentile #response-time percentiles over the most recent data window, i.e. p50, p75, p90, p95 and p99
- endpoint_relation_cpm #endpoint throughput in calls per minute
- endpoint_relation_resp_time #endpoint response time
- endpoint_relation_sla #endpoint HTTP request success rate (SLA), e.g. 99%
- endpoint_relation_percentile #endpoint response-time percentiles, i.e. p50, p75, p90, p95 and p99
3.2 Configure the alarm rules
# cat alarm-settings.yml
rules: #define the alarm rules
  service_cpm_rule: #unique rule name, must end with _rule
    metrics-name: service_cpm #metric name
    op: ">" #operator: >, >=, <, <=, ==
    threshold: 1 #metric threshold
    period: 2 #how often (in minutes) the metric is evaluated
    count: 1 #how many matches trigger the alarm
    silence-period: 2 #silence period after the alarm fires
    message: dubbo-provider service_cpm is greater than 1 #alarm message
dingtalkHooks:
  textTemplate: |-
    {
      "msgtype": "text", "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=466956bf7168e01a776f2cf5f285754d4e2606362871e80e01fe03ec46f63ec9
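The webhook itself can be tested independently of SkyWalking with a manual POST (if the robot was created with a keyword filter, the content must contain that keyword or DingTalk rejects the message):
curl -s -H 'Content-Type: application/json' -d '{"msgtype":"text","text":{"content":"Apache SkyWalking Alarm: test message"}}' 'https://oapi.dingtalk.com/robot/send?access_token=466956bf7168e01a776f2cf5f285754d4e2606362871e80e01fe03ec46f63ec9'
#A JSON reply with errcode 0 means the message was accepted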
3.3 Modify the SkyWalking docker-compose file and start it
# cat docker-compose.yaml
version: '3.3'
services:
  es7:
    image: elasticsearch:7.10.1
    container_name: es7
    ports:
      - 9200:9200
      - 9300:9300
    environment:
      - discovery.type=single-node #single-node mode
      - bootstrap.memory_lock=true #lock memory to prevent swapping
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m" #JVM heap size
      - TZ=Asia/Shanghai
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - /data/elasticsearch/data:/usr/share/elasticsearch/data
  skywalking-oap:
    image: apache/skywalking-oap-server:8.6.0-es7
    container_name: skywalking-oap
    restart: always
    volumes:
      - /apps/skywalking-docker-compose/alarm-settings.yml:/skywalking/config/alarm-settings.yml #mount the alarm configuration file
    depends_on:
      - es7
    links:
      - es7
    ports:
      - 11800:11800
      - 12800:12800
    environment:
      TZ: Asia/Shanghai
      SW_STORAGE: elasticsearch7
      SW_STORAGE_ES_CLUSTER_NODES: es7:9200
  skywalking-ui:
    image: apache/skywalking-ui:8.6.0
    container_name: skywalking-ui
    restart: always
    depends_on:
      - skywalking-oap
    links:
      - skywalking-oap
    ports:
      - 8080:8080
    environment:
      TZ: Asia/Shanghai
      SW_OAP_ADDRESS: skywalking-oap:12800
#Start
# docker-compose up -d
#Check
# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
1ed4152b3ea9 apache/skywalking-ui:8.6.0 "bash docker-entrypo…" About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp skywalking-ui
1c26a6264ecf apache/skywalking-oap-server:8.6.0-es7 "bash docker-entrypo…" About a minute ago Up About a minute 0.0.0.0:11800->11800/tcp, :::11800->11800/tcp, 1234/tcp, 0.0.0.0:12800->12800/tcp, :::12800->12800/tcp skywalking-oap
069df787e6bc elasticsearch:7.10.1 "/tini -- /usr/local…" About a minute ago Up About a minute 0.0.0.0:9200->9200/tcp, :::9200->9200/tcp, 0.0.0.0:9300->9300/tcp, :::9300->9300/tcp es7
3.4 Configure the DingTalk robot
Set the robot name, save the webhook URL and add the keyword filter.
3.5 Verify
Verify that the SkyWalking server triggers the alarm and the message arrives in DingTalk.
4. Overview of Ceph components
4.1 Introduction to Ceph
Ceph is an open-source distributed storage system that provides object storage, block devices and a file system at the same time.
Ceph scales to EB-level data (1 EB = 1,000,000,000 GB). Ceph splits every data stream it manages (files and other data) into one or more fixed-size objects (4 MB by default) and reads and writes data with the object as the atomic unit.
Ceph's underlying storage service is a storage cluster made up of multiple storage hosts; this cluster is also called RADOS (Reliable Automatic Distributed Object Store), i.e. a reliable, automatic, distributed object store.
librados is the API of the RADOS cluster and has client bindings for C/C++/Java/Python/Ruby/PHP/Go and other programming languages.
4.2 Components of a Ceph cluster
LIBRADOS, RADOSGW, RBD and CephFS are collectively called the Ceph client interfaces; RADOSGW, RBD and CephFS are built on top of the multi-language interfaces provided by LIBRADOS.
A Ceph cluster consists of:
- several Ceph OSDs (object storage daemons)
- at least one Ceph Monitor (an odd number: 1, 3, 5, 7, ...)
- two or more Ceph Managers
- a highly available Ceph Metadata Server (MDS) when Ceph file system clients are used
- RADOS cluster: the Ceph cluster formed by multiple storage hosts
- OSD (Object Storage Daemon): the storage space provided by the disks of each storage server
- Mon (Monitor): the Ceph monitor, which maintains the cluster state of OSDs and PGs; a cluster needs at least one mon, normally an odd number (1, 3, 5, 7, ...)
- Mgr (Manager): tracks runtime metrics and the current state of the cluster, including storage utilization, current performance metrics and system load
Monitor (ceph-mon), the Ceph monitor:
a daemon running on a host that maintains maps of the cluster state, for example how many pools the cluster has, how many PGs each pool has, and the pool-to-PG mapping: the monitor map, manager map, OSD map, MDS map and CRUSH map. These maps are the critical cluster state the Ceph daemons need to coordinate with each other. The monitors are also responsible for authentication between daemons and clients (using the cephx protocol). At least three monitors are normally required for redundancy and high availability.
Managers (ceph-mgr):
a daemon running on a host; the Ceph Manager daemon (ceph-mgr) tracks runtime metrics and the current state of the cluster, including storage utilization, current performance metrics and system load. The manager daemons also host Python-based modules that manage and expose cluster information, including the web-based Ceph dashboard and the REST API. At least two managers are normally required for high availability.
Ceph OSDs (object storage daemon, ceph-osd):
store the data. One disk of the operating system corresponds to one OSD daemon. OSDs handle data replication, recovery and rebalancing for the cluster, and provide monitoring information to the monitors and managers by checking the heartbeats of other OSD daemons. At least three OSDs are normally required for redundancy and high availability.
MDS (Ceph metadata server, ceph-mds):
stores metadata on behalf of the Ceph file system (which can in turn be exported via NFS/CIFS); Ceph block devices and Ceph object storage do not use MDS.
4.3 The Ceph admin node
- Ceph's usual management interface is a set of command-line tools such as rados, ceph and rbd; an administrator can run management operations from one of the ceph-mon nodes.
- It is recommended to deploy a dedicated admin node for configuring, upgrading and maintaining the cluster. This makes later permission management easier: only administrators get access to the admin node, which helps avoid unnecessary mistakes.
4.4 Ceph logical architecture
- Pool: a storage pool (partition); the size of a pool depends on the underlying storage space.
- PG (placement group): a pool contains multiple PGs; pool and PG are both abstract logical concepts, and the number of PGs in a pool can be calculated with a formula (a common rule of thumb is shown below).
- OSD (Object Storage Daemon): each disk is one OSD, and a host contributes one or more OSDs.
After the cluster is deployed, a storage pool must be created before data can be written to Ceph. Before a file is saved, a consistent hash is computed over it; the file is then stored in the corresponding PG, so every object belongs to exactly one PG of one pool, and the PG in turn places it on OSDs. A data object is written to the primary OSD first and then replicated to the secondary OSDs to achieve high availability.
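A commonly cited rule of thumb for sizing PGs (a community guideline, not a value taken from this environment) targets roughly 100 PGs per OSD across all pools:
#total PGs ≈ (number of OSDs x 100) / replica count, rounded to a power of 2
#e.g. 20 OSDs with 3 replicas: 20 x 100 / 3 ≈ 667, so about 512 PGs in total across all pools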
(figure: Ceph logical architecture diagram, image.png)
5. Deploying Ceph 16.2.x with ceph-deploy (single-node mon and mgr)
5.1 Deployment options
ceph-ansible: https://github.com/ceph/ceph-ansible #python
ceph-salt: https://github.com/ceph/ceph-salt #python
ceph-container: https://github.com/ceph/ceph-container #shell
ceph-chef: https://github.com/ceph/ceph-chef #Ruby
cephadm: https://docs.ceph.com/en/latest/cephadm/ #the official deployment tool added in Ceph 15
ceph-deploy: https://github.com/ceph/ceph-deploy #python
ceph-deploy is a tool maintained by the Ceph project for deploying Ceph clusters from the command line. It runs shell commands with sudo privileges over SSH, plus some Python scripts, to deploy and maintain a cluster.
ceph-deploy is only used to deploy and manage the cluster; a client that needs to access Ceph must have the client tools installed separately.
5.2 Server preparation
Goal: build a reliable, low-cost, scalable, high-performance distributed storage system that is closely integrated with the business.
5.2.1 Planning principles/goals for the Ceph storage cluster
- Lower TCO (Total Cost of Ownership): use inexpensive x86 servers.
- Higher IOPS (Input/Output Operations Per Second): use SSD/PCIe SSD/NVMe drives to improve the read/write performance of the storage cluster.
- Larger capacity: use single disks of 2 TB/4 TB or more to increase the capacity of each server, reducing the total number of servers and the rack space needed.
- Faster network throughput: use 10G, 40G, 100G or faster fiber networks.
- Better data redundancy: data is kept as three replicas on different hosts, so even if two hosts go down no data is lost.
5.2.2 Server hardware selection
https://docs.ceph.com/en/latest/start/hardware-recommendations/ #official hardware recommendations
monitor, mgr, radosgw nodes:
4C 8G~16G (small, dedicated VM), 8C 16G~32G (medium, dedicated VM), 16C~32C 32G~64G (large/extra large, dedicated physical machine)
MDS (one class of configuration higher):
8C 8G~16G (small, dedicated VM), 16C 16G~32G (medium, dedicated VM), 32C~64C 64G~96G (large/extra large, physical machine)
OSD node CPU:
Each OSD process needs at least one CPU core. For example, a server with two CPUs of 12 cores / 24 threads each has 48 cores in total, so it can hold at most 48 OSD disks:
(number of physical CPUs x cores per CPU) / number of OSD disks >= 1 core per OSD
e.g. (2 CPUs x 24 cores) / 24 OSD disks = 2 cores per OSD >= 1 core
OSD node memory:
An OSD disk of up to 2 TB needs about 2 GB of memory and a 4 TB disk about 4 GB, i.e. roughly (at least) 1 GB of memory per 1 TB of disk space as read/write cache:
(total memory / total OSD disk space) >= 1 GB per TB
e.g. 128 GB memory / 36 TB of disks = about 3 GB per TB > 1 GB
5.2.3 Deployment environment
- Four servers as Ceph OSD storage nodes. Each server has two networks: the public network for client access and the cluster network for cluster management and data replication. Each server has three or more data disks.
172.20.20.226/192.168.20.226
172.20.20.227/192.168.20.227
172.20.20.228/192.168.20.228
172.20.20.229/192.168.20.229
Disk layout on each storage server:
/dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf #100 GB each
- Three servers as Ceph mon monitor nodes; each of them can reach the Ceph cluster network.
172.20.20.221/192.168.20.221
172.20.20.222/192.168.20.222
172.20.20.223/192.168.20.223
- Two ceph-mgr manager servers, which can reach the Ceph cluster network.
172.20.20.224/192.168.20.224
172.20.20.225/192.168.20.225
- One server for deploying the Ceph cluster, i.e. where ceph-deploy is installed; it can also be co-located with ceph-mgr and others.
172.20.20.220/192.168.20.220
- Create an ordinary user that can run privileged commands via sudo and configure hostname resolution; during deployment each host needs its own hostname. On CentOS systems the firewall and SELinux must also be disabled on every server.
5.3 Deploy the RADOS cluster
5.3.1 Repository preparation
Run on all nodes:
#Operating system: Ubuntu 20.04
#Enable https apt repositories:
apt install -y apt-transport-https ca-certificates curl software-properties-common
#Import the repository key:
wget -q -O- 'https://mirrors.tuna.tsinghua.edu.cn/ceph/keys/release.asc' | sudo apt-key add -
#Add the Ceph repository and refresh the apt index
echo 'deb https://mirrors.tuna.tsinghua.edu.cn/ceph/debian-pacific/ focal main' >> /etc/apt/sources.list && apt update
5.3.2 Create the cluster deployment user cephadmin
It is recommended to deploy and run the Ceph cluster as a dedicated ordinary user; the user only needs to be able to run privileged commands non-interactively via sudo. Recent versions of ceph-deploy accept any user that can run sudo, including root, but an ordinary user is still recommended. The installation automatically creates a ceph user (the cluster runs its service processes, such as ceph-osd, as this ceph user by default), so a separate user such as cephuser or cephadmin should be used to deploy and manage the cluster.
cephadmin is only used when deploying and managing the cluster with ceph-deploy, e.g. for the initial bootstrap and deployment, or for adding and removing nodes; on the node, mgr and other hosts the services themselves run as the ceph user.
#Create the cephadmin user on the ceph-deploy node, the storage (node) nodes, the mon nodes and the mgr nodes
groupadd -r -g 2022 cephadmin && useradd -r -m -s /bin/bash -u 2022 -g 2022 cephadmin && echo cephadmin:123456 | chpasswd
#Allow the cephadmin user to run privileged commands via sudo on every node
echo "cephadmin ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
#Configure passwordless SSH for cephadmin
root@ceph-deploy:~# su - cephadmin
cephadmin@ceph-deploy:~$ ssh-keygen
cephadmin@ceph-deploy:~$ for n in {220..229};do ssh-copy-id cephadmin@172.20.20.$n;done
5.3.3 Configure hostname resolution
Configure the same hosts entries on every Ceph node:
cat >>/etc/hosts <<EOF
172.20.20.220 ceph-deploy.example.local ceph-deploy
172.20.20.221 ceph-mon1.example.local ceph-mon1
172.20.20.222 ceph-mon2.example.local ceph-mon2
172.20.20.223 ceph-mon3.example.local ceph-mon3
172.20.20.224 ceph-mgr1.example.local ceph-mgr1
172.20.20.225 ceph-mgr2.example.local ceph-mgr2
172.20.20.226 ceph-node1.example.local ceph-node1
172.20.20.227 ceph-node2.example.local ceph-node2
172.20.20.228 ceph-node3.example.local ceph-node3
172.20.20.229 ceph-node4.example.local ceph-node4
EOF
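A quick sanity check that the entries resolve on a node (getent reads the same resolver path the services will use):
getent hosts ceph-deploy ceph-mon1 ceph-node1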
5.3.4 Install the ceph-deploy tool
apt update
apt install python3-pip -y
wget https://github.com/ceph/ceph-deploy/archive/refs/tags/v2.1.0.zip
unzip v2.1.0.zip
cd ceph-deploy-2.1.0/
python3 setup.py install
#The bundled remoto does not meet the required version, so upgrade remoto first
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple remoto
Finally, check the version:
root@ceph-deploy:~# ceph-deploy --version
2.1.0
ceph-deploy usage:
ceph-deploy -h
usage: ceph-deploy [-h] [-v | -q] [--version] [--username USERNAME] [--overwrite-conf] [--ceph-conf CEPH_CONF] COMMAND ...
Easy Ceph deployment
-^-
/ \
|O o| ceph-deploy v2.1.0
).-.(
'/|||\`
| '|` |
'|`
Full documentation can be found at: http://ceph.com/ceph-deploy/docs
optional arguments:
-h, --help show this help message and exit
-v, --verbose be more verbose
-q, --quiet be less verbose
--version the current installed version of ceph-deploy
--username USERNAME the username to connect to the remote host
--overwrite-conf overwrite an existing conf file on remote host (if present)
--ceph-conf CEPH_CONF
use (or reuse) a given ceph.conf file
commands:
COMMAND description
new        Start deploying a new cluster; generates the cluster ceph.conf and keyring files
install    Install the Ceph packages on remote hosts
mds        Manage MDS daemons (Ceph Metadata Server)
mgr        Manage MGR daemons (ceph-mgr, the Ceph Manager daemon)
mon        Manage MON daemons (ceph-mon, the Ceph monitor)
rgw        Manage RGW daemons (RADOSGW, the object storage gateway)
gatherkeys Gather authentication keys for provisioning new nodes; used when new MON/OSD/MDS nodes join
disk       Manage the disks of remote hosts
osd        Prepare a data disk on a remote host, i.e. add the given disk to the cluster as an OSD
admin      Push the cluster configuration file and the client.admin keyring to remote hosts
config     Push ceph.conf to remote hosts, or copy it back from them
repo       Manage the package repositories of remote hosts
purge      Remove the packages and all data from remote hosts
purgedata  Delete Ceph data from /var/lib/ceph and remove the contents of /etc/ceph
uninstall  Remove the packages from remote hosts
forgetkeys Remove all authentication keyrings from the local host, including the client.admin, monitor and bootstrap keyrings
pkg        Manage packages on remote hosts
See 'ceph-deploy <command> --help' for help on a specific command
5.3.5 Deploy the mon node
#Python 2 has to be installed separately on each node
sudo apt install python2.7 -y
sudo ln -s /usr/bin/python2.7 /usr/bin/python2
#On the deploy node, switch to the cephadmin user
root@ceph-deploy:~# su - cephadmin
#Create the cluster initialization directory
cephadmin@ceph-deploy:~$ mkdir ceph-cluster
cephadmin@ceph-deploy:~$ cd ceph-cluster/
#Initialize the mon node from the deploy node
#ceph-deploy new parameters:
# --cluster-network: the network used internally for cluster management and data replication
# --public-network: the network used by clients to access the storage service
# ceph-mon1.example.local: FQDN/hostname of the mon1 node
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy new --cluster-network 192.168.20.0/24 --public-network 172.20.20.0/24 ceph-mon1.example.local
[ceph_deploy.conf][DEBUG ] found configuration file at: /home/cephadmin/.cephdeploy.conf
[ceph_deploy.cli][INFO ] Invoked (2.1.0): /usr/local/bin/ceph-deploy new --cluster-network 192.168.20.0/24 --public-network 172.20.20.0/24 ceph-mon1.example.local
[ceph_deploy.cli][INFO ] ceph-deploy options:
[ceph_deploy.cli][INFO ] verbose : False
[ceph_deploy.cli][INFO ] quiet : False
[ceph_deploy.cli][INFO ] username : None
[ceph_deploy.cli][INFO ] overwrite_conf : False
[ceph_deploy.cli][INFO ] ceph_conf : None
[ceph_deploy.cli][INFO ] cluster : ceph
[ceph_deploy.cli][INFO ] mon : ['ceph-mon1.example.local']
[ceph_deploy.cli][INFO ] ssh_copykey : True
[ceph_deploy.cli][INFO ] fsid : None
[ceph_deploy.cli][INFO ] cluster_network : 192.168.20.0/24
[ceph_deploy.cli][INFO ] public_network : 172.20.20.0/24
[ceph_deploy.cli][INFO ] cd_conf : <ceph_deploy.conf.cephdeploy.Conf object at 0x7f0409a1c910>
[ceph_deploy.cli][INFO ] default_release : False
[ceph_deploy.cli][INFO ] func : <function new at 0x7f0409a0adc0>
[ceph_deploy.new][DEBUG ] Creating new cluster named ceph
[ceph_deploy.new][INFO ] making sure passwordless SSH succeeds
[ceph-mon1.example.local][DEBUG ] connected to host: ceph-deploy.example.local
[ceph-mon1.example.local][INFO ] Running command: ssh -CT -o BatchMode=yes ceph-mon1.example.local true
[ceph_deploy.new][WARNIN] could not connect via SSH
[ceph_deploy.new][INFO ] will connect again with password prompt
The authenticity of host 'ceph-mon1.example.local (172.20.20.221)' can't be established.
ECDSA key fingerprint is SHA256:szBRNcq/Y56Th5ocRWRnF2/X58UAbwpzlDom6Juu4o0.
Are you sure you want to continue connecting (yes/no/[fingerprint])? yes
Warning: Permanently added 'ceph-mon1.example.local' (ECDSA) to the list of known hosts.
[ceph-mon1.example.local][DEBUG ] connected to host: ceph-mon1.example.local
[ceph_deploy.new][INFO ] adding public keys to authorized_keys
[ceph-mon1.example.local][DEBUG ] connection detected need for sudo
[ceph-mon1.example.local][DEBUG ] connected to host: ceph-mon1.example.local
[ceph-mon1.example.local][INFO ] Running command: sudo /bin/ip link show
[ceph-mon1.example.local][INFO ] Running command: sudo /bin/ip addr show
[ceph-mon1.example.local][DEBUG ] IP addresses found: ['172.20.20.221', '192.168.20.221']
[ceph_deploy.new][DEBUG ] Resolving host ceph-mon1.example.local
[ceph_deploy.new][DEBUG ] Monitor ceph-mon1 at 172.20.20.221
[ceph_deploy.new][DEBUG ] Monitor initial members are ['ceph-mon1']
[ceph_deploy.new][DEBUG ] Monitor addrs are ['172.20.20.221']
[ceph_deploy.new][DEBUG ] Creating a random mon key...
[ceph_deploy.new][DEBUG ] Writing monitor keyring to ceph.mon.keyring...
[ceph_deploy.new][DEBUG ] Writing initial config to ceph.conf...
#Check the generated files
cephadmin@ceph-deploy:~/ceph-cluster$ ll
total 484
drwxrwxr-x 2 cephadmin cephadmin 4096 Oct 26 03:38 ./
drwxr-xr-x 5 cephadmin cephadmin 4096 Oct 26 03:30 ../
-rw-rw-r-- 1 cephadmin cephadmin 266 Oct 26 03:34 ceph.conf
-rw-rw-r-- 1 cephadmin cephadmin 450884 Oct 26 06:11 ceph-deploy-ceph.log
-rw------- 1 cephadmin cephadmin 73 Oct 26 03:34 ceph.mon.keyring
# The Ceph configuration file
cephadmin@ceph-deploy:~/ceph-cluster$ cat ceph.conf
[global]
fsid = 3586e7d1-9315-44e5-85bd-6bd3787ce574
public_network = 172.20.20.0/24
cluster_network = 192.168.20.0/24
mon_initial_members = ceph-mon1
mon_host = 172.20.20.221
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
#Install the mon service on the mon node
root@ceph-mon1:~# apt install ceph-mon
#Initialize the ceph-mon service in the cluster
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy mon create-initial
#Verify the mon node
root@ceph-mon1:~# ps -ef |grep ceph-mon
ceph 10408 1 0 03:38 ? 00:00:00 /usr/bin/ceph-mon -f --cluster ceph --id ceph-mon1 --setuser ceph --setgroup ceph
root 10946 3486 0 03:39 pts/1 00:00:00 grep --color=auto ceph-mon
5.3.6 Deploy the mgr node
#First install ceph-mgr on the mgr node
root@ceph-mgr1:~# apt install ceph-mgr
#Initialize ceph-mgr from the ceph-deploy node
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy mgr create ceph-mgr1
#Verify the mgr node
root@ceph-mgr1:~# ps -ef|grep ceph-mgr
ceph 16710 1 28 03:43 ? 00:00:04 /usr/bin/ceph-mgr -f --cluster ceph --id ceph-mgr1 --setuser ceph --setgroup ceph
root 16878 2611 0 03:43 pts/1 00:00:00 grep --color=auto ceph-mgr
5.3.7 Distribute the admin keyring
Copy the configuration file and the admin keyring from the ceph-deploy node to every node that needs to run ceph management commands, so that later administration does not require passing the ceph-mon address and the ceph.client.admin.keyring file on every command; the ceph-mon nodes also need the cluster configuration file and the keyring.
Managing the cluster from the ceph-deploy node requires the ceph-common package, which contains the ceph command-line tools.
#Install
cephadmin@ceph-deploy:~/ceph-cluster$ sudo apt install ceph-common
#Push the admin keyring to the local deploy node
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy admin ceph-deploy
#Push it to the mon node
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy admin ceph-mon1
#Push it to the storage nodes
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy admin ceph-node1 ceph-node2 ceph-node3 ceph-node4
#Verify the keyring on a Ceph node
root@ceph-node4:/etc/systemd/system/ceph.target.wants# ls /etc/ceph/
ceph.client.admin.keyring ceph.conf rbdmap tmpymzaej_l
For security the admin keyring is owned by the root user and root group by default. If the cephadmin user should also be able to run ceph commands, it needs to be granted access to the keyring.
#Run on all nodes
root@ceph-node4:~# apt install acl -y
root@ceph-node4:~# setfacl -m u:cephadmin:rw /etc/ceph/ceph.client.admin.keyring
#Check the cluster status
cephadmin@ceph-deploy:~/ceph-cluster$ ceph -s
cluster:
id: 3586e7d1-9315-44e5-85bd-6bd3787ce574
health: HEALTH_WARN
mon is allowing insecure global_id reclaim #insecure mode communication needs to be disabled
OSD count 0 < osd_pool_default_size 3 #the cluster has fewer than 3 OSDs
services:
mon: 1 daemons, quorum ceph-mon1 (age 15m)
mgr: ceph-mgr1(active, since 10m)
osd: 0 osds: 0 up, 0 in
data:
pools: 0 pools, 0 pgs
objects: 0 objects, 0 B
usage: 0 B used, 0 B / 0 B avail
pgs:
#Disable insecure global_id reclaim on the mon
cephadmin@ceph-deploy:~/ceph-cluster$ ceph config set mon auth_allow_insecure_global_id_reclaim false
#Check the cluster status again
cephadmin@ceph-deploy:~/ceph-cluster$ ceph -s
cluster:
id: 3586e7d1-9315-44e5-85bd-6bd3787ce574
health: HEALTH_WARN
OSD count 0 < osd_pool_default_size 3
services:
mon: 1 daemons, quorum ceph-mon1 (age 17m)
mgr: ceph-mgr1(active, since 12m)
osd: 0 osds: 0 up, 0 in
data:
pools: 0 pools, 0 pgs
objects: 0 objects, 0 B
usage: 0 B used, 0 B / 0 B avail
pgs:
6. Adding storage nodes and OSDs to the Ceph cluster
6.1 Initialize the Ceph storage nodes
- Initializing a storage node installs the ceph and ceph-radosgw packages on it. With the default upstream repository the initialization often times out because of network issues, so it is recommended to switch each storage node to a domestic mirror such as Aliyun or Tsinghua; see section 5.3.1 above.
6.2 Initialize the node storage servers
#Parameters:
#--no-adjust-repos: do not modify the existing apt repositories (by default the official repository would be configured)
#--nogpgcheck: skip GPG verification
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy install --no-adjust-repos --nogpgcheck ceph-node1 ceph-node2 ceph-node3 ceph-node4
6.3 Install the Ceph runtime environment on the nodes
#Run on the ceph-deploy node; installs the runtime environment on the OSD nodes
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy install --release pacific ceph-node1 ceph-node2 ceph-node3 ceph-node4
6.4 Use ceph-deploy disk zap to wipe the Ceph data disks on each node
#Run on the ceph-deploy node
ceph-deploy disk zap ceph-node1 /dev/sdb
ceph-deploy disk zap ceph-node1 /dev/sdc
ceph-deploy disk zap ceph-node1 /dev/sdd
ceph-deploy disk zap ceph-node1 /dev/sde
ceph-deploy disk zap ceph-node1 /dev/sdf
ceph-deploy disk zap ceph-node2 /dev/sdb
ceph-deploy disk zap ceph-node2 /dev/sdc
ceph-deploy disk zap ceph-node2 /dev/sdd
ceph-deploy disk zap ceph-node2 /dev/sde
ceph-deploy disk zap ceph-node2 /dev/sdf
ceph-deploy disk zap ceph-node3 /dev/sdb
ceph-deploy disk zap ceph-node3 /dev/sdc
ceph-deploy disk zap ceph-node3 /dev/sdd
ceph-deploy disk zap ceph-node3 /dev/sde
ceph-deploy disk zap ceph-node3 /dev/sdf
ceph-deploy disk zap ceph-node4 /dev/sdb
ceph-deploy disk zap ceph-node4 /dev/sdc
ceph-deploy disk zap ceph-node4 /dev/sdd
ceph-deploy disk zap ceph-node4 /dev/sde
ceph-deploy disk zap ceph-node4 /dev/sdf
Sample output:
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy disk zap ceph-node1 /dev/sdb
[ceph_deploy.conf][DEBUG ] found configuration file at: /home/cephadmin/.cephdeploy.conf
[ceph_deploy.cli][INFO ] Invoked (2.1.0): /usr/local/bin/ceph-deploy disk zap ceph-node1 /dev/sdb
[ceph_deploy.cli][INFO ] ceph-deploy options:
[ceph_deploy.cli][INFO ] verbose : False
[ceph_deploy.cli][INFO ] quiet : False
[ceph_deploy.cli][INFO ] username : None
[ceph_deploy.cli][INFO ] overwrite_conf : False
[ceph_deploy.cli][INFO ] ceph_conf : None
[ceph_deploy.cli][INFO ] cluster : ceph
[ceph_deploy.cli][INFO ] subcommand : zap
[ceph_deploy.cli][INFO ] cd_conf : <ceph_deploy.conf.cephdeploy.Conf object at 0x7f3c5513a8e0>
[ceph_deploy.cli][INFO ] default_release : False
[ceph_deploy.cli][INFO ] func : <function disk at 0x7f3c55102160>
[ceph_deploy.cli][INFO ] host : ceph-node1
[ceph_deploy.cli][INFO ] disk : ['/dev/sdb']
[ceph_deploy.cli][INFO ] debug : False
[ceph_deploy.osd][DEBUG ] zapping /dev/sdb on ceph-node1
[ceph-node1][DEBUG ] connection detected need for sudo
[ceph-node1][DEBUG ] connected to host: ceph-node1
[ceph_deploy.osd][INFO ] Distro info: ubuntu 20.04 focal
[ceph-node1][INFO ] Running command: sudo /usr/sbin/ceph-volume lvm zap /dev/sdb
[ceph-node1][WARNIN] --> Zapping: /dev/sdb
[ceph-node1][WARNIN] --> --destroy was not specified, but zapping a whole device will remove the partition table
[ceph-node1][WARNIN] Running command: /usr/bin/dd if=/dev/zero of=/dev/sdb bs=1M count=10 conv=fsync
[ceph-node1][WARNIN] stderr: 10+0 records in
[ceph-node1][WARNIN] 10+0 records out
[ceph-node1][WARNIN] 10485760 bytes (10 MB, 10 MiB) copied, 0.0414129 s, 253 MB/s
[ceph-node1][WARNIN] --> Zapping successful for: <Raw Device: /dev/sdb>
6.5 OSD data layout
- How the data is split across devices (see the ceph-deploy example after this list):
  - data: the object data Ceph stores
  - block: the RocksDB data, i.e. metadata
  - block-wal: the RocksDB write-ahead log
- One disk (HDD or SSD) holds everything:
  - data: the object data Ceph stores
  - block: the RocksDB data, i.e. metadata
  - block-wal: the RocksDB write-ahead log
- Two disks:
  - SSD:
    - block: the RocksDB data, i.e. metadata
    - block-wal: the RocksDB write-ahead log
  - HDD:
    - data: the object data Ceph stores
- Three disks:
  - NVMe:
    - block: the RocksDB data, i.e. metadata
  - SSD:
    - block-wal: the RocksDB write-ahead log
  - HDD:
    - data: the object data Ceph stores
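When a node does have separate SSD/NVMe devices, ceph-deploy can place the RocksDB metadata and the WAL on them while the object data stays on the HDD. A sketch of the corresponding command (the /dev/sdg and /dev/nvme0n1 device names are placeholders, not disks that exist in this lab; every node here only has the five 100 GB data disks, so the plain --data form in the next section is used instead):
ceph-deploy osd create ceph-node1 --data /dev/sdb --block-db /dev/sdg --block-wal /dev/nvme0n1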
6.6 Add the OSDs
#Run on the ceph-deploy node to create the OSDs
ceph-deploy osd create ceph-node1 --data /dev/sdb
ceph-deploy osd create ceph-node1 --data /dev/sdc
ceph-deploy osd create ceph-node1 --data /dev/sdd
ceph-deploy osd create ceph-node1 --data /dev/sde
ceph-deploy osd create ceph-node1 --data /dev/sdf
ceph-deploy osd create ceph-node2 --data /dev/sdb
ceph-deploy osd create ceph-node2 --data /dev/sdc
ceph-deploy osd create ceph-node2 --data /dev/sdd
ceph-deploy osd create ceph-node2 --data /dev/sde
ceph-deploy osd create ceph-node2 --data /dev/sdf
ceph-deploy osd create ceph-node3 --data /dev/sdb
ceph-deploy osd create ceph-node3 --data /dev/sdc
ceph-deploy osd create ceph-node3 --data /dev/sdd
ceph-deploy osd create ceph-node3 --data /dev/sde
ceph-deploy osd create ceph-node3 --data /dev/sdf
ceph-deploy osd create ceph-node4 --data /dev/sdb
ceph-deploy osd create ceph-node4 --data /dev/sdc
ceph-deploy osd create ceph-node4 --data /dev/sdd
ceph-deploy osd create ceph-node4 --data /dev/sde
ceph-deploy osd create ceph-node4 --data /dev/sdf
Sample output:
cephadmin@ceph-deploy:~/ceph-cluster$ ceph-deploy osd create ceph-node1 --data /dev/sdb
[ceph_deploy.conf][DEBUG ] found configuration file at: /home/cephadmin/.cephdeploy.conf
[ceph_deploy.cli][INFO ] Invoked (2.1.0): /usr/local/bin/ceph-deploy osd create ceph-node1 --data /dev/sdb
[ceph_deploy.cli][INFO ] ceph-deploy options:
[ceph_deploy.cli][INFO ] verbose : False
[ceph_deploy.cli][INFO ] quiet : False
[ceph_deploy.cli][INFO ] username : None
[ceph_deploy.cli][INFO ] overwrite_conf : False
[ceph_deploy.cli][INFO ] ceph_conf : None
[ceph_deploy.cli][INFO ] cluster : ceph
[ceph_deploy.cli][INFO ] subcommand : create
[ceph_deploy.cli][INFO ] cd_conf : <ceph_deploy.conf.cephdeploy.Conf object at 0x7ff7a659b550>
[ceph_deploy.cli][INFO ] default_release : False
[ceph_deploy.cli][INFO ] func : <function osd at 0x7ff7a65420d0>
[ceph_deploy.cli][INFO ] data : /dev/sdb
[ceph_deploy.cli][INFO ] journal : None
[ceph_deploy.cli][INFO ] zap_disk : False
[ceph_deploy.cli][INFO ] fs_type : xfs
[ceph_deploy.cli][INFO ] dmcrypt : False
[ceph_deploy.cli][INFO ] dmcrypt_key_dir : /etc/ceph/dmcrypt-keys
[ceph_deploy.cli][INFO ] filestore : None
[ceph_deploy.cli][INFO ] bluestore : None
[ceph_deploy.cli][INFO ] block_db : None
[ceph_deploy.cli][INFO ] block_wal : None
[ceph_deploy.cli][INFO ] host : ceph-node1
[ceph_deploy.cli][INFO ] debug : False
[ceph_deploy.osd][DEBUG ] Creating OSD on cluster ceph with data device /dev/sdb
[ceph-node1][DEBUG ] connection detected need for sudo
[ceph-node1][DEBUG ] connected to host: ceph-node1
[ceph_deploy.osd][INFO ] Distro info: ubuntu 20.04 focal
[ceph_deploy.osd][DEBUG ] Deploying osd to ceph-node1
[ceph-node1][WARNIN] osd keyring does not exist yet, creating one
[ceph-node1][INFO ] Running command: sudo /usr/sbin/ceph-volume --cluster ceph lvm create --bluestore --data /dev/sdb
[ceph-node1][WARNIN] Running command: /usr/bin/ceph-authtool --gen-print-key
[ceph-node1][WARNIN] Running command: /usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring -i - osd new ec6ffdcb-14ba-4cbd-afd4-35a302099686
[ceph-node1][WARNIN] Running command: vgcreate --force --yes ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444 /dev/sdb
[ceph-node1][WARNIN] stdout: Physical volume "/dev/sdb" successfully created.
[ceph-node1][WARNIN] stdout: Volume group "ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444" successfully created
[ceph-node1][WARNIN] Running command: lvcreate --yes -l 25599 -n osd-block-ec6ffdcb-14ba-4cbd-afd4-35a302099686 ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444
[ceph-node1][WARNIN] stdout: Logical volume "osd-block-ec6ffdcb-14ba-4cbd-afd4-35a302099686" created.
[ceph-node1][WARNIN] Running command: /usr/bin/ceph-authtool --gen-print-key
[ceph-node1][WARNIN] Running command: /usr/bin/mount -t tmpfs tmpfs /var/lib/ceph/osd/ceph-0
[ceph-node1][WARNIN] --> Executable selinuxenabled not in PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin
[ceph-node1][WARNIN] Running command: /usr/bin/chown -h ceph:ceph /dev/ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444/osd-block-ec6ffdcb-14ba-4cbd-afd4-35a302099686
[ceph-node1][WARNIN] Running command: /usr/bin/chown -R ceph:ceph /dev/dm-1
[ceph-node1][WARNIN] Running command: /usr/bin/ln -s /dev/ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444/osd-block-ec6ffdcb-14ba-4cbd-afd4-35a302099686 /var/lib/ceph/osd/ceph-0/block
[ceph-node1][WARNIN] Running command: /usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring mon getmap -o /var/lib/ceph/osd/ceph-0/activate.monmap
[ceph-node1][WARNIN] stderr: 2023-10-26T06:07:46.432+0000 7f988cf9b700 -1 auth: unable to find a keyring on /etc/ceph/ceph.client.bootstrap-osd.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,: (2) No such file or directory
[ceph-node1][WARNIN] stderr: 2023-10-26T06:07:46.432+0000 7f988cf9b700 -1 AuthRegistry(0x7f988805bf58) no keyring found at /etc/ceph/ceph.client.bootstrap-osd.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,, disabling cephx
[ceph-node1][WARNIN] stderr: got monmap epoch 1
[ceph-node1][WARNIN] --> Creating keyring file for osd.0
[ceph-node1][WARNIN] Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/osd/ceph-0/keyring
[ceph-node1][WARNIN] Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/osd/ceph-0/
[ceph-node1][WARNIN] Running command: /usr/bin/ceph-osd --cluster ceph --osd-objectstore bluestore --mkfs -i 0 --monmap /var/lib/ceph/osd/ceph-0/activate.monmap --keyfile - --osd-data /var/lib/ceph/osd/ceph-0/ --osd-uuid ec6ffdcb-14ba-4cbd-afd4-35a302099686 --setuser ceph --setgroup ceph
[ceph-node1][WARNIN] stderr: 2023-10-26T06:07:46.708+0000 7f7a8c04e080 -1 bluestore(/var/lib/ceph/osd/ceph-0/) _read_fsid unparsable uuid
[ceph-node1][WARNIN] --> ceph-volume lvm prepare successful for: /dev/sdb
[ceph-node1][WARNIN] Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/osd/ceph-0
[ceph-node1][WARNIN] Running command: /usr/bin/ceph-bluestore-tool --cluster=ceph prime-osd-dir --dev /dev/ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444/osd-block-ec6ffdcb-14ba-4cbd-afd4-35a302099686 --path /var/lib/ceph/osd/ceph-0 --no-mon-config
[ceph-node1][WARNIN] Running command: /usr/bin/ln -snf /dev/ceph-fad041f8-1fbc-4eca-9ad0-e2f820134444/osd-block-ec6ffdcb-14ba-4cbd-afd4-35a302099686 /var/lib/ceph/osd/ceph-0/block
[ceph-node1][WARNIN] Running command: /usr/bin/chown -h ceph:ceph /var/lib/ceph/osd/ceph-0/block
[ceph-node1][WARNIN] Running command: /usr/bin/chown -R ceph:ceph /dev/dm-1
[ceph-node1][WARNIN] Running command: /usr/bin/chown -R ceph:ceph /var/lib/ceph/osd/ceph-0
[ceph-node1][WARNIN] Running command: /usr/bin/systemctl enable ceph-volume@lvm-0-ec6ffdcb-14ba-4cbd-afd4-35a302099686
[ceph-node1][WARNIN] stderr: Created symlink /etc/systemd/system/multi-user.target.wants/ceph-volume@lvm-0-ec6ffdcb-14ba-4cbd-afd4-35a302099686.service → /lib/systemd/system/ceph-volume@.service.
[ceph-node1][WARNIN] Running command: /usr/bin/systemctl enable --runtime ceph-osd@0
[ceph-node1][WARNIN] stderr: Created symlink /run/systemd/system/ceph-osd.target.wants/ceph-osd@0.service → /lib/systemd/system/ceph-osd@.service.
[ceph-node1][WARNIN] Running command: /usr/bin/systemctl start ceph-osd@0
[ceph-node1][WARNIN] --> ceph-volume lvm activate successful for osd ID: 0
[ceph-node1][WARNIN] --> ceph-volume lvm create successful for: /dev/sdb
[ceph-node1][INFO ] checking OSD status...
[ceph-node1][INFO ] Running command: sudo /bin/ceph --cluster=ceph osd stat --format=json
[ceph_deploy.osd][DEBUG ] Host ceph-node1 is now ready for osd use.
Check the cluster status:
cephadmin@ceph-deploy:~/ceph-cluster$ ceph -s
cluster:
id: 3586e7d1-9315-44e5-85bd-6bd3787ce574
health: HEALTH_OK
services:
mon: 1 daemons, quorum ceph-mon1 (age 2h)
mgr: ceph-mgr1(active, since 2h)
osd: 20 osds: 20 up (since 6s), 20 in (since 15s)
data:
pools: 1 pools, 1 pgs
objects: 0 objects, 0 B
usage: 5.7 GiB used, 1.9 TiB / 2.0 TiB avail
pgs: 1 active+clean
Enable the OSD services to start on boot:
#node1
root@ceph-node1:~# ps -ef |grep osd
ceph 49905 1 0 06:07 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 0 --setuser ceph --setgroup ceph
ceph 51838 1 0 06:08 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 1 --setuser ceph --setgroup ceph
ceph 53796 1 0 06:08 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 2 --setuser ceph --setgroup ceph
ceph 55767 1 0 06:08 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 3 --setuser ceph --setgroup ceph
ceph 57728 1 0 06:08 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 4 --setuser ceph --setgroup ceph
root 59156 3757 0 06:14 pts/1 00:00:00 grep --color=auto osd
root@ceph-node1:~# systemctl enable ceph-osd@0 ceph-osd@1 ceph-osd@2 ceph-osd@3 ceph-osd@4
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@0.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@1.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@2.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@3.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@4.service → /lib/systemd/system/ceph-osd@.service.
#node2
root@ceph-node2:~# ps -ef |grep osd
ceph 94225 1 0 06:08 ? 00:00:03 /usr/bin/ceph-osd -f --cluster ceph --id 5 --setuser ceph --setgroup ceph
ceph 96278 1 0 06:09 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 6 --setuser ceph --setgroup ceph
ceph 98240 1 0 06:09 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 7 --setuser ceph --setgroup ceph
ceph 100199 1 0 06:09 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 8 --setuser ceph --setgroup ceph
ceph 102173 1 0 06:09 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 9 --setuser ceph --setgroup ceph
root 103526 3293 0 06:14 pts/1 00:00:00 grep --color=auto osd
root@ceph-node2:~# systemctl enable ceph-osd@5 ceph-osd@6 ceph-osd@7 ceph-osd@8 ceph-osd@9
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@5.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@6.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@7.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@8.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@9.service → /lib/systemd/system/ceph-osd@.service.
#node3
root@ceph-node3:/etc/apt# ps -ef |grep osd
ceph 51897 1 0 06:09 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 10 --setuser ceph --setgroup ceph
ceph 53921 1 0 06:09 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 11 --setuser ceph --setgroup ceph
ceph 55905 1 0 06:10 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 12 --setuser ceph --setgroup ceph
ceph 57830 1 0 06:10 ? 00:00:02 /usr/bin/ceph-osd -f --cluster ceph --id 13 --setuser ceph --setgroup ceph
ceph 59778 1 0 06:10 ? 00:00:01 /usr/bin/ceph-osd -f --cluster ceph --id 14 --setuser ceph --setgroup ceph
root 60967 2514 0 06:14 pts/1 00:00:00 grep --color=auto osd
root@ceph-node3:/etc/apt# systemctl enable ceph-osd@10 ceph-osd@11 ceph-osd@12 ceph-osd@13 ceph-osd@14
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@10.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@11.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@12.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@13.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@14.service → /lib/systemd/system/ceph-osd@.service.
#node4
root@ceph-node4:~# ps -ef|grep osd
ceph 94054 1 1 06:10 ? 00:00:01 /usr/bin/ceph-osd -f --cluster ceph --id 15 --setuser ceph --setgroup ceph
ceph 96034 1 1 06:11 ? 00:00:00 /usr/bin/ceph-osd -f --cluster ceph --id 16 --setuser ceph --setgroup ceph
ceph 97996 1 1 06:11 ? 00:00:00 /usr/bin/ceph-osd -f --cluster ceph --id 17 --setuser ceph --setgroup ceph
ceph 99905 1 1 06:11 ? 00:00:00 /usr/bin/ceph-osd -f --cluster ceph --id 18 --setuser ceph --setgroup ceph
ceph 101915 1 1 06:11 ? 00:00:00 /usr/bin/ceph-osd -f --cluster ceph --id 19 --setuser ceph --setgroup ceph
root 102548 2139 0 06:12 pts/1 00:00:00 grep --color=auto osd
root@ceph-node4:/etc/systemd/system/ceph.target.wants# systemctl enable ceph-osd@15 ceph-osd@16 ceph-osd@17 ceph-osd@18 ceph-osd@19
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@15.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@16.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@17.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@18.service → /lib/systemd/system/ceph-osd@.service.
Created symlink /etc/systemd/system/ceph-osd.target.wants/ceph-osd@19.service → /lib/systemd/system/ceph-osd@.service.
6.7 Remove an OSD from RADOS
#Mark the OSD out
ceph osd out {osd-num}
root@ceph-deploy:/home/cephadmin/ceph-cluster# ceph osd out osd.2
#Stop the OSD daemon
systemctl stop ceph-osd@{osd-num}
[root@ceph-node1 ~]# systemctl stop ceph-osd@2.service
#Purge the OSD
ceph osd purge {id} --yes-i-really-mean-it
root@ceph-deploy:/home/cephadmin/ceph-cluster# ceph osd purge osd.2 --yes-i-really-mean-it
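To confirm the removal, check the OSD map afterwards; the purged id should no longer be listed:
#Verify on the deploy node
ceph osd tree
ceph -s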
6.8 Test uploading and downloading data
#Create a pool
cephadmin@ceph-deploy:~/ceph-cluster$ ceph osd pool create mypool 32 32
pool 'mypool' created
#List the storage pools
cephadmin@ceph-deploy:~/ceph-cluster$ ceph osd pool ls
device_health_metrics
mypool
cephadmin@ceph-deploy:~/ceph-cluster$ rados lspools
device_health_metrics
mypool
#List the PGs of the pool
cephadmin@ceph-deploy:~/ceph-cluster$ ceph pg ls-by-pool mypool|awk '{print $1,$2,$15}'
PG OBJECTS ACTING
2.0 0 [8,10,3]p8 #this PG's data is placed on OSDs 8, 10 and 3; OSD 8 is the primary that handles reads and writes
2.1 0 [15,0,13]p15
2.2 0 [5,1,15]p5
2.3 0 [17,5,14]p17
2.4 0 [1,12,18]p1
2.5 0 [12,4,8]p12
2.6 0 [1,13,19]p1
2.7 0 [6,17,2]p6
2.8 0 [16,13,0]p16
2.9 0 [4,9,19]p4
2.a 0 [11,4,18]p11
2.b 0 [13,7,17]p13
2.c 0 [12,0,5]p12
2.d 0 [12,19,3]p12
2.e 0 [2,13,19]p2
2.f 0 [11,17,8]p11
2.10 0 [15,13,0]p15
2.11 0 [16,6,1]p16
2.12 0 [10,3,9]p10
2.13 0 [17,6,3]p17
2.14 0 [8,13,17]p8
2.15 0 [19,1,11]p19
2.16 0 [8,12,17]p8
2.17 0 [6,14,2]p6
2.18 0 [18,9,12]p18
2.19 0 [3,6,13]p3
2.1a 0 [6,14,2]p6
2.1b 0 [11,7,17]p11
2.1c 0 [10,7,1]p10
2.1d 0 [15,10,7]p15
2.1e 0 [3,13,15]p3
2.1f 0 [4,7,14]p4
* NOTE: afterwards
#Upload a file
cephadmin@ceph-deploy:~/ceph-cluster$ sudo rados put msg1 /var/log/syslog --pool=mypool #upload the syslog file to mypool with object id msg1
#List the objects
cephadmin@ceph-deploy:~/ceph-cluster$ rados ls --pool=mypool
msg1
#Show the object-to-PG/OSD mapping
cephadmin@ceph-deploy:~/ceph-cluster$ ceph osd map mypool msg1
osdmap e113 pool 'mypool' (2) object 'msg1' -> pg 2.c833d430 (2.10) -> up ([15,13,0], p15) acting ([15,13,0], p15)
#Download the object
cephadmin@ceph-deploy:~/ceph-cluster$ sudo rados get msg1 --pool=mypool /opt/my.txt
cephadmin@ceph-deploy:~/ceph-cluster$ ll /opt/my.txt
-rw-r--r-- 1 root root 1606105 Oct 23 10:45 /opt/my.txt
#Overwrite the object and verify
cephadmin@ceph-deploy:~/ceph-cluster$ sudo rados put msg1 /etc/passwd --pool=mypool
cephadmin@ceph-deploy:~/ceph-cluster$ sudo rados get msg1 --pool=mypool /opt/1.txt
cephadmin@ceph-deploy:~/ceph-cluster$ tail /opt/1.txt
tcpdump:x:108:113::/nonexistent:/usr/sbin/nologin
landscape:x:109:115::/var/lib/landscape:/usr/sbin/nologin
pollinate:x:110:1::/var/cache/pollinate:/bin/false
usbmux:x:111:46:usbmux daemon,,,:/var/lib/usbmux:/usr/sbin/nologin
sshd:x:112:65534::/run/sshd:/usr/sbin/nologin
systemd-coredump:x:999:999:systemd Core Dumper:/:/usr/sbin/nologin
king:x:1000:1000:king:/home/king:/bin/bash
lxd:x:998:100::/var/snap/lxd/common/lxd:/bin/false
cephadmin:x:2022:2022::/home/cephadmin:/bin/bash
ceph:x:64045:64045:Ceph storage service:/var/lib/ceph:/usr/sbin/nologin
#Delete the object
cephadmin@ceph-deploy:~/ceph-cluster$ sudo rados rm msg1 --pool=mypool
cephadmin@ceph-deploy:~/ceph-cluster$ rados ls --pool=mypool