在 Linux 中,当程序发生段错误(Segmentation Fault)等异常时,内核会将现场信息保存到 core dump 文件中,以帮助开发人员调试和定位程序问题。
Enable core dump
Node
template/delete_core_dump_file.sh
#!/usr/bin/env bash
##
# Purge stale core dump files from the configured directories.
#
# A file is deleted when its name starts with 'core' AND
# (it was last modified more than 30 days ago OR it is larger than 20G).
#
# This file is rendered by a template engine before deployment (the
# dir_core_dump placeholder below is substituted), so keep template
# delimiters (doubled braces, brace-percent, brace-hash) out of the code.
##
#set -x
set -euo pipefail

core_dump_dirs=("{{dir_core_dump}}" "/tmp/core")

#######################################
# List and delete the core files under one directory in a single pass.
# Arguments: $1   - directory to scan (recursively)
#            $2.. - extra find(1) tests selecting the victims,
#                   e.g. -mtime +30
# Outputs:   the path of every selected file, one per line, on stdout
# Returns:   find's exit status (non-zero when a removal failed)
#######################################
purge_core_dumps() {
  local dir=$1
  shift
  # -exec ... {} + hands the paths over as separate arguments, so names that
  # contain whitespace (the executable name is part of the core file name)
  # are handled correctly. Removal is tried as the current user first and
  # only then through non-interactive sudo, which never blocks on a prompt.
  find "$dir" -type f -name 'core*' "$@" -print \
    -exec sh -c 'rm -f -- "$@" 2>/dev/null || sudo -n rm -f -- "$@"' sh {} +
}

main() {
  local work_dir
  for work_dir in "${core_dump_dirs[@]}"; do
    # Silently skip directories that do not exist on this host.
    [[ -d "$work_dir" ]] || continue
    echo "Work directory: $work_dir"
    echo "delete core dump file 30 days ago"
    purge_core_dumps "$work_dir" -mtime +30
    echo "delete core dump file greater than 20G"
    purge_core_dumps "$work_dir" -size +20G
  done
}

main "$@"
enable_core_dump_node.yaml
# Playbook: enable core dumps on the hosts matched by "debug".
# It lifts the core size limits, points kernel.core_pattern at dir_core_dump,
# and installs a cron job that prunes old or oversized core files.
- name: Enable core dump in node
  hosts: debug
  vars:
    # Directory that receives the core files; override with
    # -e "dir_core_dump=/some/path".
    dir_core_dump: '/tmp/core'
  tasks:
    # The regexps are anchored so that only an active "* soft|hard core ..."
    # entry is rewritten; commented or unrelated lines are left alone and the
    # line is appended when no such entry exists.
    - name: Modify soft core in /etc/security/limits.conf
      lineinfile:
        path: '/etc/security/limits.conf'
        regexp: '^\s*\*\s+soft\s+core\s'
        line: '* soft core unlimited'
      become: true

    - name: Modify hard core in /etc/security/limits.conf
      lineinfile:
        path: '/etc/security/limits.conf'
        regexp: '^\s*\*\s+hard\s+core\s'
        line: '* hard core unlimited'
      become: true

    # NOTE(review): this task, the template task and the cron task carry no
    # "become", so they run as the connecting user unless privilege
    # escalation is enabled elsewhere. Confirm this is intended.
    - name: Create directory for core dump file
      file:
        path: '{{item}}'
        state: directory
      with_items:
        - '{{dir_core_dump}}'

    # Marker comment written above the setting (added only once).
    - name: Append kernel.core_pattern comment to /etc/sysctl.conf
      lineinfile:
        path: '/etc/sysctl.conf'
        insertafter: EOF
        line: '# coredump文件格式'
      become: true

    # The regexp makes this task replace an existing kernel.core_pattern
    # entry, so re-running with another dir_core_dump does not leave a stale,
    # conflicting line behind.
    - name: Append kernel.core_pattern to /etc/sysctl.conf
      lineinfile:
        path: '/etc/sysctl.conf'
        regexp: '^\s*kernel\.core_pattern\s*='
        insertafter: EOF
        line: 'kernel.core_pattern = {{dir_core_dump}}/core.%h.%e.%p.%t'
      become: true

    # No shell features are needed, so "command" is used instead of "shell".
    - name: Make sysctl.conf effect
      command: sysctl -p
      become: true

    # The script is rendered as a template so that it picks up dir_core_dump.
    - name: Transport script to {{dir_core_dump}}
      template:
        src: template/{{item}}
        dest: "{{dir_core_dump}}/"
        mode: "0755"
      with_items:
        - delete_core_dump_file.sh

    # Runs the cleanup script every 13 minutes.
    - name: Add a cronjob to delete coredump file
      cron:
        name: Delete Core Dump Files
        minute: '*/13'
        job: '{{dir_core_dump}}/delete_core_dump_file.sh'
Docker
enable_core_dump_docker.yaml
# Playbook: let containers produce core dumps by switching the default
# ulimit Hard/Soft values in /etc/docker/daemon.json from 0 to -1, then
# restarting docker. "serial: 1" processes one host at a time.
- name: Enable core dump in node
  hosts: debug
  serial: 1
  tasks:
    # NOTE(review): the patterns are not scoped to the "core" ulimit; every
    # '"Hard": 0' / '"Soft": 0' occurrence in the file is rewritten.
    - name: Modify default-ulimits.core.Hard/Soft in /etc/docker/daemon.json
      replace:
        path: /etc/docker/daemon.json
        regexp: '{{item.key}}'
        replace: '{{item.value}}'
      become: true
      with_items:
        - {key: '"Hard": 0', value: '"Hard": -1'}
        - {key: '"Soft": 0', value: '"Soft": -1'}

    - name: reload docker service
      systemd:
        state: reloaded
        name: docker
      become: true

    - name: restart docker service
      systemd:
        state: restarted
        name: docker
      become: true

    # "systemctl is-active" prints the unit state as a single word. Comparing
    # it with "active" avoids the false positive of grepping "active" in the
    # status output, which also matches "inactive".
    - name: Check whether docker has been restarted
      command: systemctl is-active docker
      register: docker_state
      changed_when: false
      failed_when: docker_state.stdout != "active"
Install
# create scripts
# Create the template directory and write the three files shown above.
mkdir -pv template
vim template/delete_core_dump_file.sh
vim enable_core_dump_node.yaml
vim enable_core_dump_docker.yaml
# config
# Show the hosts matched by "debug" (--list is presumably shorthand for
# --list-hosts), then apply both playbooks. -e overrides the dir_core_dump
# variable defined in enable_core_dump_node.yaml.
ansible debug --list
ansible-playbook enable_core_dump_node.yaml -e "dir_core_dump=/data01/cores"
ansible-playbook enable_core_dump_docker.yaml
Demo of Segfault
Host
demo_segfault.c
#include <stdio.h>
/*
 * Demo program that is meant to crash: it writes through a pointer to a
 * string literal so that a segmentation fault (and a core dump) is produced.
 */
int main() {
char *str = "Hello, world!";
/* Intentional defect: modifying a string literal is undefined behavior and
 * is expected to raise a segmentation fault here. Do not "fix" this line. */
str[0] = 'h';
/* Only reached if the write above did not crash the process. */
printf("%s\n", str);
return 0;
}
# Write, compile and run the demo; the run is expected to crash.
vim demo_segfault.c
gcc demo_segfault.c -o demo_segfault
./demo_segfault
# Look for the kernel log entry of the crash (-T prints readable timestamps).
dmesg -T | grep segfault | grep demo
Container
Dockerfile
# Demo image: python:3.9-buster base that runs the demo_segfault binary.
FROM python:3.9-buster

LABEL org='Personal' \
      multi.author="Nick"

# Switch both the main and the security apt sources to the opentuna.cn mirror.
RUN sed -i \
      -e 's/deb.debian.org/opentuna.cn/g' \
      -e 's/security.debian.org/opentuna.cn/g' \
      /etc/apt/sources.list

# Install the ssh and download tools in one transaction, then drop the cache.
RUN apt-get update \
 && apt-get install -y \
      openssh-server openssh-client \
      wget curl tar \
 && apt-get clean

# Ship the prebuilt demo binary and make sure it is executable.
RUN mkdir -p /opt/demo
COPY ./demo_segfault /opt/demo
RUN chmod +x /opt/demo/demo_segfault

CMD ["/opt/demo/demo_segfault"]
- build
# Fetch the base image, write the Dockerfile shown above, then build the
# demo image from the current directory (which must contain demo_segfault).
docker pull python:3.9-buster
vim Dockerfile
docker build -t demo_segfault:v0.1 .
- run
启动容器时,需要把宿主机上配置的 core dump 目录挂载到容器内的相同路径(该目录可通过 `sysctl kernel.core_pattern` 查看):
# Derive the host's core dump directory from kernel.core_pattern and
# bind-mount it at the same path inside the container.
# NOTE(review): this assumes core_pattern is a plain file path; a pattern that
# pipes to a handler program (leading '|') has no directory to mount.
core_dump_dir=$(dirname -- "$(cat /proc/sys/kernel/core_pattern)")
docker run -v "$core_dump_dir:$core_dump_dir" --name demo_segfault demo_segfault:v0.1
docker ps -a | grep demo_segfault
# docker run --rm -it -v "$core_dump_dir:$core_dump_dir" demo_segfault:v0.1 bash
# The crash should have left a core.* file in the mounted directory.
ls -alh -- "$core_dump_dir"