本篇介绍
bpf在网络中可以用来对网络包进行捕获和过滤,接下来看下是如何做到的。
Packet Filtering
Filtering 主要可以用于如下三个场景:
- 实时数据包丢弃
- 观察实时数据包
- 数据包分析,主要是pcap格式
tcpdump
tcpdump是流量分析和观察中常用的一个工具,实际上tcpdump也是从网络接口上读取数据并且将对应包的内容提供给我们,同时也可以使用pcap过滤语法进行过滤。
比如我们想看端口443(https)的数据,命令如下:
# tcpdump -n 'ip and tcp port 443'
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on wlp4s0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
13:06:23.124480 IP 192.168.31.63.35806 > 142.251.215.234.443: Flags [S], seq 2161845746, win 64240, options [mss 1460,sackOK,TS val 4275232445 ecr 0,nop,wscale 7], length 0
13:06:23.380465 IP 192.168.31.63.51158 > 23.216.153.92.443: Flags [.], ack 1565733590, win 456, options [nop,nop,TS val 4010187404 ecr 1943807953], length 0
13:06:23.380507 IP 192.168.31.63.52146 > 114.250.70.41.443: Flags [.], ack 3097876363, win 1257, options [nop,nop,TS val 3161263685 ecr 2071104007], length 0
13:06:23.385979 IP 114.250.70.41.443 > 192.168.31.63.52146: Flags [.], ack 1, win 285, options [nop,nop,TS val 2071152322 ecr 3161215372], length 0
13:06:23.448311 IP 23.216.153.92.443 > 192.168.31.63.51158: Flags [.], ack 1, win 501, options [nop,nop,TS val 1943856427 ecr 4010139037], length 0
13:06:24.014502 IP 114.250.65.34.443 > 192.168.31.63.34102: Flags [P.], seq 1667579479:1667579552, ack 2774828657, win 301, options [nop,nop,TS val 1564142983 ecr 3366717057], length 73
13:06:24.015094 IP 192.168.31.63.34102 > 114.250.65.34.443: Flags [F.], seq 1, ack 73, win 488, options [nop,nop,TS val 3366957171 ecr 1564142983], length 0
13:06:24.022775 IP 114.250.65.34.443 > 192.168.31.63.34102: Flags [F.], seq 73, ack 2, win 301, options [nop,nop,TS val 1564143099 ecr 3366957171], length 0
13:06:24.022857 IP 192.168.31.63.34102 > 114.250.65.34.443: Flags [.], ack 74, win 488, options [nop,nop,TS val 3366957179 ecr 1564143099], length 0
13:06:24.148403 IP 192.168.31.63.35806 > 142.251.215.234.443: Flags [S], seq 2161845746, win 64240, options [mss 1460,sackOK,TS val 4275233469 ecr 0,nop,wscale 7], length 0
13:06:26.196385 IP 192.168.31.63.35806 > 142.251.215.234.443: Flags [S], seq 2161845746, win 64240, options [mss 1460,sackOK,TS val 4275235517 ecr 0,nop,wscale 7], length 0
13:06:26.573162 IP 110.242.68.4.443 > 192.168.31.63.59242: Flags [.], ack 1765924645, win 1108, length 0
13:06:26.573216 IP 192.168.31.63.59242 > 110.242.68.4.443: Flags [.], ack 1, win 501, length 0
13:06:26.964926 IP 192.168.31.63.44972 > 142.251.215.234.443: Flags [S], seq 261160313, win 64240, options [mss 1460,sackOK,TS val 4275236285 ecr 0,nop,wscale 7], length 0
13:06:27.188234 IP 110.242.69.113.443 > 192.168.31.63.56366: Flags [P.], seq 1359899758:1359899789, ack 4057787616, win 1208, length 31
13:06:27.188236 IP 110.242.69.113.443 > 192.168.31.63.56366: Flags [F.], seq 31, ack 1, win 1208, length 0
13:06:27.188237 IP 110.242.69.113.443 > 192.168.31.63.56366: Flags [F.], seq 31, ack 1, win 1208, length 0
13:06:27.188343 IP 192.168.31.63.56366 > 110.242.69.113.443: Flags [.], ack 32, win 428, options [nop,nop,sack 1 {31:32}], length 0
上述指令介绍如下:
-n: 不需要转换地址,这样可以方便看dst和src的地址
ip and tcp port 443: ip表示ipv4,tcp port 443表示tcp包,并且是来自或者去往端口443的数据包
这儿我们用到了pcap filters,实际上pcap filter会被编译成bpf指令,并加载到系统中用来过滤包。也就是我们在使用tcpdump的时候实际上就是在加载并使用bpf程序,可以添加-d看下:
tcpdump -n -d 'ip and tcp port 443'
Warning: assuming Ethernet
(000) ldh [12] // 在偏移12的地方读取半字,也就是2个字节
(001) jeq #0x800 jt 2 jf 12 // 如果等于0x800(即IPv4的Ethertype),就跳到2,否则到12,也就是直接退出
(002) ldb [23] // 在偏移23的地方读取一个字节,对应的是网络协议字段
(003) jeq #0x6 jt 4 jf 12 // 如果是6,就跳到4,否则就到12, 6表示的是tcp
(004) ldh [20] // 从偏移20的地方读取2字节,对应的是IP头中的flags和fragment offset字段
(005) jset #0x1fff jt 12 jf 6 // 查看低13位(即fragment offset),如果有任何一位为1,说明不是第一个分片,没有tcp头,则跳到12,否则就到6
(006) ldxb 4*([14]&0xf) // 从偏移14的地方读取一个字节,取低4位再乘以4后存入x寄存器,也就是IP header length(单位为字节)
(007) ldh [x + 14] // 从x +14的地方读取2个字节,也就是源端口号
(008) jeq #0x1bb jt 11 jf 9 // 如果是443,就跳到11,否则跳到9
(009) ldh [x + 16] // 从x + 16 的地方读取2个字节,也就是目的端口号
(010) jeq #0x1bb jt 11 jf 12 // 如果是443,就跳到11,否则跳到12
(011) ret #262144 // 满足查询条件,返回对应的长度,这儿是个默认值
(012) ret #0 // 不满足查询条件
如果需要看懂上述的指令,需要先了解下网络包格式:
Destination Mac: 6 bytes
Source Mac: 6 bytes
Ethertype: 2 bytes
Data(payload):46-1500 bytes
Frame check sequence(CRC): 4 bytes
基于裸socket的包过滤
接下来用代码来看看如何将bpf程序attach到一个裸socket上,此时该socket收到的所有数据都会移交给我们的bpf程序,这时候我们的程序就可以决定是否需要丢弃。接下来的例子只是统计下该socket上不同协议的报文数量,代码如下:
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/string.h>
#include <linux/tcp.h>
#include <linux/types.h>
#include <linux/udp.h>
/*
 * Fallback offsetof(), used only when no included header has defined one.
 * It yields the byte offset of MEMBER inside TYPE by taking the address of
 * the member through a null TYPE pointer.
 */
#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER)
#endif
/*
 * Places the annotated symbol in the ELF section NAME and marks it "used"
 * so the compiler keeps it even when nothing in this file references it.
 */
#define SEC(NAME) __attribute__((section(NAME), used))
/*
 * Description of a BPF map as emitted into the "maps" section (see countmap,
 * which is the only instance in this file). The field order is presumably
 * what the loader expects, so it must not be changed -- TODO confirm against
 * the loader in use.
 */
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
unsigned int map_flags;
};
/*
 * BPF helper stubs. Each function pointer is initialised with a BPF_FUNC_*
 * helper id cast to a pointer; by the usual BPF C convention a call through
 * such a pointer is emitted as a call to that in-kernel helper.
 */
static int (*bpf_map_update_elem)(struct bpf_map_def *map, void *key,
void *value, __u64 flags) = (void *)
BPF_FUNC_map_update_elem;
static void *(*bpf_map_lookup_elem)(struct bpf_map_def *map, void *key) =
(void *)BPF_FUNC_map_lookup_elem;
/*
 * Declaration bound, through the asm label, to the LLVM intrinsic
 * "llvm.bpf.load.byte". It is used below to read a single byte of the packet
 * behind skb at byte offset off.
 */
unsigned long long load_byte(void *skb,
unsigned long long off) asm("llvm.bpf.load.byte");
// Note: this style of map definition is no longer supported by libbpf v1.0+.
// Array map with 256 slots, int keys and int values; socket_prog indexes it
// with the one-byte IP protocol number, so every possible key has a slot.
struct bpf_map_def SEC("maps") countmap = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 256,
};
/*
 * Socket filter that counts packets per IP protocol number.
 *
 * Reads the byte at ETH_HLEN + offsetof(struct iphdr, protocol) and bumps the
 * matching slot of countmap. The EtherType is not inspected, so the frame is
 * assumed to carry IPv4.
 *
 * Always returns 0.
 */
SEC("socket")
int socket_prog(struct __sk_buff *skb) {
  int proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  int *count = bpf_map_lookup_elem(&countmap, &proto);

  /*
   * countmap is an array map with 256 entries and proto is a single byte, so
   * the lookup yields a pointer straight into the map's storage. Incrementing
   * through it is enough; no bpf_map_update_elem() write-back is needed.
   * The add is atomic because the program can run on several CPUs at once
   * and a plain (*count)++ would lose updates.
   */
  if (count) {
    __sync_fetch_and_add(count, 1);
  }
  return 0;
}
/* License string emitted into the "license" section of the object file. */
char _license[] SEC("license") = "GPL";
该代码需要在内核代码树中编译,并且编译target是bpf。
接下来是加载的代码:
#include <arpa/inet.h>
#include <assert.h>
#include <bpf/bpf.h>
#include <bpf/bpf_load.h>
#include <bpf/sock_example.h>
#include <errno.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
/* Log buffer; its contents are printed when load_bpf_file() reports failure. */
char bpf_log_buf[BPF_LOG_BUF_SIZE];

/*
 * Load the BPF object file named by argv[1], attach its first program to a
 * raw socket opened on "lo", then print the TCP/UDP/ICMP counters from the
 * first map once per second, ten times.
 *
 * Returns 0 on success and 1 on any failure (bad usage, load error, socket
 * error, attach error or map lookup error).
 */
int main(int argc, char **argv) {
  static const int protos[3] = {IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP};
  int counts[3] = {0, 0, 0};
  char filename[256];
  int sock = -1;
  int rc = 1;
  int i, j, len;

  /* argv[1] is mandatory; without this check "%s" would be fed NULL. */
  if (argc < 2) {
    fprintf(stderr, "usage: %s <bpf-object-file>\n",
            argc > 0 ? argv[0] : "loader");
    return 1;
  }
  len = snprintf(filename, sizeof(filename), "%s", argv[1]);
  if (len < 0 || (size_t)len >= sizeof(filename)) {
    fprintf(stderr, "object file path is too long\n");
    return 1;
  }

  if (load_bpf_file(filename)) {
    printf("%s", bpf_log_buf);
    return 1;
  }

  sock = open_raw_sock("lo");
  if (sock < 0) {
    fprintf(stderr, "open_raw_sock failed\n");
    return 1;
  }

  if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
                 sizeof(prog_fd[0]))) {
    printf("setsockopt %s\n", strerror(errno));
    goto out;
  }

  for (i = 0; i < 10; i++) {
    /*
     * The lookups are real work, not sanity checks, so they must not live
     * inside assert(): building with NDEBUG would remove them entirely.
     */
    for (j = 0; j < 3; j++) {
      int key = protos[j];

      if (bpf_map_lookup_elem(map_fd[0], &key, &counts[j]) != 0) {
        fprintf(stderr, "bpf_map_lookup_elem(key=%d) failed: %s\n", key,
                strerror(errno));
        goto out;
      }
    }
    printf("TCP %d UDP %d ICMP %d packets\n", counts[0], counts[1],
           counts[2]);
    sleep(1);
  }
  rc = 0;

out:
  close(sock);
  return rc;
}
按照如上操作后,如果执行ping 127.0.0.1, 就会得到如下结果:
TCP 0 UDP 0 ICMP 0 packets
TCP 0 UDP 0 ICMP 4 packets
TCP 0 UDP 0 ICMP 8 packets
TCP 0 UDP 0 ICMP 12 packets
TCP 0 UDP 0 ICMP 16 packets
TCP 0 UDP 0 ICMP 20 packets
TCP 0 UDP 0 ICMP 24 packets
TCP 0 UDP 0 ICMP 28 packets
TCP 0 UDP 0 ICMP 32 packets
TCP 0 UDP 0 ICMP 36 packets
基于BPF 的TC
TC(traffic control) 是内核包调度子系统,可以决定包如何流动和如何被接收。
为了了解tc,可以先了解几个术语。
Queueing disciplines:qdisc, 用来决定数据包在网络接口上的发送顺序,默认的是pfifo_fast, 拥有3个优先级队列,按照优先级先进先出。用如下命令就可以看到当前设备上的qdisc:
ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host noprefixroute
valid_lft forever preferred_lft forever
2: enp0s31f6: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc fq_codel state DOWN group default qlen 1000
link/ether c8:5b:76:3f:0e:74 brd ff:ff:ff:ff:ff:ff
3: wlp4s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether e4:a4:71:b3:62:17 brd ff:ff:ff:ff:ff:ff
inet 192.168.0.112/24 brd 192.168.0.255 scope global dynamic noprefixroute wlp4s0
valid_lft 5077sec preferred_lft 5077sec
inet6 fe80::a7fa:5c7f:99ae:bd31/64 scope link noprefixroute
valid_lft forever preferred_lft forever
4: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default
link/ether 02:42:c0:45:54:ec brd ff:ff:ff:ff:ff:ff
inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0
valid_lft forever preferred_lft forever
可以看到如上用到的主要是noqueue和fq_codel,前者就是收到包后就立即发,没有其他规则,后者是fair queue controlled delay,以一个随机模型分类数据包,实现公平发送。
Classful qdiscs: 允许为不同的包定义不同的类别,这样就可以使用不同的规则。
Filters:用来给数据包指定一个特定的类别。
Classless qdiscs:没有任何关联的类别,这意味着没法指定filters。
接下来就看一个例子, 如果发现是http包,就打印下log:
#pragma clang diagnostic ignored "-Wcompare-distinct-pointer-types"
#include <bits/types.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
/*
 * Places the annotated symbol in the ELF section NAME and marks it "used"
 * so the compiler keeps it even when nothing in this file references it.
 */
#define SEC(NAME) __attribute__((section(NAME), used))
/*
 * Host-to-network conversion for 16-bit values, chosen at compile time from
 * the target byte order: a byte swap on little-endian targets, the identity
 * on big-endian targets. Any other byte order is a hard compile error.
 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define __bpf_htons(x) __builtin_bswap16(x)
#define __bpf_constant_htons(x) ___constant_swab16(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define __bpf_htons(x) (x)
#define __bpf_constant_htons(x) (x)
#else
#error "Fix your compiler's __BYTE_ORDER__?!"
#endif
/*
 * bpf_htons(x): uses the constant-expression variant when x is a
 * compile-time constant and the builtin variant otherwise.
 */
#define bpf_htons(x) \
(__builtin_constant_p(x) ? __bpf_constant_htons(x) : __bpf_htons(x))
/*
 * BPF helper stub: a function pointer initialised with the
 * BPF_FUNC_trace_printk helper id cast to a pointer. By the usual BPF C
 * convention a call through it is emitted as a call to that helper.
 */
static int (*bpf_trace_printk)(const char *fmt, int fmt_size,
...) = (void *)BPF_FUNC_trace_printk;
/*
 * Convenience wrapper: copies the format string literal into a local char
 * array and passes that array, together with its size, to bpf_trace_printk.
 */
#define trace_printk(fmt, ...) \
do { \
char _fmt[] = fmt; \
bpf_trace_printk(_fmt, sizeof(_fmt), ##__VA_ARGS__); \
} while (0)
/*
 * Declaration bound, through the asm label, to the LLVM intrinsic
 * "llvm.bpf.load.byte". is_http uses it to read single payload bytes of the
 * packet behind skb at byte offset off.
 */
unsigned long long load_byte(void *skb,
unsigned long long off) asm("llvm.bpf.load.byte");
/* NOTE(review): not referenced anywhere else in the visible code. */
struct http_payload {
int method;
};
/* Forward declaration: classification() calls is_http() before its definition. */
static inline int is_http(struct __sk_buff *skb, __u64 nh_off);
/*
 * Fixed-width integer names mapped onto the double-underscore types,
 * presumably those provided by <bits/types.h> -- TODO confirm.
 */
typedef __uint8_t uint8_t;
typedef __uint16_t uint16_t;
typedef __uint32_t uint32_t;
typedef __uint64_t uint64_t;
/*
 * TC classifier entry point.
 *
 * Logs one trace line for every IPv4 frame that is_http() recognises. The
 * verdict is TC_ACT_OK on every path, so the program only observes traffic.
 */
SEC("classifier")
static inline int classification(struct __sk_buff *skb) {
  void *pkt_end = (void *)(long)skb->data_end;
  void *pkt_start = (void *)(long)skb->data;
  __u64 l3_off = sizeof(struct ethhdr);

  /* The Ethernet header must lie inside the packet before it is read. */
  if (pkt_start + l3_off > pkt_end) {
    return TC_ACT_OK;
  }

  struct ethhdr *frame = pkt_start;

  /* Only IPv4 frames are inspected further. */
  if (frame->h_proto != bpf_htons(ETH_P_IP)) {
    return TC_ACT_OK;
  }

  if (is_http(skb, l3_off) == 1) {
    trace_printk("Yes! It is HTTP!\n");
  }
  return TC_ACT_OK;
}
/*
 * Decide whether the IPv4 packet at nh_off carries a TCP payload that begins
 * with the four bytes "HTTP".
 *
 * skb:    packet being classified.
 * nh_off: byte offset of the IPv4 header from the start of the packet data.
 *
 * Returns 1 on a match and 0 otherwise, including for packets that are not
 * TCP, are too short, or have inconsistent header lengths.
 */
static inline int is_http(struct __sk_buff *skb, __u64 nh_off) {
  /* Payloads shorter than this are never inspected (threshold kept as-is). */
  const __u32 min_payload = 7;
  void *data_end = (void *)(long)skb->data_end;
  void *data = (void *)(long)skb->data;
  struct iphdr *iph = data + nh_off;

  /* The fixed part of the IPv4 header must lie inside the packet. */
  if ((void *)(iph + 1) > data_end) {
    return 0;
  }
  if (iph->protocol != IPPROTO_TCP) {
    return 0;
  }

  /* ihl counts 32-bit words; anything below the fixed header is invalid. */
  __u32 ip_hlen = iph->ihl << 2;
  if (ip_hlen < sizeof(*iph)) {
    return 0;
  }

  /*
   * tot_len is stored in network byte order and has to be converted before
   * it is used in arithmetic. A 16-bit swap is its own inverse, so
   * bpf_htons() serves as ntohs() here.
   */
  __u32 ip_total_length = bpf_htons(iph->tot_len);

  /*
   * The TCP header starts after the whole IP header, options included, so
   * it is located with ip_hlen rather than sizeof(*iph).
   */
  struct tcphdr *tcph = data + nh_off + ip_hlen;
  if ((void *)(tcph + 1) > data_end) {
    return 0;
  }

  /* doff counts 32-bit words as well. */
  __u32 tcp_hlen = tcph->doff << 2;
  if (tcp_hlen < sizeof(*tcph)) {
    return 0;
  }

  /* Reject inconsistent lengths so the unsigned subtraction cannot wrap. */
  if (ip_total_length < ip_hlen + tcp_hlen) {
    return 0;
  }
  __u32 plength = ip_total_length - ip_hlen - tcp_hlen;
  if (plength < min_payload) {
    return 0;
  }

  /* Only the four bytes that are compared are loaded from the payload. */
  __u32 poffset = nh_off + ip_hlen + tcp_hlen;
  if (load_byte(skb, poffset) == 'H' && load_byte(skb, poffset + 1) == 'T' &&
      load_byte(skb, poffset + 2) == 'T' &&
      load_byte(skb, poffset + 3) == 'P') {
    return 1;
  }
  return 0;
}
/* License string emitted into the "license" section of the object file. */
char _license[] SEC("license") = "GPL";
将上面文件编译成bpf的目标文件。
使用如下命令进行加载:
# tc qdisc add dev eth0 handle 0: ingress
# tc filter add dev eth0 ingress bpf obj classifier.o flowid 0:
# tc exec bpf dbg
如果有http数据,就会看到对应的打印。