在 full-nat two-arm 模式下,后端 real server 获取到请求的来源都是 dpvs local ip, 如何获取真实的 client ip 呢?这就需要 toa 模块,原理都说是修改了 rs 机器获取 ip 的函数,具体如何实现呢?
tcp option 字段
关于 tcp header 可以参考 wiki, 我把截图贴上来
我们知道 ip header 里 src address 肯定是 dpvs local ip, 否则数据包无法发送。那么 client ip 放哪里呢?就是在 tcp header 的 option 字段中。
option 字段最长 40 bytes. 每填充一个选项由三部分构成:op-kind, op-length, op-data. 最常用的 mss 字段就是放在 option 里。只要构建一个不冲突的 op-kind 就可以把 client ip 填充进去。ipv4 的长度是 4 bytes, ipv6 是 16 bytes. 看来整个 option 字段在不久就会不够用。
dpvs 写 tcp option address
DPVS fullnat 在调用 tcp_fnat_in_handler 时会调用 tcp_in_add_toa,把 client 地址作为 tcp option 写到 mbuf。
/*
 * Insert a TOA (TCP option address) option carrying the client address and
 * port of @conn immediately after the basic TCP header of @mbuf.
 *
 * @conn: connection whose af/cport/caddr are written into the option.
 * @mbuf: packet to modify; mbuf->userdata, when set, is read as a
 *        struct route_entry to obtain the MTU.
 * @tcph: TCP header inside @mbuf.
 *
 * Returns EDPVS_OK on success, otherwise:
 *   EDPVS_NOTSUPP - conn->af is neither AF_INET nor AF_INET6
 *   EDPVS_NOROUTE - no route entry and no conn->in_dev to take the MTU from
 *   EDPVS_FRAG    - the enlarged packet would exceed the MTU
 *   EDPVS_NOROOM  - no space left in the TCP header or in the mbuf tail
 *   EDPVS_INVPKT  - mbuf_may_pull() failed
 *
 * Checksums are not updated here; see the note before the final return.
 */
static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf,
                                 struct tcphdr *tcph)
{
    uint32_t mtu;
    struct tcpopt_addr *toa;
    uint32_t tcp_opt_len;
    uint8_t *p, *q, *tail;
    struct route_entry *rt;

    if (unlikely(conn->af != AF_INET && conn->af != AF_INET6))
        return EDPVS_NOTSUPP;

    tcp_opt_len = conn->af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR;

    /*
     * check if we can add the new option
     */

    /* packet length and tcp option length checking */
    if ((rt = mbuf->userdata) != NULL) {
        mtu = rt->mtu;
    } else if (conn->in_dev) { /* no route for fast-xmit */
        mtu = conn->in_dev->mtu;
    } else {
        RTE_LOG(DEBUG, IPVS, "add toa: MTU unknown.\n");
        return EDPVS_NOROUTE;
    }

    /* Both operands are unsigned: test mtu < tcp_opt_len first so that the
     * subtraction cannot wrap around and let an oversized packet through. */
    if (unlikely(mtu < tcp_opt_len || mbuf->pkt_len > (mtu - tcp_opt_len))) {
        RTE_LOG(DEBUG, IPVS, "add toa: need fragment, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_FRAG;
    }

    /* maximum TCP header is 60, and 40 for options */
    if (unlikely((60 - (tcph->doff << 2)) < tcp_opt_len)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no TCP header room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /* check tail room and expand mbuf.
     * have to pull all bits in segments for later operation. */
    if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0))
        return EDPVS_INVPKT;

    tail = (uint8_t *)rte_pktmbuf_append(mbuf, tcp_opt_len);
    if (unlikely(!tail)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no mbuf tail room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /*
     * now add address option
     */

    /* Shift everything after the basic TCP header (existing options and
     * payload) towards the tail by tcp_opt_len bytes. Copying runs from the
     * last byte backwards so the overlapping regions are not clobbered.
     * @p is last data byte,
     * @q is new position of last data byte */
    p = tail - 1;
    q = p + tcp_opt_len;
    while (p >= ((uint8_t *)tcph + sizeof(struct tcphdr))) {
        *q = *p;
        p--, q--;
    }

    /* insert toa right after TCP basic header */
    toa = (struct tcpopt_addr *)(tcph + 1);
    toa->opcode = TCP_OPT_ADDR;
    toa->opsize = tcp_opt_len;
    toa->port = conn->cport;

    if (conn->af == AF_INET) {
        struct tcpopt_ip4_addr *toa_ip4 = (struct tcpopt_ip4_addr *)(tcph + 1);

        toa_ip4->addr = conn->caddr.in;
    } else {
        struct tcpopt_ip6_addr *toa_ip6 = (struct tcpopt_ip6_addr *)(tcph + 1);

        toa_ip6->addr = conn->caddr.in6;
    }

    /* reset tcp header length (doff counts 32-bit words) */
    tcph->doff += tcp_opt_len >> 2;

    /* reset ip header total length */
    if (conn->af == AF_INET)
        ip4_hdr(mbuf)->total_length =
            htons(ntohs(ip4_hdr(mbuf)->total_length) + tcp_opt_len);
    else
        ip6_hdr(mbuf)->ip6_plen =
            htons(ntohs(ip6_hdr(mbuf)->ip6_plen) + tcp_opt_len);

    /* tcp csum will be recalculated later,
     * and so will the IP hdr csum since the total length has changed. */
    return EDPVS_OK;
}
- 根据 ipv4 ipv6 来确定 toa 需要的长度:1 byte op-kind, 1 byte op-length, 2 bytes 端口,再加上地址长度。所以 ipv4 共需 8 bytes, ipv6 共需 20 bytes
- TCP header 最大长度 60,option 最大长度 40,确保不会超过
- rte_pktmbuf_append 将 mbuf 扩展空间,使其能容纳 toa
- 填充 tcpopt_addr 结构体,op-kind TCP_OPT_ADDR 是 254,非官方 tcp/ip 认可的值。端口值是 conn->cport, 最后填充 conn->caddr.in 或 conn->caddr.in6 地址。
real server 安装 toa
很简单,make 编译后生成 toa.ko 驱动,然后 insmod toa.ko 即可。所有 real server 都需要安装。先看下 module_init 函数 toa_init
/*
 * Module entry point: allocate the per-cpu statistics, register the
 * "toa_stats" proc entry, resolve the kernel symbols TOA depends on and
 * install the TOA hooks.
 *
 * Returns 0 on success or a negative errno on failure. A positive value
 * must not be returned: the module loader only treats negative values as
 * failure, so a positive return would leave the module loaded even though
 * the statistics were freed and no hook was installed.
 */
static int __init
toa_init(void)
{
    TOA_INFO("TOA " TOA_VERSION " by pukong.wjm\n");

    /* alloc statistics array for toa */
    ext_stats = alloc_percpu(struct toa_stat_mib);
    if (NULL == ext_stats)
        return -ENOMEM;

    /* NOTE(review): the result of proc_net_fops_create() is not checked;
     * the module keeps loading without the stats file if it fails. */
    proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops);

    /* get the address of function sock_def_readable
     * so later we can know whether the sock is for rpc, tux or others
     */
    sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable");
    TOA_INFO("CPU [%u] sk_data_ready_addr = "
             "kallsyms_lookup_name(sock_def_readable) = %lu\n",
             smp_processor_id(), sk_data_ready_addr);
    if (0 == sk_data_ready_addr) {
        TOA_INFO("cannot find sock_def_readable.\n");
        goto err;
    }

#ifdef TOA_IPV6_ENABLE
    if (0 != get_kernel_ipv6_symbol()) {
        TOA_INFO("get ipv6 struct from kernel fail.\n");
        goto err;
    }
#endif

    /* hook funcs for parse and get toa */
    hook_toa_functions();

    TOA_INFO("toa loaded\n");
    return 0;

err:
    /* undo what was set up above, in reverse order */
    proc_net_remove(&init_net, "toa_stats");
    if (NULL != ext_stats) {
        free_percpu(ext_stats);
        ext_stats = NULL;
    }

    /* both failure paths are "required kernel symbol not found" */
    return -ENOENT;
}
- proc_net_fops_create 在 /proc 文件系统下注册 /proc/net/toa_stats 用于查看统计使用
- kallsyms_lookup_name 根据名称来获取 sock_def_readable 地址
- get_kernel_ipv6_symbol 如果支持 ipv6, 获取相应的回调函数地址
- hook_toa_functions 将 toa 功能 hook 进内核
/* replace the functions with our functions */
static inline int
hook_toa_functions(void)
{
/* hook inet_getname for ipv4 */
struct proto_ops *inet_stream_ops_p =
(struct proto_ops *)&inet_stream_ops;
/* hook tcp_v4_syn_recv_sock for ipv4 */
struct inet_connection_sock_af_ops *ipv4_specific_p =
(struct inet_connection_sock_af_ops *)&ipv4_specific;
inet_stream_ops_p->getname = inet_getname_toa;
TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n",
smp_processor_id(), inet_getname, inet_stream_ops_p->getname);
ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa;
TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n",
smp_processor_id(), tcp_v4_syn_recv_sock,
ipv4_specific_p->syn_recv_sock);
#ifdef TOA_IPV6_ENABLE
inet6_stream_ops_p->getname = inet6_getname_toa;
TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n",
smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname);
ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa;
TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n",
smp_processor_id(), tcp_v6_syn_recv_sock_org_pt,
ipv6_specific_p->syn_recv_sock);
#endif
return 0;
}
仔细看看也不难,就是将 inet ops 回调函数 getname 替换为 toa 的。但是我有问题,如果请求不来自 dpvs,普通的请求会不会也受影响?
可以看到 hook 了两个函数 tcp_v4_syn_recv_sock_toa 和 inet_getname_toa
real server 获取 client ip
当完成三次握手时调用 tcp_v4_syn_recv_sock_toa
/*
 * Replacement for tcp_v4_syn_recv_sock: lets the original function create
 * the child socket, then stores whatever get_toa_data() extracts from @skb
 * in the child's sk_user_data.
 *
 * sk_user_data is only written when the original function returned a socket
 * whose sk_user_data is still NULL. The return value is always the original
 * function's result.
 */
static struct sock *
tcp_v4_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb,
                         struct request_sock *req, struct dst_entry *dst)
{
    struct sock *child;

    TOA_DBG("tcp_v4_syn_recv_sock_toa called\n");

    /* delegate socket creation to the original implementation */
    child = tcp_v4_syn_recv_sock(sk, skb, req, dst);

    /* nothing to attach to, or the field is already in use */
    if (NULL == child || NULL != child->sk_user_data)
        return child;

    child->sk_user_data = get_toa_data(AF_INET, skb);
    if (NULL == child->sk_user_data) {
        TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT);
    } else {
        TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT);
    }
    TOA_DBG("tcp_v4_syn_recv_sock_toa: set "
            "sk->sk_user_data to %p\n",
            child->sk_user_data);

    return child;
}
- 调用原有函数 tcp_v4_syn_recv_sock 处理,也就是这里兼容了原有逻辑,普通非 toa 请求也会正常获取到 ip
- 额外调用 get_toa_data 生成地址,可以看到地址放到了 sk->sk_user_data 字段。
/*
 * Walk the TCP options of @skb and return the TOA data for family @af.
 *
 * @af:  AF_INET, or AF_INET6 when TOA_IPV6_ENABLE is defined.
 * @skb: received segment; may be NULL.
 *
 * Return value:
 *   - AF_INET with an IPv4 TOA option: the first sizeof(void *) bytes of the
 *     option are packed into the returned pointer VALUE. It is not a
 *     dereferenceable address.
 *   - AF_INET6 (TOA_IPV6_ENABLE only): a struct toa_ip6_data allocated with
 *     kmalloc(GFP_ATOMIC). An IPv4 TOA option is converted to an IPv4-mapped
 *     IPv6 address. Who frees this memory is not visible in this function.
 *   - NULL when there is no skb, no matching option, the option list is
 *     malformed, or the allocation fails.
 */
static void *get_toa_data(int af, struct sk_buff *skb)
{
    struct tcphdr *th;
    int length;
    unsigned char *ptr;

    TOA_DBG("get_toa_data called\n");

    if (NULL == skb)
        return NULL;

    th = tcp_hdr(skb);
    /* bytes of options that follow the basic TCP header */
    length = (th->doff * 4) - sizeof(struct tcphdr);
    ptr = (unsigned char *) (th + 1);

    while (length > 0) {
        int opcode = *ptr++;
        int opsize;

        switch (opcode) {
        case TCPOPT_EOL:
            return NULL;
        case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
            length--;
            continue;
        default:
            /* a kind byte with no room left for its length byte:
             * stop instead of reading past the option area */
            if (length < 2)
                return NULL;
            opsize = *ptr++;
            if (opsize < 2) /* "silly options" */
                return NULL;
            if (opsize > length)
                /* don't parse partial options */
                return NULL;

            if (TCPOPT_TOA == opcode &&
                TCPOLEN_IP4_TOA == opsize) {
                struct toa_ip4_data tdata;
                void *ret_ptr = NULL;

                /* ptr is past kind and length; back up to the option start */
                memcpy(&tdata, ptr - 2, sizeof(tdata));
                TOA_DBG("af = %d, find toa data: ip = "
                        TOA_NIPQUAD_FMT", port = %u\n",
                        af,
                        TOA_NIPQUAD(tdata.ip),
                        ntohs(tdata.port));
                if (af == AF_INET) {
                    /* encode the option bytes in the pointer value itself.
                     * NOTE(review): copies sizeof(void *) bytes, so this
                     * looks like it relies on pointers being at least as
                     * large as struct toa_ip4_data - confirm. */
                    memcpy(&ret_ptr, &tdata,
                           sizeof(ret_ptr));
                    TOA_DBG("coded ip4 toa data: %p\n",
                            ret_ptr);
                    return ret_ptr;
                }
#ifdef TOA_IPV6_ENABLE
                else if (af == AF_INET6) {
                    struct toa_ip6_data *ptr_toa_ip6 =
                        kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                    if (!ptr_toa_ip6) {
                        return NULL;
                    }
                    ptr_toa_ip6->opcode = opcode;
                    ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA;
                    /* kmalloc() does not zero memory: carry the client
                     * port over so it is not left uninitialized */
                    ptr_toa_ip6->port = tdata.port;
                    /* IPv4-mapped IPv6 address ::ffff:a.b.c.d */
                    ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0,
                                  htonl(0x0000FFFF), tdata.ip);
                    TOA_DBG("coded ip6 toa data: %p\n",
                            ptr_toa_ip6);
                    TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                    return ptr_toa_ip6;
                }
#endif
            }

#ifdef TOA_IPV6_ENABLE
            if (TCPOPT_TOA == opcode &&
                TCPOLEN_IP6_TOA == opsize &&
                af == AF_INET6) {
                struct toa_ip6_data *ptr_toa_ip6 =
                    kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                if (!ptr_toa_ip6) {
                    return NULL;
                }
                memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data));

                /* port is copied straight from the packet; convert it for
                 * display, as the IPv4 debug message above does */
                TOA_DBG("find toa_v6 data : ip = "
                        TOA_NIP6_FMT", port = %u,"
                        " coded ip6 toa data: %p\n",
                        TOA_NIP6(ptr_toa_ip6->in6_addr),
                        ntohs(ptr_toa_ip6->port),
                        ptr_toa_ip6);
                TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                return ptr_toa_ip6;
            }
#endif
            /* not ours: skip the rest of this option */
            ptr += opsize - 2;
            length -= opsize;
        }
    }

    return NULL;
}
- 遍历所有 option, 根据 opcode 来处理 ipv4 或是 ipv6
- 将 toa struct 复制一份,然后返回
然后当 real server 调用 getpeername 或是 getsockname 时调用 inet_getname_toa 来获取 ip(只有参数 peer 非 0 时才会替换成 toa 里的地址),如果是 ipv6 则调用 inet6_getname_toa
/*
 * Replacement for inet_getname: calls the original function, then, for a
 * peer-address query on a socket that carries IPv4 TOA data, overwrites the
 * returned address and port with the values from the TOA option.
 *
 * @sock:      socket being queried.
 * @uaddr:     output address, filled by inet_getname() and possibly patched.
 * @uaddr_len: output length, passed through to inet_getname().
 * @peer:      non-zero when the peer address is requested; the address is
 *             only patched in that case.
 *
 * Returns whatever inet_getname() returned.
 */
int
inet_getname_toa(struct socket *sock, struct sockaddr *uaddr,
                 int *uaddr_len, int peer)
{
    int retval = 0;
    struct sock *sk = sock->sk;
    struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
    struct toa_ip4_data tdata;

    TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n",
            sk->sk_user_data);

    /* call original one */
    retval = inet_getname(sock, uaddr, uaddr_len, peer);

    /* no need to get client ip */
    if (retval != 0 || NULL == sk->sk_user_data || !peer) {
        TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT);
        return retval;
    }

    /* only sockets whose sk_data_ready is still sock_def_readable are
     * handled; for any other, sk_user_data is left untouched */
    if (sk_data_ready_addr != (unsigned long) sk->sk_data_ready) {
        TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT);
        return retval;
    }

    /* The TOA bytes are stored in the pointer VALUE of sk_user_data, so
     * copy from the field itself rather than from what it points to.
     * NOTE(review): this looks like it relies on pointers being at least
     * as large as struct toa_ip4_data - confirm. */
    memcpy(&tdata, &sk->sk_user_data, sizeof(tdata));

    if (TCPOPT_TOA == tdata.opcode &&
        TCPOLEN_IP4_TOA == tdata.opsize) {
        TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
        TOA_DBG("inet_getname_toa: set new sockaddr, ip "
                TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT
                ", port %u -> %u\n",
                TOA_NIPQUAD(sin->sin_addr.s_addr),
                TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port),
                ntohs(tdata.port));
        sin->sin_port = tdata.port;
        sin->sin_addr.s_addr = tdata.ip;
    } else { /* sk_user_data doesn't belong to us */
        TOA_INC_STATS(ext_stats,
                      GETNAME_TOA_MISMATCH_CNT);
        TOA_DBG("inet_getname_toa: invalid toa data, "
                "ip "TOA_NIPQUAD_FMT" port %u opcode %u "
                "opsize %u\n",
                TOA_NIPQUAD(tdata.ip), ntohs(tdata.port),
                tdata.opcode, tdata.opsize);
    }

    return retval;
}
- 调用原有 inet_getname 函数,获取 ip,兼容原有内核逻辑
- 判断 sk_user_data 不为空,并且结构体 op-kind op-length 与 ipv4 toa 的相等,获取 ip port ,并填充 sin
小结
实现原理还真简单,只不过有两个隐患。
- 如果 option 以后扩充其它内容,长度不够咋办?资源本身就不多
- op-kind 254 现在不被 tcp/ip 官方认可,以后会不会被占用?