关于iptables的工作原理,主要分为三个方面:用户程序对规则的处理,内核对用户命令的处理,以及内核中netfilter对数据包的过滤(Ref:netfilter分析3-钩子函数执行流程)。
本文大致分析iptables用户态程序如何解析规则,并将规则配置到内核中。以如下命令为例:
iptables -A INPUT -i eth0 -p tcp -s 192.168.100.0/24 --dport 22 -m state --state NEW,ESTABLISHED -j ACCEPT
iptables -A OUTPUT -o eth0 -p tcp --sport 22 -m state --state ESTABLISHED -j ACCEPT
主要分析第一句:
iptables -A INPUT -i eth0 -p tcp -s 192.168.100.0/24 --dport 22 -m state --state NEW,ESTABLISHED -j ACCEPT
用户空间
代码版本:iptables-1.8.7。
iptables的客户端和内核共享一些数据结构。例如:
ipt_entry 、xt_entry_match、xt_tcp。
/*
 * One iptables rule in the layout shared by the userspace client and the
 * kernel: a fixed header followed by variable-length data in elems[]
 * (the matches, if any, then the target).
 */
struct ipt_entry {
struct ipt_ip ip;
/* Mark with fields that we care about. */
unsigned int nfcache;
/* Size of ipt_entry + matches */
__u16 target_offset;
/* Size of ipt_entry + matches + target */
__u16 next_offset;
/* Back pointer */
unsigned int comefrom;
/* Packet and byte counters. */
struct xt_counters counters;
/* The matches (if any), then the target. */
unsigned char elems[0];
};
/*
 * Header placed in front of one match extension's payload.
 * The union gives three views that all begin with the 16-bit match_size:
 * the userspace view (extension name + revision), the kernel view
 * (pointer to the struct xt_match), and the bare total length.
 * The extension-specific payload follows in data[].
 */
struct xt_entry_match {
union {
struct {
__u16 match_size;
/* Used by userspace */
char name[XT_EXTENSION_MAXNAMELEN];
__u8 revision;
} user;
struct {
__u16 match_size;
/* Used inside the kernel */
struct xt_match *match;
} kernel;
/* Total length */
__u16 match_size;
} u;
unsigned char data[0];
};
/*
 * Payload of the "tcp" match: source/destination port ranges plus
 * option, flag mask/compare and inversion bytes.
 */
struct xt_tcp {
__u16 spts[2]; /* Source port range. */
__u16 dpts[2]; /* Destination port range. */
__u8 option; /* TCP Option iff non-zero*/
__u8 flg_mask; /* TCP flags mask byte */
__u8 flg_cmp; /* TCP flags compare byte */
__u8 invflags; /* Inverse flags */
};
主函数为iptables_main(iptables-standalone.c)。
/*
 * Main function of the iptables client (abridged excerpt: the declaration
 * of ret and the final return statement are not shown).
 * Starts with the "filter" table and a NULL handle, lets do_command4()
 * parse the command line, then commits the result and frees the handle.
 */
int
iptables_main(int argc, char *argv[])
{
char *table = "filter";
struct xtc_handle *handle = NULL;
/* table and handle are passed by address so do_command4() can set them. */
ret = do_command4(argc, argv, &table, &handle, false);
/* Only a non-zero result from do_command4() leads to the commit. */
if (ret) {
ret = iptc_commit(handle);
iptc_free(handle);
}
}
-A INPUT的解析代码:
/*
 * Abridged excerpt of do_command4(): only the branch for option 'A'
 * (-A <chain>) is shown; the enclosing option switch is not shown.
 */
int do_command4(int argc, char *argv[], char **table,
struct xtc_handle **handle, bool restore)
{
case 'A':
/* Record CMD_APPEND as the requested command. */
add_command(&command, CMD_APPEND, CMD_NONE,
cs.invert);
/* The option argument is the chain name (INPUT in the example). */
chain = optarg;
break;
}
-i eth0的解析代码:
/*
 * Abridged excerpt of do_command4(): only the branch for option 'i'
 * (-i <interface>) is shown; the enclosing option switch is not shown.
 */
int do_command4(int argc, char *argv[], char **table,
struct xtc_handle **handle, bool restore)
{
case 'i':
/* An empty interface name is reported as a parameter problem. */
if (*optarg == '\0')
xtables_error(PARAMETER_PROBLEM,
"Empty interface is likely to be "
"undesired");
/* Note OPT_VIANAMEIN in cs.options (with cs.invert applied to invflags). */
set_option(&cs.options, OPT_VIANAMEIN, &cs.fw.ip.invflags,
cs.invert);
/* Fill the rule's input-interface name and mask from the argument. */
xtables_parse_interface(optarg,
cs.fw.ip.iniface,
cs.fw.ip.iniface_mask);
break;
}
-p tcp -s 192.168.100.0/24 --dport 22
ip段(192.168.100.0/24)的解析:
/*
 * Abridged excerpt of do_command4(): parsing of the source and destination
 * host/network specifications (e.g. 192.168.100.0/24).
 */
int do_command4(int argc, char *argv[], char **table,
struct xtc_handle **handle, bool restore)
{
/* Source spec -> address array, mask array and element count. */
if (shostnetworkmask)
xtables_ipparse_multiple(shostnetworkmask, &saddrs,
&smasks, &nsaddrs);
/* Destination spec -> address array, mask array and element count. */
if (dhostnetworkmask)
xtables_ipparse_multiple(dhostnetworkmask, &daddrs,
&dmasks, &ndaddrs);
}
解析--dport 22需要用到tcp_match模块;由于命令中已经通过-p tcp指定了协议,该模块会按协议名被加载(见下文的load_proto)。
/*
 * Userspace descriptor of the "tcp" match extension.
 * Both size fields are the aligned size of struct xt_tcp; option parsing is
 * delegated to tcp_parse and the extension's own options come from tcp_opts.
 */
static struct xtables_match tcp_match = {
.family = NFPROTO_UNSPEC,
.name = "tcp",
.version = XTABLES_VERSION,
.size = XT_ALIGN(sizeof(struct xt_tcp)),
.userspacesize = XT_ALIGN(sizeof(struct xt_tcp)),
.help = tcp_help,
.init = tcp_init,
.parse = tcp_parse,
.print = tcp_print,
.save = tcp_save,
.extra_opts = tcp_opts,
.xlate = tcp_xlate,
};
相应的解析函数:
/*
 * Dispatch the current option character (cs->c) to an extension.
 * Abridged excerpt: the declarations of m and matchp and the code after
 * the protocol-loading branch are not shown.
 *
 * Tried in order:
 *   1. the selected target, if cs->c lies in its option range;
 *   2. each not-yet-completed match in cs->matches whose option range
 *      contains cs->c;
 *   3. loading the match for the rule's protocol via load_proto().
 *
 * Returns 0 once a target/match parser was called, 1 after a protocol match
 * was loaded (asks the caller to rerun getopt).
 */
int command_default(struct iptables_command_state *cs,
struct xtables_globals *gl)
{
/* 1. Option belongs to the target's range [option_offset, +SCALE). */
if (cs->target != NULL &&
(cs->target->parse != NULL || cs->target->x6_parse != NULL) &&
cs->c >= cs->target->option_offset &&
cs->c < cs->target->option_offset + XT_OPTION_OFFSET_SCALE) {
xtables_option_tpcall(cs->c, cs->argv, cs->invert,
cs->target, &cs->fw);
return 0;
}
/* 2. Walk the matches already attached to this rule. */
for (matchp = cs->matches; matchp; matchp = matchp->next) {
m = matchp->match;
/* Skip completed matches and matches without any parser. */
if (matchp->completed ||
(m->x6_parse == NULL && m->parse == NULL))
continue;
/* Skip matches whose option range does not contain cs->c. */
if (cs->c < matchp->match->option_offset ||
cs->c >= matchp->match->option_offset + XT_OPTION_OFFSET_SCALE)
continue;
xtables_option_mpcall(cs->c, cs->argv, cs->invert, m, &cs->fw);
return 0;
}
/* Try loading protocol */
m = load_proto(cs);
if (m != NULL) {
size_t size;
cs->proto_used = 1;
/* Zeroed buffer: aligned xt_entry_match header + the match's payload. */
size = XT_ALIGN(sizeof(struct xt_entry_match)) + m->size;
m->m = xtables_calloc(1, size);
m->m->u.match_size = size;
strcpy(m->m->u.user.name, m->name);
m->m->u.user.revision = m->revision;
xs_init_match(m);
/* Merge the new match's options into the global option table. */
if (m->x6_options != NULL)
gl->opts = xtables_options_xfrm(gl->orig_opts,
gl->opts,
m->x6_options,
&m->option_offset);
else
gl->opts = xtables_merge_options(gl->orig_opts,
gl->opts,
m->extra_opts,
&m->option_offset);
if (gl->opts == NULL)
xtables_error(OTHER_PROBLEM, "can't alloc memory!");
/* Step optind back; presumably so the same option is parsed again
 * with the merged option table -- confirm against the caller. */
optind--;
/* Indicate to rerun getopt *immediately* */
return 1;
}
}
/*
 * Hand option c to match m's parser.
 * Abridged excerpt: only the path for matches without x6_parse is shown.
 */
void xtables_option_mpcall(unsigned int c, char **argv, bool invert,
struct xtables_match *m, void *fw)
{
if (m->x6_parse == NULL) {
/* Legacy parser receives the option id relative to the match's
 * option_offset, plus the match flags and the match buffer. */
if (m->parse != NULL)
m->parse(c - m->option_offset, argv, invert,
&m->mflags, fw, &m->m);
return;
}
}
tcp_parse会将端口数据写入struct xt_tcp中。
load_proto会按照protocol查找并加载对应的xtables_match。
/*
 * Find the match extension for the rule's protocol.
 * Returns NULL when should_load_proto() declines; otherwise returns whatever
 * find_proto() yields for cs->protocol, attached to cs->matches.
 */
struct xtables_match *load_proto(struct iptables_command_state *cs)
{
	struct xtables_match *proto_match = NULL;

	if (should_load_proto(cs))
		proto_match = find_proto(cs->protocol, XTF_TRY_LOAD,
					 cs->options & OPT_NUMERIC,
					 &cs->matches);

	return proto_match;
}
/*
 * Resolve a protocol name to its match extension through
 * xtables_find_match(). The nolookup argument is accepted but not used in
 * the code shown here.
 */
static struct xtables_match *
find_proto(const char *pname, enum xtables_tryload tryload,
	   int nolookup, struct xtables_rule_match **matches)
{
	struct xtables_match *found;

	found = xtables_find_match(pname, tryload, matches);
	return found;
}
命令行中的数据会加载到struct xt_entry_match。之后被复制到struct ipt_entry中。
static struct ipt_entry *
generate_entry(const struct ipt_entry *fw,
struct xtables_rule_match *matches,
struct xt_entry_target *target)
{
unsigned int size;
struct xtables_rule_match *matchp;
struct ipt_entry *e;
size = sizeof(struct ipt_entry);
for (matchp = matches; matchp; matchp = matchp->next)
size += matchp->match->m->u.match_size;
e = xtables_malloc(size + target->u.target_size);
*e = *fw;
e->target_offset = size;
e->next_offset = size + target->u.target_size;
size = 0;
for (matchp = matches; matchp; matchp = matchp->next) {
//复制match中的数据
memcpy(e->elems + size, matchp->match->m, matchp->match->m->u.match_size);
size += matchp->match->m->u.match_size;
}
memcpy(e->elems + size, target, target->u.target_size);
return e;
}
数据复制:append_entry把解析出的源/目的地址和掩码写入规则,iptc_append_entry再把整条规则复制到新分配的缓存rule中。
/*
 * Append the rule fw to chain once for every (source, destination) address
 * pair. Abridged excerpt: the declarations of i, j and ret are not shown.
 *
 * fw's ip.src/ip.smsk/ip.dst/ip.dmsk fields are overwritten in place for
 * each combination before the rule is handed to iptc_append_entry().
 * The results of all appends are AND-ed into ret, which is returned.
 */
static int
append_entry(const xt_chainlabel chain,
struct ipt_entry *fw,
unsigned int nsaddrs,
const struct in_addr saddrs[],
const struct in_addr smasks[],
unsigned int ndaddrs,
const struct in_addr daddrs[],
const struct in_addr dmasks[],
int verbose,
struct xtc_handle *handle)
{
for (i = 0; i < nsaddrs; i++) {
fw->ip.src.s_addr = saddrs[i].s_addr;
fw->ip.smsk.s_addr = smasks[i].s_addr;
for (j = 0; j < ndaddrs; j++) {
fw->ip.dst.s_addr = daddrs[j].s_addr;
fw->ip.dmsk.s_addr = dmasks[j].s_addr;
if (verbose)
print_firewall_line(fw, handle);
/* A single zero result clears ret for the whole call. */
ret &= iptc_append_entry(chain, fw, handle);
}
}
return ret;
}
/*
 * Copy rule e into a newly allocated cache rule.
 * Abridged excerpt: the return type, the declarations of c and r, and the
 * code that attaches r to the chain are not shown.
 * On allocation failure errno is set to ENOMEM and 0 is returned.
 */
iptc_append_entry(const IPT_CHAINLABEL chain,
const STRUCT_ENTRY *e,
struct xtc_handle *handle)
{
/* e->next_offset is the full size of the rule (header + matches + target). */
if (!(r = iptcc_alloc_rule(c, e->next_offset))) {
DEBUGP("unable to allocate rule for chain `%s'\n", chain);
errno = ENOMEM;
return 0;
}
memcpy(r->entry, e, e->next_offset);
}
/*
 * allocate and initialize a new rule for the cache
 *
 * Abridged excerpt: the allocation of r itself is not shown; the visible
 * part records the owning chain and the entry size in the new rule.
 */
static struct rule_head *iptcc_alloc_rule(struct chain_head *c, unsigned int size)
{
r->chain = c;
r->size = size;
return r;
}
解析action,-j ACCEPT。
/*
 * Abridged excerpt of do_command4(): only the branch for option 'j'
 * (-j <target>) is shown; the enclosing option switch is not shown.
 */
int do_command4(int argc, char *argv[], char **table,
struct xtc_handle **handle, bool restore)
{
case 'j':
/* Note OPT_JUMP in cs.options, then resolve the target name. */
set_option(&cs.options, OPT_JUMP, &cs.fw.ip.invflags,
cs.invert);
command_jump(&cs, optarg);
break;
}
/*
 * Resolve the -j argument to a target extension and allocate its buffer.
 * Abridged excerpt: the declaration of size and any further initialisation
 * of the target are not shown.
 */
void command_jump(struct iptables_command_state *cs, const char *jumpto)
{
cs->jumpto = xt_parse_target(jumpto);
/* TRY_LOAD (may be chain name) */
cs->target = xtables_find_target(cs->jumpto, XTF_TRY_LOAD);
/* No target extension found: nothing to allocate. */
if (cs->target == NULL)
return;
/* Zeroed buffer: aligned xt_entry_target header + the target's payload. */
size = XT_ALIGN(sizeof(struct xt_entry_target)) + cs->target->size;
cs->target->t = xtables_calloc(1, size);
cs->target->t->u.target_size = size;
}
ACCEPT,DROP,QUEUE,RETURN对应的是standard target。
/*
 * Userspace descriptor of the "standard" target.
 * Its payload size is one aligned int; no parse callback is set in the
 * initialiser shown here.
 */
static struct xtables_target standard_target = {
.family = NFPROTO_UNSPEC,
.name = "standard",
.version = XTABLES_VERSION,
.size = XT_ALIGN(sizeof(int)),
.userspacesize = XT_ALIGN(sizeof(int)),
.help = standard_help,
};
xt_entry_target分配的大小:
/* Allocation size: aligned xt_entry_target header plus the target's own payload size. */
size = XT_ALIGN(sizeof(struct xt_entry_target)) + cs->target->size;
cs->target->t = xtables_calloc(1, size);
standard target的target->size为XT_ALIGN(sizeof(int)),因此最终分配出的结构体就是xt_standard_target。
/* Generic target header followed by a single int verdict. */
struct xt_standard_target {
struct xt_entry_target target;
int verdict;
};
整理成内核需要的格式,向内核提交:
/*
 * Compile the cached ruleset into a replace blob and submit it to the kernel.
 * Abridged excerpt: the allocation of repl, the declarations of ret,
 * new_number and new_size, error handling and the counter mapping are not
 * shown.
 */
int
TC_COMMIT(struct xtc_handle *handle)
{
/* Replace, then map back the counters. */
STRUCT_REPLACE *repl;
/* Preparation pass: yields new_number and fills new_size. */
new_number = iptcc_compile_table_prep(handle, &new_size);
/* Compile the cache into repl. */
ret = iptcc_compile_table(handle, repl);
/* The option length covers the replace header plus repl->size bytes. */
ret = setsockopt(handle->sockfd, TC_IPPROTO, SO_SET_REPLACE, repl,
sizeof(*repl) + repl->size);
}
内核空间
/*
 * Kernel-side handler for the set-type socket options of ip_tables.
 * Abridged excerpt: the declaration of ret and any other commands are not
 * shown. IPT_SO_SET_REPLACE is routed to do_replace(); every other command
 * shown here yields -EINVAL.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
switch (cmd) {
case IPT_SO_SET_REPLACE:
ret = do_replace(sock_net(sk), user, len);
break;
default:
ret = -EINVAL;
}
return ret;
}
/*
 * Take a replacement ruleset from userspace and install it.
 * Abridged excerpt: the declarations (tmp, newinfo, loc_cpu_entry, ret), the
 * step that fills tmp, the free_newinfo label and the final return are not
 * shown.
 */
static int
do_replace(struct net *net, const void __user *user, unsigned int len)
{
/* Allocate table info with room for tmp.size bytes of entries. */
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
/* The entries follow the replace header (sizeof(tmp)) in the user buffer. */
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
/* Translate the copied entries; a non-zero result aborts the replace. */
ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
/* Install newinfo for the table named tmp.name. */
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
}
/*
 * Look up the named AF_INET table and replace its table info with newinfo.
 * Abridged excerpt: the declarations of oldinfo and ret, error handling and
 * the handling of oldinfo/counters are not shown.
 */
static int
__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_table_info *newinfo, unsigned int num_counters,
void __user *counters_ptr)
{
struct xt_table *t;
t = xt_request_find_table_lock(net, AF_INET, name);
/* The result of xt_replace_table() is kept in oldinfo; ret is passed by address. */
oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
}
/*
 * Abridged excerpt: only the assignment that makes newinfo the table's
 * private data is shown; validation, synchronisation and the return value
 * are not shown.
 */
struct xt_table_info *
xt_replace_table(struct xt_table *table,
unsigned int num_counters,
struct xt_table_info *newinfo,
int *error)
{
table->private = newinfo;
}
原有规则的处理
用户层调用setsockopt将数据配置到内核,do_replace函数会重新配置规则。但是用户可以多次执行iptables命令,这就引出一个问题:之前内核中的iptables规则到哪里去了呢?难道被冲掉了吗?
iptables在重新解析规则时,会调用getsockopt将内核中的规则拷贝出来,然后重新配置。
/*
 * Abridged excerpt of do_command4(): creation of the libiptc handle for
 * the selected table when the caller did not supply one.
 */
int do_command4(int argc, char *argv[], char **table,
struct xtc_handle **handle, bool restore)
{
/* only allocate handle if we weren't called with a handle */
if (!*handle)
*handle = iptc_init(*table);
}
/*
 * Create a handle for tablename and load the table's current entries from
 * the kernel. Abridged excerpt: the declarations (info, s, sockfd, h, tmp),
 * the socket creation, the error label and the final return are not shown.
 */
struct xtc_handle *
iptc_init(const char *tablename)
{
strcpy(info.name, tablename);
// Fetch the size information of the entries.
if (getsockopt(sockfd, TC_IPPROTO, SO_GET_INFO, &info, &s) < 0) {
close(sockfd);
return NULL;
}
h = alloc_handle(&info);
/* Initialize current state */
h->sockfd = sockfd;
h->info = info;
h->entries->size = h->info.size;
/* Request length: get-entries header plus info.size bytes of entries. */
tmp = sizeof(STRUCT_GET_ENTRIES) + h->info.size;
/* Second query: copy the existing entries into h->entries. */
if (getsockopt(h->sockfd, TC_IPPROTO, SO_GET_ENTRIES, h->entries,
&tmp) < 0)
goto error;
}
getsockopt(h->sockfd, TC_IPPROTO,SO_GET_ENTRIES, h->entries, &tmp) 从内核中拷贝原有的entries。
内核中对应的处理函数:
/*
 * Compat handler for the get-type socket options of ip_tables.
 *
 * Callers without CAP_NET_ADMIN in the socket's network namespace's user
 * namespace get -EPERM. IPT_SO_GET_INFO is answered by get_info() (with 1 as
 * its last argument), IPT_SO_GET_ENTRIES by compat_get_entries(); every
 * other command is forwarded to do_ipt_get_ctl().
 */
static int
compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd == IPT_SO_GET_INFO)
		return get_info(sock_net(sk), user, len, 1);

	if (cmd == IPT_SO_GET_ENTRIES)
		return compat_get_entries(sock_net(sk), user, len);

	return do_ipt_get_ctl(sk, cmd, user, len);
}
读取规则信息之后,iptables重新处理数据:
/*
 * parse an iptables blob into its pieces
 *
 * Abridged excerpt: the declarations of prev and num and any later passes
 * are not shown. The visible pass invokes cache_add_entry() for the entries
 * in h->entries->entrytable, bounded by h->entries->size.
 */
static int parse_table(struct xtc_handle *h)
{
/* First pass: over ruleset blob */
ENTRY_ITERATE(h->entries->entrytable, h->entries->size,
cache_add_entry, h, &prev, &num);
}
/*
 * main parser function: add an entry from the blob to the cache
 *
 * Abridged excerpt: only the branch for an entry located at a built-in hook
 * entry point is shown; the preceding `if` arm, the local declarations
 * (builtin, offset) and the new_rule label are not shown.
 * In the branch shown, -1 is returned with errno set to ENOMEM when the
 * chain head cannot be allocated.
 */
static int cache_add_entry(STRUCT_ENTRY *e,
			   struct xtc_handle *h,
			   STRUCT_ENTRY **prev,
			   unsigned int *num)
{
	else if ((builtin = iptcb_ent_is_hook_entry(e, h)) != 0) {
		/* A non-zero builtin indexes hooknames with an offset of -1. */
		struct chain_head *c =
			iptcc_alloc_chain_head((char *)hooknames[builtin-1],
						builtin);
		if (!c) {
			/*
			 * errno holds positive error numbers. The original
			 * line stored -ENOMEM here, unlike iptc_append_entry()
			 * which sets errno = ENOMEM.
			 */
			errno = ENOMEM;
			return -1;
		}
		/* Trace only after the NULL check, so that &c->rules is never
		 * formed from a NULL pointer. */
		DEBUGP_C("%u:%u new builtin chain: %p (rules=%p)\n",
			*num, offset, c, &c->rules);
		c->hooknum = builtin;
		__iptcc_p_add_chain(h, c, offset, num);
		/* FIXME: this is ugly. */
		goto new_rule;
	}
}
内核中在初始化table的时候,会配置chain。博客——netfilter分析2-表在内核的初始化——有更详尽的分析。
以filter表为例:
/*
 * Per-netns initialisation of the filter table.
 * Abridged excerpt: only the allocation of the initial replace blob from the
 * packet_filter table description is shown; the declaration of repl, table
 * registration and the return value are not shown.
 */
static int __net_init iptable_filter_table_init(struct net *net)
{
repl = ipt_alloc_initial_table(&packet_filter);
}
/*
 * Build the initial ruleset blob for an IPv4 table by expanding the
 * xt_alloc_initial_table() macro with the ipt/IPT prefixes.
 * The macro body refers to the parameter info by name, which is why the
 * argument does not appear in the call below (see the macro definition).
 */
void *ipt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(ipt, IPT);
}
/*
 * Statement-expression macro that allocates and fills an initial table blob:
 * a <type>_replace header followed by an array of <type>_standard entries,
 * with room for one <type>_error terminator after them.
 *
 * Abridged excerpt: nhooks, hook_mask, hooknum, bytes and i are used but
 * their declarations/initialisation (and the set-up of term) are not shown.
 *
 * - term_offset is the offset just past entries[nhooks], rounded up to the
 *   alignment of *term.
 * - For every set bit in hook_mask, the current byte offset is recorded as
 *   both hook_entry and underflow for that hook number, and one standard
 *   entry initialised with <typ2>_STANDARD_INIT(NF_ACCEPT) is stored.
 * - The value of the expression is tbl (obtained from kzalloc with
 *   GFP_KERNEL; no NULL check is visible in this excerpt).
 */
#define xt_alloc_initial_table(type, typ2) ({ \
struct { \
struct type##_replace repl; \
struct type##_standard entries[]; \
} *tbl; \
struct type##_error *term; \
size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \
__alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \
tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \
for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
if (!(hook_mask & 1)) \
continue; \
tbl->repl.hook_entry[hooknum] = bytes; \
tbl->repl.underflow[hooknum] = bytes; \
tbl->entries[i++] = (struct type##_standard) \
typ2##_STANDARD_INIT(NF_ACCEPT); \
bytes += sizeof(struct type##_standard); \
} \
tbl; \
})
Reference:
[1]25个iptables常用示例
[2]Netfilter之AF_INET协议族rule、match、target
[3]iptables 防火墙-filter表
[4]Netfilter 是如何工作的(二):表(table)与规则(rule)
[5]Netfilter是如何工作的(三):扩展匹配条件和动作
[6]netfilter分析2-表在内核的初始化
[7]netfilter分析3-钩子函数执行流程
[8]iptables性能 -- Kube-proxy引入user define chain思考
[9]ipset详解 使用ipset提高iptables的控制效率