本文主要分析TCP拥塞状态机实现中的函数tcp_fastretrans_alert(),并对一些相关函数做了介绍。
变量介绍
这些变量都在include/linux/tcp.h中声明,在net/ipv4/tcp.c中被赋初值。
u32 packets_out; /* 表示已经发出(仍在网络中)但尚未被确认的包,即以包为单位的SND.NXT - SND.UNA */
u32 sacked_out;
/* Packets, which arrived to receiver out of order and hence not ACKed.
* With SACK this number is simply amount of SACKed data. Even without
* SACKs it is easy to give pretty reliable estimate of this number, counting
* duplicate ACKs.
* 上面是sacked_out的英文解释,其实应该分两种情况来看,开和没开SACK选项:
* 如果开了SACK选项,那么这个值无疑就是表示被SACK的乱序包的个数,
* 如果没开SACK选项,那么该值就是表示dupack的个数。具体可参考tcp_add_reno_sack()函数相关代码.
*/
u32 fackets_out;/* SACK数和丢失包的总和,fackets_out = lost_out + sacked_out */
tcp_fastretrans_alert()函数被调用条件
(1) 每收到一个ACK时,如果当前拥塞状态不是Open。
(2) 或者收到的ACK不是普通ACK,即属于以下情况之一:
SACK,
Duplicate ACK,
ECE(ECN-Echo)
tcp_fastretrans_alert()函数实现细节
@kernel version 3.12/net/ipv4/tcp_input.c
/*
 * Drive the congestion-control state machine for one incoming ACK.
 *
 * In order, the function: sanity-fixes the SACK counters, handles ECE and
 * SACK reneging, checks whether the current state can be exited, performs
 * the per-state processing, marks lost packets, reduces cwnd and finally
 * kicks retransmission. Several paths return early before the last three
 * steps.
 *
 * @sk:            socket the ACK belongs to
 * @pkts_acked:    forwarded unchanged to tcp_try_undo_partial()
 * @prior_sacked:  subtracted from tp->sacked_out when computing
 *                 newly_acked_sacked (presumably the value of sacked_out
 *                 before this ACK was processed; caller not shown)
 * @prior_packets: tp->packets_out is subtracted from it when computing
 *                 newly_acked_sacked (presumably the value of packets_out
 *                 before this ACK was processed; caller not shown)
 * @is_dupack:     true when the ACK is a duplicate ACK
 * @flag:          FLAG_* bits describing the ACK; FLAG_ECE,
 *                 FLAG_DATA_SACKED and FLAG_SND_UNA_ADVANCED are tested here
 */
static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
int prior_sacked, int prior_packets, bool is_dupack, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Mark losses if this is a duplicate ACK, or if the ACK carries
 * FLAG_DATA_SACKED (per the original note: the SACK covered new data)
 * and tcp_fackets_out() exceeds the reordering threshold.
 */
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
int newly_acked_sacked = 0;
int fast_rexmit = 0;
/* packets_out is 0 but sacked_out is not: warn and reset sacked_out. */
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
/* sacked_out is 0 but fackets_out is not: warn and reset fackets_out. */
if (WARN_ON(!tp->sacked_out && tp->fackets_out))
tp->fackets_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required.
* Only prior_ssthresh is cleared here; the cwnd reduction itself is
* performed further down.
*/
if (flag & FLAG_ECE)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs.
* Per the original note, a reneging SACK means the newest ACK's ack_seq
* points into an already recorded SACK block, i.e. the recorded SACK
* information did not reflect the receiver's real state. When the helper
* reports reneging, nothing else is done for this ACK.
*/
if (tcp_check_sack_reneging(sk, flag))
return;
/* C. Check consistency of the current state.
* NOTE(review): helper not shown; presumably verifies that left_out does
* not exceed packets_out.
*/
tcp_verify_left_out(tp);
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed.
* In the Open state no retransmitted packets should be outstanding.
*/
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0; // clear the retransmission timestamp
/* snd_una is at or beyond high_seq: the current non-Open state may be exited. */
} else if (!before(tp->snd_una, tp->high_seq)) {
/* The icsk_ca_state values stand for different congestion states (the original article refers to a separate post; the link is empty). */
switch (icsk->icsk_ca_state) {
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
/* snd_una is strictly beyond high_seq: end the cwnd reduction and go back to Open. */
if (tp->snd_una != tp->high_seq) {
inet_csk(sk)->icsk_retrans_ops->end_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
if (tcp_is_reno(tp)) /* connection without SACK (Reno) */
tcp_reset_reno_sack(tp); /* presumably resets the emulated sacked_out to 0 */
if (tcp_try_undo_recovery(sk)) /* try to undo; stop here if the helper says so */
return;
/* End the cwnd reduction. */
inet_csk(sk)->icsk_retrans_ops->end_cwnd_reduction(sk);
break;
}
}
/* Processing of the non-ordinary ACK. */
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
/* FLAG_SND_UNA_ADVANCED: snd_una was advanced by this ACK. */
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
/* Non-SACK connection and a duplicate ACK: account for it in sacked_out. */
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else
/* snd_una advanced: attempt a partial undo; its result replaces do_lost. */
do_lost = tcp_try_undo_partial(sk, pkts_acked);
/* Number of packets newly ACKed or SACKed by this ACK. */
newly_acked_sacked = prior_packets - tp->packets_out +
tp->sacked_out - prior_sacked;
break;
/* Handling after a retransmission timeout. */
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp); /* presumably resets sacked_out to 0 */
if (is_dupack)
tcp_add_reno_sack(sk);
}
/* Number of packets newly ACKed or SACKed by this ACK. */
newly_acked_sacked = prior_packets - tp->packets_out +
tp->sacked_out - prior_sacked;
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
/* Not yet time to enter Recovery: hand over to tcp_try_to_open() and stop. */
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag, newly_acked_sacked);
return;
}
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);/* simple retransmit (per the original note: without the backoff mechanism) */
return;
}
/* Otherwise enter Recovery state */
tcp_enter_recovery(sk, (flag & FLAG_ECE)); /* switch to the Recovery state */
fast_rexmit = 1;/* flag passed on to scoreboard update and cwnd reduction */
}
/* Mark packets as lost. */
if (do_lost || (tcp_is_fack(tp) && tcp_head_timeout(sk))) {
/* Update the scoreboard: mark lost and timed-out packets. */
tcp_update_scoreboard(sk, fast_rexmit);
}
/* Reduce cwnd. */
inet_csk(sk)->icsk_retrans_ops->cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
/* Retransmit the packets carrying the lost mark (helper not shown). */
tcp_xmit_retransmit_queue(sk);
}
tcp_add_reno_sack()函数
/*
 * SACK emulation for a connection without SACK: record one more
 * duplicate ACK in sacked_out, then re-check reordering and the
 * left_out consistency.
 */
static void tcp_add_reno_sack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* One duplicate ACK stands for one emulated SACKed packet. */
	tp->sacked_out += 1;

	/* The counter may now be too large; let the reordering check cap it. */
	tcp_check_reno_reordering(sk, 0);

	tcp_verify_left_out(tp);
}
tcp_check_reno_reordering()函数
/*
 * More duplicate ACKs arrived than the outstanding segments can explain
 * if there were no reordering: treat that as reordering (the only other
 * explanation would be a bug in the receiver's TCP).
 *
 * @sk:     socket being processed
 * @addend: added to packets_out to form the new reordering metric
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* sacked_out did not need capping: nothing to update. */
	if (!tcp_limit_reno_sacked(tp))
		return;

	tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
tcp_limit_reno_sacked()函数
/*
 * Cap sacked_out so that sacked_out plus the assumed number of holes
 * never exceeds packets_out.
 *
 * The number of holes is lost_out, but at least 1 and at most packets_out.
 *
 * Returns true if sacked_out had to be adjusted, false otherwise.
 */
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
	u32 holes = max(tp->lost_out, 1U);

	/* Never assume more holes than there are packets outstanding. */
	if (holes > tp->packets_out)
		holes = tp->packets_out;

	if ((tp->sacked_out + holes) <= tp->packets_out)
		return false;

	tp->sacked_out = tp->packets_out - holes;
	return true;
}
tcp_update_scoreboard()函数
/*
 * Account for newly detected lost packet(s).
 *
 * The way losses are marked depends on the connection type:
 *  - Reno (no SACK): mark only the head of the queue;
 *  - FACK: mark fackets_out - reordering packets, at least one;
 *  - plain SACK: mark up to sacked_out - reordering SACKed packets when
 *    that is non-negative, otherwise mark only the head, and only on a
 *    fast retransmit.
 * tcp_timeout_skbs() is called afterwards in every case.
 *
 * @sk:          socket being processed
 * @fast_rexmit: non-zero when Recovery was just entered by the caller
 */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_reno(tp)) {
		tcp_mark_head_lost(sk, 1, 1);
	} else if (tcp_is_fack(tp)) {
		const int beyond = tp->fackets_out - tp->reordering;

		/* Always mark at least one packet. */
		tcp_mark_head_lost(sk, beyond > 0 ? beyond : 1, 0);
	} else {
		const int sacked_upto = tp->sacked_out - tp->reordering;

		if (sacked_upto < 0) {
			if (fast_rexmit)
				tcp_mark_head_lost(sk, 1, 1);
		} else {
			tcp_mark_head_lost(sk, sacked_upto, 0);
		}
	}

	tcp_timeout_skbs(sk);
}
tcp_mark_head_lost()函数
/* Detect loss in event "A" above by marking head of queue up as lost.
* For FACK or non-SACK(Reno) senders, the first "packets" number of segments
* are considered lost. For RFC3517 SACK, a segment is considered lost if it
* has at least tp->reordering SACKed segments above it; "packets" refers to
* the maximum SACKed segments to pass before reaching this limit.
* high_seq:可以标记为lost的段序号的最大值。
* mark_head: 为1表示只需要标记发送队列的第一个段。
*/
/*
 * Walk the write queue from its head (or from the cached hint) and mark
 * segments as lost until the "packets" budget is used up.
 *
 * @sk:        socket whose write queue is scanned
 * @packets:   counting limit; the walk stops once the running count
 *             exceeds it (a multi-packet skb may be fragmented so that
 *             exactly "packets" are covered)
 * @mark_head: non-zero to mark at most one segment, the queue head
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt, oldcnt;
int err;
unsigned int mss;
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
/* The number of packets to mark must not exceed the packets sent out. */
WARN_ON(packets > tp->packets_out);
/* A hint left by an earlier pass exists: resume from it. */
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;/* start the walk at the hinted segment */
cnt = tp->lost_cnt_hint;/* count accumulated before the hinted segment */
/* Head already handled? */
/* Only the head is wanted but the hint is no longer the queue head: nothing to do. */
if (mark_head && skb != tcp_write_queue_head(sk))
return;
} else {
skb = tcp_write_queue_head(sk);/* start at the first segment of the write queue */
cnt = 0;/* nothing counted yet */
}
tcp_for_write_queue_from(skb, sk) {/* walk the write queue starting at skb */
if (skb == tcp_send_head(sk))
break;/* reached the send head: stop */
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;/* cache the running count together with the hint */
/* Segments ending beyond loss_high are not marked. */
if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
break;
oldcnt = cnt;
/* FACK and Reno count every segment; plain SACK counts only SACKed ones. */
if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);/* add this skb's packet count */
/* Budget exceeded: decide whether to stop or to split this skb. */
if (cnt > packets) {
/* Stop for plain SACK, for an skb that is itself SACKed, or when
 * the budget was already reached before this skb.
 */
if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
/* Otherwise fragment the skb at (packets - oldcnt) * mss bytes so
 * that only the part within the budget is marked; give up if the
 * fragmentation fails.
 */
mss = skb_shinfo(skb)->gso_size;
err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
if (err < 0)
break;
cnt = packets;
}
tcp_skb_mark_lost(tp, skb);
if (mark_head)/* only one segment was requested: done */
break;
}
tcp_verify_left_out(tp);
}
tcp_skb_mark_lost()函数
/*
 * Mark one segment as lost, unless it already carries the LOST mark or
 * has been SACKed. Marking updates the retransmit hint, adds the skb's
 * packet count to lost_out and sets TCPCB_LOST on the segment.
 */
static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	/* Already lost or SACKed: leave the segment and the counters alone. */
	if (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST | TCPCB_SACKED_ACKED))
		return;

	tcp_verify_retransmit_hint(tp, skb);
	tp->lost_out += tcp_skb_pcount(skb);
	TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
}