tcp: refactor cwnd during SACK transmissions to allow TSO

Refactoring of cwnd and moving the adjustment for SACKed data into
tcp_output() - cwnd tracking the maximum extent starting at snd_una -
allows both SACK loss recovery as well as SACK transmissions after
RTO during slow start and if allowed, the use of TSO while in loss
recovery.

Reviewed By:		tuexen, cc, #transport
Sponsored by:		NetApp, Inc.
Differential Revision:	https://reviews.freebsd.org/D43470
This commit is contained in:
Richard Scheffenegger 2024-10-29 18:47:06 +01:00
parent 72ae04c733
commit 7dc78150c7
3 changed files with 97 additions and 74 deletions

View file

@ -2653,13 +2653,14 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tcp_do_prr_ack(tp, th, &to,
sack_changed, &maxseg);
} else if (tcp_is_sack_recovery(tp, &to) &&
IN_FASTRECOVERY(tp->t_flags)) {
IN_FASTRECOVERY(tp->t_flags) &&
(tp->snd_nxt == tp->snd_max)) {
int awnd;
/*
* Compute the amount of data in flight first.
* We can inject new data into the pipe iff
* we have less than 1/2 the original window's
* we have less than ssthresh
* worth of data in flight.
*/
if (V_tcp_do_newsack) {
@ -2669,10 +2670,18 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
tp->sackhint.sack_bytes_rexmit;
}
if (awnd < tp->snd_ssthresh) {
tp->snd_cwnd += maxseg;
tp->snd_cwnd += imax(maxseg,
imin(2 * maxseg,
tp->sackhint.delivered_data));
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
}
} else if (tcp_is_sack_recovery(tp, &to) &&
IN_FASTRECOVERY(tp->t_flags) &&
SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tp->snd_cwnd += imax(maxseg,
imin(2 * maxseg,
tp->sackhint.delivered_data));
} else {
tp->snd_cwnd += maxseg;
}
@ -2696,14 +2705,13 @@ enter_recovery:
tcp_seq onxt = tp->snd_nxt;
/*
* If we're doing sack, or prr, check
* to see if we're already in sack
* If we're doing sack, check to
* see if we're already in sack
* recovery. If we're not doing sack,
* check to see if we're in newreno
* recovery.
*/
if (V_tcp_do_prr ||
(tp->t_flags & TF_SACK_PERMIT)) {
if (tcp_is_sack_recovery(tp, &to)) {
if (IN_FASTRECOVERY(tp->t_flags)) {
tp->t_dupacks = 0;
break;
@ -2723,29 +2731,40 @@ enter_recovery:
tp->t_rtttime = 0;
if (V_tcp_do_prr) {
/*
* snd_ssthresh is already updated by
* cc_cong_signal.
* snd_ssthresh and snd_recover are
* already updated by cc_cong_signal.
*/
if (tcp_is_sack_recovery(tp, &to)) {
/*
* Exclude Limited Transmit
* Include Limited Transmit
* segments here
*/
tp->sackhint.prr_delivered =
maxseg;
imin(tp->snd_max - th->th_ack,
(tp->snd_limited + 1) * maxseg);
} else {
tp->sackhint.prr_delivered =
imin(tp->snd_max - tp->snd_una,
imin(INT_MAX / 65536,
tp->t_dupacks) * maxseg);
maxseg;
}
tp->sackhint.recover_fs = max(1,
tp->snd_nxt - tp->snd_una);
}
tp->snd_limited = 0;
if (tcp_is_sack_recovery(tp, &to)) {
TCPSTAT_INC(tcps_sack_recovery_episode);
tp->snd_cwnd = maxseg;
/*
* When entering LR after RTO due to
* Duplicate ACKs, retransmit existing
* holes from the scoreboard.
*/
tcp_resend_sackholes(tp);
/* Avoid inflating cwnd in tcp_output */
tp->snd_nxt = tp->snd_max;
tp->snd_cwnd = tcp_compute_pipe(tp) +
maxseg;
(void) tcp_output(tp);
/* Set cwnd to the expected flightsize */
tp->snd_cwnd = tp->snd_ssthresh;
if (SEQ_GT(th->th_ack, tp->snd_una)) {
goto resume_partialack;
}
@ -2790,7 +2809,8 @@ enter_recovery:
(tp->t_rxtshift == 0))
tp->snd_cwnd =
SEQ_SUB(tp->snd_nxt,
tp->snd_una);
tp->snd_una) -
tcp_sack_adjust(tp);
tp->snd_cwnd +=
(tp->t_dupacks - tp->snd_limited) *
maxseg;
@ -3049,9 +3069,8 @@ process_ACK:
SEQ_GEQ(th->th_ack, tp->snd_recover)) {
cc_post_recovery(tp, th);
}
if (tp->t_flags & TF_SACK_PERMIT) {
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
tp->snd_recover = tp->snd_una;
}
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@ -4138,9 +4157,7 @@ tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
*/
if (IN_FASTRECOVERY(tp->t_flags)) {
if (tcp_is_sack_recovery(tp, to)) {
tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
tp->sackhint.sack_bytes_rexmit +
(snd_cnt * maxseg);
tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
} else {
tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
(snd_cnt * maxseg);
@ -4168,17 +4185,19 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond acknowledged offset.
* (tp->snd_una has not yet been updated when this function is called.)
*/
tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
if (IN_FASTRECOVERY(tp->t_flags)) {
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond acknowledged offset.
* (tp->snd_una has not yet been updated when this function is called.)
*/
tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
}
/*
* Partial window deflation. Relies on fact that tp->snd_una
* not updated yet.

View file

@ -266,8 +266,10 @@ again:
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_LT(tp->snd_nxt, tp->snd_max))
(tp->sackhint.nexthole != NULL) &&
!IN_FASTRECOVERY(tp->t_flags)) {
sendwin = tcp_sack_adjust(tp);
}
sendalot = 0;
tso = 0;
mtu = 0;
@ -292,10 +294,13 @@ again:
if ((tp->t_flags & TF_SACK_PERMIT) &&
(IN_FASTRECOVERY(tp->t_flags) || SEQ_LT(tp->snd_nxt, tp->snd_max)) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
uint32_t cwin;
int32_t cwin;
cwin =
imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0);
if (IN_FASTRECOVERY(tp->t_flags)) {
cwin = imax(sendwin - tcp_compute_pipe(tp), 0);
} else {
cwin = imax(sendwin - off, 0);
}
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
@ -314,19 +319,34 @@ again:
goto after_sack_rexmit;
} else {
/* Can rexmit part of the current hole */
len = ((int32_t)ulmin(cwin,
SEQ_SUB(tp->snd_recover, p->rxmit)));
len = SEQ_SUB(tp->snd_recover, p->rxmit);
if (cwin <= len) {
len = cwin;
} else {
sendalot = 1;
}
}
} else {
len = ((int32_t)ulmin(cwin,
SEQ_SUB(p->end, p->rxmit)));
len = SEQ_SUB(p->end, p->rxmit);
if (cwin <= len) {
len = cwin;
} else {
sendalot = 1;
}
}
/* we could have transmitted from the scoreboard,
* but sendwin (expected flightsize) - pipe didn't
* allow any transmission.
* Bypass recalculating the possible transmission
* length further down by setting sack_rxmit.
* Wouldn't be here if there would have been
* nothing in the scoreboard to transmit.
*/
sack_rxmit = 1;
if (len > 0) {
off = SEQ_SUB(p->rxmit, tp->snd_una);
KASSERT(off >= 0,("%s: sack block to the left of una : %d",
__func__, off));
sack_rxmit = 1;
sendalot = 1;
}
}
after_sack_rexmit:
@ -390,34 +410,15 @@ after_sack_rexmit:
*/
if (sack_rxmit == 0) {
if ((sack_bytes_rxmt == 0) || SEQ_LT(tp->snd_nxt, tp->snd_max)) {
len = ((int32_t)min(sbavail(&so->so_snd), sendwin) -
off);
len = imin(sbavail(&so->so_snd), sendwin) - off;
} else {
int32_t cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) -
off);
/*
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd - imax(0, (int32_t)
(tp->snd_nxt - tp->snd_recover)) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
len = imin(len, cwin);
}
len = imin(sbavail(&so->so_snd) - off,
sendwin - tcp_compute_pipe(tp));
}
}
@ -1647,6 +1648,10 @@ timer:
if ((error == 0) &&
sack_rxmit &&
SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) {
/*
* When transmitting from SACK scoreboard
* after an RTO, pull snd_nxt along.
*/
tp->snd_nxt = SEQ_MIN(p->rxmit, p->end);
}
if (error) {

View file

@ -961,16 +961,15 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp)
/* Send one or 2 segments based on how much new data was acked. */
if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2)
num_segs = 2;
if (V_tcp_do_newsack) {
tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack +
tp->sackhint.sack_bytes_rexmit -
tp->sackhint.sacked_bytes -
tp->sackhint.lost_bytes, maxseg) +
num_segs * maxseg;
} else {
if (tp->snd_nxt == tp->snd_max) {
tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
imax(0, tp->snd_nxt - tp->snd_recover) +
num_segs * maxseg);
(tp->snd_nxt - tp->snd_recover) + num_segs * maxseg);
} else {
/*
* Since cwnd is not the expected flightsize during
* SACK LR, not deflating cwnd allows the partial
* ACKed amount to be sent.
*/
}
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;