/*
 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include "tcp_includes.h"

/*
 * This file implements a less-than-best-effort (LBE) congestion control
 * algorithm that computes the receive window of a background transport,
 * using the same algorithm as LEDBAT++.
 */

#define GAIN_CONSTANT (16)
#define TCP_BASE_RTT_INTERVAL (60 * TCP_RETRANSHZ)

void tcp_rledbat_init(struct tcpcb *tp);
void tcp_rledbat_cleanup(struct tcpcb *tp);
void tcp_rledbat_rwnd_init(struct tcpcb *tp);
void tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len);
uint32_t tcp_rledbat_get_rlwin(struct tcpcb *tp);
void tcp_rledbat_after_idle(struct tcpcb *tp);
void tcp_rledbat_switch_to(struct tcpcb *tp);
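
/*
 * Receive-side congestion control callback table for rLedbat. The TCP
 * input path presumably drives these hooks: data_rcvd for each received
 * in-sequence segment and get_rlwin when computing the window to
 * advertise.
 */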
struct tcp_rcv_cc_algo tcp_cc_rledbat = {
    .name = "rledbat",
    .init = tcp_rledbat_init,
    .cleanup = tcp_rledbat_cleanup,
    .rwnd_init = tcp_rledbat_rwnd_init,
    .data_rcvd = tcp_rledbat_data_rcvd,
    .get_rlwin = tcp_rledbat_get_rlwin,
    .after_idle = tcp_rledbat_after_idle,
    .switch_to = tcp_rledbat_switch_to,
};
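
/*
 * Reset the rLedbat bookkeeping; used from init, after an idle period,
 * and when switching the receive-side algorithm to rLedbat.
 */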
static inline void
rledbat_clear_state(struct tcpcb *tp)
{
    tp->t_rlstate.num_slowdown_events = 0;
    tp->t_rlstate.slowdown_ts = 0;
    tp->t_rlstate.slowdown_begin = 0;
    tp->t_rlstate.rcvd_bytes = 0;
    tp->t_rlstate.md_rcvd_bytes = 0;
    tp->t_rlstate.drained_bytes = 0;
}

void
tcp_rledbat_init(struct tcpcb *tp)
{
    os_atomic_inc(&tcp_cc_rledbat.num_sockets, relaxed);
    rledbat_clear_state(tp);

    tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
    tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
}

void
tcp_rledbat_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
    os_atomic_dec(&tcp_cc_rledbat.num_sockets, relaxed);
}

/*
 * Initialize the receive window for a connection
 */
void
tcp_rledbat_rwnd_init(struct tcpcb *tp)
{
    tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;

    /* If the ssthresh hasn't been set, do it now */
    if (tp->t_rlstate.ssthresh == 0) {
        tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    }
}

/*
 * Compute the gain divisor used during slow start:
 * MIN(16, ceil(2 * TARGET / base))
 */
static uint32_t
rledbat_gain(uint32_t base_rtt)
{
    return MIN(GAIN_CONSTANT, tcp_ceil(2 * target_qdelay /
        (double)base_rtt));
}
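
/*
 * Worked example with assumed values: target_qdelay = 100 ms and
 * base_rtt = 50 ms give MIN(16, ceil(2 * 100 / 50)) = 4, so the slow
 * start below adds one byte to the window for every 4 bytes received,
 * i.e. roughly one MSS per 4 * MSS of new data.
 */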

/*
 * Congestion avoidance for LEDBAT++
 */
static void
rledbat_congestion_avd(struct tcpcb *tp, uint32_t segment_len,
    uint32_t base_rtt, uint32_t curr_rtt, uint32_t now)
{
    uint32_t update = 0;
    /*
     * Set the next slowdown time, i.e. 9 times the duration of the
     * previous slowdown, except for the initial slowdown.
     *
     * Updated: we now slow down once every 60s, based on our
     * base-RTT interval.
     */
    if (tp->t_rlstate.slowdown_ts == 0) {
        uint32_t slowdown_duration = TCP_BASE_RTT_INTERVAL;
        if (tp->t_rlstate.num_slowdown_events > 0) {
            if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
                /*
                 * Special case for slowdowns (other than the initial
                 * one) where the window doesn't recover fully to the
                 * previous ssthresh
                 */
                slowdown_duration *= 2;
            }
        }
        tp->t_rlstate.slowdown_ts = now + slowdown_duration;

        /* Reset the start */
        tp->t_rlstate.slowdown_begin = 0;

        /* On exiting slow start due to higher qdelay, cap the ssthresh */
        if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
            tp->t_rlstate.ssthresh = tp->t_rlstate.win;
        }
    }
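
    /*
     * Classify the RTT sample: at or below base RTT plus the target
     * queuing delay we do additive increase, above it multiplicative
     * decrease.
     */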
    if (curr_rtt <= base_rtt + (uint32_t)target_qdelay) {
        /* Additive increase */
        tp->t_rlstate.rcvd_bytes += segment_len;
        if (tp->t_rlstate.rcvd_bytes >= tp->t_rlstate.win) {
            update = tp->t_maxseg;
            tp->t_rlstate.rcvd_bytes -= tp->t_rlstate.win;
            /*
             * Move the background slow-start threshold to the current
             * window so that the next time (after some idle period),
             * we can attempt slow start up to here if there is no
             * increase in rtt
             */
            if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
                tp->t_rlstate.ssthresh = tp->t_rlstate.win;
            }
            tp->t_rlstate.win += update;
            tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win, tp->t_maxseg),
                TCP_MAXWIN << tp->rcv_scale);
        }
    } else {
        /*
         * If we are still within 1 RTT of the previous reduction
         * due to loss, do nothing
         */
        if (now < tp->t_rlstate.reduction_end) {
            return;
        }
        /*
         * Multiplicative decrease:
         * W -= min(W * (qdelay/target - 1), W/2) (per RTT)
         * Applied per received bytes, this becomes
         * W -= min((qdelay/target - 1), 1/2) * bytes_rcvd
         */
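        /*
         * Illustrative numbers: if the measured qdelay is 1.25x the
         * target, the factor is min(0.25, 0.5) = 0.25, so the window
         * shrinks by a quarter each time a full window's worth of
         * bytes has been received.
         */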
        uint32_t qdelay = curr_rtt > base_rtt ?
            (curr_rtt - base_rtt) : 0;

        tp->t_rlstate.md_rcvd_bytes += segment_len;
        if (tp->t_rlstate.md_rcvd_bytes >= tp->t_rlstate.win) {
            update = (uint32_t)(MIN(((double)qdelay / target_qdelay - 1), 0.5) *
                (double)tp->t_rlstate.win);
            tp->t_rlstate.md_rcvd_bytes -= tp->t_rlstate.win;
            tp->t_rlstate.win -= update;

            if (tp->t_rlstate.win < bg_ss_fltsz * tp->t_maxseg) {
                tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
            }

            tp->t_rlstate.win = tcp_round_to(tp->t_rlstate.win, tp->t_maxseg);
            /*
             * Lower the background slow-start threshold so that the
             * connection will stay in the congestion avoidance phase
             */
            if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
                tp->t_rlstate.ssthresh = tp->t_rlstate.win;
            }

            if (tp->t_rlstate.slowdown_ts != 0) {
                /* As the window has been reduced, defer the slowdown. */
                tp->t_rlstate.slowdown_ts = now + TCP_BASE_RTT_INTERVAL;
            }
        }
    }
}

/*
 * Update the window based on the LEDBAT++ algorithm
 */
void
tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len)
{
    uint32_t update = 0;
    const uint32_t base_rtt = get_base_rtt(tp);
    const uint32_t curr_rtt = tcp_use_min_curr_rtt ? tp->curr_rtt_min :
        tp->t_rttcur;
    const uint32_t srtt = tp->rcv_srtt >> TCP_RTT_SHIFT;
    const uint32_t ss_target = (uint32_t)(3 * target_qdelay / 4);
    tp->t_rlstate.drained_bytes += segment_len;
    struct tcp_globals *globals = tcp_get_globals(tp);

    /*
     * Slowdown period - the first slowdown is 2 RTT after we exit the
     * initial slow start. Each subsequent slowdown comes after 9 times
     * the duration of the previous one.
     *
     * Updated: slowdown periods now occur once every 60s, unless
     * they are deferred.
     */
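    /*
     * During a slowdown the window is pinned to the initial size for
     * 2 * srtt; the likely intent (per the LEDBAT++ design) is to let
     * self-induced queuing drain so the base-RTT estimate can refresh.
     */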
    if (tp->t_rlstate.slowdown_ts != 0 &&
        tcp_globals_now(globals) >= tp->t_rlstate.slowdown_ts) {
        if (tp->t_rlstate.slowdown_begin == 0) {
            tp->t_rlstate.slowdown_begin = tcp_globals_now(globals);
            tp->t_rlstate.num_slowdown_events++;
        }
        if (tcp_globals_now(globals) < tp->t_rlstate.slowdown_ts + (2 * srtt)) {
            /* Set rwnd to 2 packets and return */
            if (tp->t_rlstate.win > bg_ss_fltsz * tp->t_maxseg) {
                if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
                    tp->t_rlstate.ssthresh = tp->t_rlstate.win;
                }
                tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
                /* Reset total bytes received */
                tp->t_rlstate.rcvd_bytes = 0;
            }
            return;
        }
    }

    /*
     * Detect retransmissions first, by checking whether the received
     * sequence is below the largest seen while its timestamp is higher
     * than the largest so far: an out-of-order original transmission
     * would carry an older timestamp, so a newer one indicates a
     * retransmission rather than reordering. Reduce the window based
     * on fast recovery only once per effective RTT.
     *
     * Note: as we are detecting retransmissions (not packet loss),
     * we give some leeway before the next window reduction.
     */
    if (SEQ_LT(th->th_seq + segment_len, tp->rcv_high) &&
        TSTMP_GEQ(to->to_tsval, tp->tsv_high)) {
        if (tcp_globals_now(globals) < tp->t_rlstate.reduction_end) {
            /* still need to wait for the reduction end to elapse */
            return;
        }

        uint32_t win = tp->t_rlstate.win / 2;
        win = tcp_round_to(win, tp->t_maxseg);
        if (win < 2 * tp->t_maxseg) {
            win = 2 * tp->t_maxseg;
        }
        tp->t_rlstate.ssthresh = win;
        tp->t_rlstate.win = win;

        /* Reset the received bytes */
        tp->t_rlstate.rcvd_bytes = 0;
        tp->t_rlstate.md_rcvd_bytes = 0;

        /* Update the reduction end time */
        tp->t_rlstate.reduction_end = tcp_globals_now(globals) + 2 * srtt;

        if (tp->t_rlstate.slowdown_ts != 0) {
            /* As the window has been halved, defer the slowdown. */
            tp->t_rlstate.slowdown_ts = tcp_globals_now(globals) +
                TCP_BASE_RTT_INTERVAL;
        }
        return;
    }

    /* Now we can do slow start or CA */
    if (curr_rtt == 0 || base_rtt == 0) {
        /* No RTT samples yet: grow by the received bytes, capped at
         * the initial-window size */
        update = MIN(segment_len, TCP_CC_CWND_INIT_PKTS *
            tp->t_maxseg);
        tp->t_rlstate.win += update;
        tp->t_rlstate.win = min(tp->t_rlstate.win,
            TCP_MAXWIN << tp->rcv_scale);
    } else if (tp->t_rlstate.win < tp->t_rlstate.ssthresh &&
        ((tp->t_rlstate.num_slowdown_events > 0 &&
        curr_rtt <= (base_rtt + ((uint32_t)target_qdelay << 1))) ||
        curr_rtt <= (base_rtt + ss_target))) {
        /*
         * Modified slow start, with a dynamic GAIN.
         * If the queuing delay is larger than 3/4 of the target
         * delay, exit slow start, but only if this is the initial
         * slow start. After the initial slow start, during CA,
         * window growth will be bounded by ssthresh.
         *
         * We enter slow start again only after a slowdown event,
         * and in that case we want to allow the window to grow. The
         * check against twice target_qdelay is only a safety net in
         * case the queuing delay increases more than twofold.
         */
        tp->t_rlstate.rcvd_bytes += segment_len;
        uint32_t gain_factor = rledbat_gain(base_rtt);
        if (tp->t_rlstate.rcvd_bytes >= tp->t_maxseg * gain_factor) {
            update = MIN(tp->t_rlstate.rcvd_bytes / gain_factor,
                TCP_CC_CWND_INIT_PKTS * tp->t_maxseg);
            tp->t_rlstate.rcvd_bytes = 0;
            tp->t_rlstate.win += update;
            tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win, tp->t_maxseg),
                TCP_MAXWIN << tp->rcv_scale);
        }

        /* Reset the next slowdown timestamp */
        if (tp->t_rlstate.slowdown_ts != 0) {
            tp->t_rlstate.slowdown_ts = 0;
        }
    } else {
        /* Congestion avoidance */
        rledbat_congestion_avd(tp, segment_len, base_rtt, curr_rtt,
            tcp_globals_now(globals));
    }
}
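
/*
 * Return the window to advertise. The internal rLedbat window can drop
 * sharply, but the advertised window's right edge should not move left
 * (TCP discourages shrinking the offered window). Since the edge
 * advances with every byte received, shrinking win_ws by at most the
 * bytes received since the last call keeps the edge from retreating.
 */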
uint32_t
tcp_rledbat_get_rlwin(struct tcpcb *tp)
{
    /* rlwin is either greater, or smaller by at most the drained bytes */
    if (tp->t_rlstate.win > tp->t_rlstate.win_ws ||
        tp->t_rlstate.win_ws - tp->t_rlstate.win < tp->t_rlstate.drained_bytes) {
        tp->t_rlstate.win_ws = tp->t_rlstate.win;
    } else if (tp->t_rlstate.win < tp->t_rlstate.win_ws) {
        /*
         * rlwin is smaller, decrease the advertised window
         * only by the drained bytes at a time
         */
        tp->t_rlstate.win_ws = tp->t_rlstate.win_ws -
            tp->t_rlstate.drained_bytes;
    }
    tp->t_rlstate.drained_bytes = 0;
    /* Round up to the receive window scale */
    tp->t_rlstate.win_ws = tcp_round_up(tp->t_rlstate.win_ws, 1 << tp->rcv_scale);

    return tp->t_rlstate.win_ws;
}

/*
 * Function to handle connections that have been idle for
 * some time. Slow start to get the ack "clock" running again.
 * Clear the base history after the idle time.
 */
void
tcp_rledbat_after_idle(struct tcpcb *tp)
{
    rledbat_clear_state(tp);
    /* Reset the rledbat window */
    tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
}
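
/*
 * Switch an existing connection over to rLedbat. Start from about half
 * of the previously used window; the algorithm will quickly reduce it
 * further if queuing delay remains high.
 */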
void
tcp_rledbat_switch_to(struct tcpcb *tp)
{
    rledbat_clear_state(tp);
    uint32_t recwin = 0;

    if (tp->t_rlstate.win == 0) {
        /*
         * Use half of the previous window; the algorithm
         * will quickly reduce the window if there is still
         * a high queueing delay.
         */
        int32_t win = tcp_sbspace(tp);
        if (win < 0) {
            win = 0;
        }

        recwin = MAX(win, (int)(tp->rcv_adv - tp->rcv_nxt));
        recwin = recwin / 2;
    } else {
        /*
         * Reduce the window to half of its previous value,
         * but make it at least 64K
         */
        recwin = MAX(tp->t_rlstate.win / 2, TCP_MAXWIN);
    }

    recwin = tcp_round_to(recwin, tp->t_maxseg);
    if (recwin < bg_ss_fltsz * tp->t_maxseg) {
        recwin = bg_ss_fltsz * tp->t_maxseg;
    }
    tp->t_rlstate.win = recwin;

    /* ssthresh should be at most the initial value */
    if (tp->t_rlstate.ssthresh == 0) {
        tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    } else {
        tp->t_rlstate.ssthresh = MIN(tp->t_rlstate.ssthresh,
            TCP_MAXWIN << TCP_MAX_WINSHIFT);
    }
}