/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
#include
#include
#include

#define FQ_CODEL_DEFAULT_QUANTUM 1500

#define FQ_CODEL_QUANTUM_BK_SYS(_q)     (_q)
#define FQ_CODEL_QUANTUM_BK(_q)         (_q)
#define FQ_CODEL_QUANTUM_BE(_q)         (_q)
#define FQ_CODEL_QUANTUM_RD(_q)         (_q)
#define FQ_CODEL_QUANTUM_OAM(_q)        (_q)
#define FQ_CODEL_QUANTUM_AV(_q)         (_q * 2)
#define FQ_CODEL_QUANTUM_RV(_q)         (_q * 2)
#define FQ_CODEL_QUANTUM_VI(_q)         (_q * 2)
#define FQ_CODEL_QUANTUM_VO(_q)         ((_q * 2) / 5)
#define FQ_CODEL_QUANTUM_CTL(_q)        ((_q * 2) / 5)

static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);

SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, fq_enable_pacing,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_enable_pacing, 0, "Enable pacing");

static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;

#if (DEVELOPMENT || DEBUG)
SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fq_empty_purge_delay,
    "Empty flow queue purge delay (ns)");
#endif /* DEVELOPMENT || DEBUG */

unsigned int ifclassq_enable_pacing = 1;

typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;

static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
static void fq_if_destroy(fq_if_t *fqs);
static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
    uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t, int64_t,
    classq_pkt_t *, classq_pkt_t *, uint32_t *, uint32_t *,
    flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*);
void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
static void fq_if_purge(fq_if_t *);
static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
    uint64_t);
static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
    fq_t *fq, uint64_t now);
static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
static void
fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all); static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now); static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx); static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now); static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp); static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx); static void fq_if_destroy_grps(fq_if_t *fqs); uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = { [FQ_IF_CTL_INDEX] = 8, [FQ_IF_VO_INDEX] = 8, [FQ_IF_VI_INDEX] = 6, [FQ_IF_RV_INDEX] = 6, [FQ_IF_AV_INDEX] = 6, [FQ_IF_OAM_INDEX] = 4, [FQ_IF_RD_INDEX] = 4, [FQ_IF_BE_INDEX] = 4, [FQ_IF_BK_INDEX] = 2, [FQ_IF_BK_SYS_INDEX] = 2, }; #define FQ_CODEL_DRR_MAX(_s) fq_codel_drr_max_values[FQ_IF_##_s##_INDEX] static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state); static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state); static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state); static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state, fq_if_group_t **selected_grp); static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state); static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state); static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state); static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state); static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state, fq_if_group_t **selected_grp); static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state); bitmap_ops_t fq_if_grps_bitmap_ops = { .ffs = fq_if_grps_bitmap_ffs, .zeros = fq_if_grps_bitmap_zeros, .cpy = fq_if_grps_bitmap_cpy, .clr = fq_if_grps_bitmap_clr, .move = fq_if_grps_bitmap_move, }; bitmap_ops_t fq_if_grps_sc_bitmap_ops = { .ffs = fq_if_grps_sc_bitmap_ffs, .zeros = fq_if_grps_sc_bitmap_zeros, .cpy = fq_if_grps_sc_bitmap_cpy, .clr = fq_if_grps_sc_bitmap_clr, .move = fq_if_grps_sc_bitmap_move, }; void pktsched_fq_init(void) { PE_parse_boot_argn("ifclassq_enable_pacing", &ifclassq_enable_pacing, sizeof(ifclassq_enable_pacing)); // format looks like ifcq_drr_max=8,8,6 char buf[(FQ_IF_MAX_CLASSES) * 3]; size_t i, len, pri_index = 0; uint32_t drr = 0; if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) { return; } len = strlen(buf); for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) { if (buf[i] != ',' && buf[i] != '\0') { VERIFY(buf[i] >= '0' && buf[i] <= '9'); drr = drr * 10 + buf[i] - '0'; continue; } fq_codel_drr_max_values[pri_index] = drr; pri_index += 1; drr = 0; } } #define FQ_IF_FLOW_HASH_ID(_flowid_) \ (((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK) #define FQ_IF_CLASSQ_IDLE(_fcl_) \ (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \ STAILQ_EMPTY(&(_fcl_)->fcl_old_flows)) typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *); typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *, int64_t, uint32_t, 
classq_pkt_t *, classq_pkt_t *, uint32_t *, uint32_t *, boolean_t *, uint64_t); static void fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt) { pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf; } static inline uint64_t fq_codel_get_time(void) { struct timespec ts; uint64_t now; nanouptime(&ts); now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec; return now; } #if SKYWALK static void fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt) { pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt; } #endif /* SKYWALK */ #if SKYWALK static boolean_t fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head, classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt, boolean_t *qempty, uint64_t now) { uint32_t plen; pktsched_pkt_t pkt; boolean_t limit_reached = FALSE; struct ifclassq *ifq = fqs->fqs_ifq; struct ifnet *ifp = ifq->ifcq_ifp; /* * Assert to make sure pflags is part of PKT_F_COMMON_MASK; * all common flags need to be declared in that mask. */ while (fq->fq_deficit > 0 && limit_reached == FALSE && !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) { _PKTSCHED_PKT_INIT(&pkt); fq_getq_flow(fqs, fq, &pkt, now); ASSERT(pkt.pktsched_ptype == QP_PACKET); plen = pktsched_get_pkt_len(&pkt); fq->fq_deficit -= plen; if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) { pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW; fq->fq_flags &= ~FQF_FRESH_FLOW; } if (head->cp_kpkt == NULL) { *head = pkt.pktsched_pkt; } else { ASSERT(tail->cp_kpkt != NULL); ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL); tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt; } *tail = pkt.pktsched_pkt; tail->cp_kpkt->pkt_nextpkt = NULL; fq_cl->fcl_stat.fcl_dequeue++; fq_cl->fcl_stat.fcl_dequeue_bytes += plen; *pkt_cnt += 1; *byte_cnt += plen; ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt); /* Check if the limit is reached */ if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) { limit_reached = TRUE; } } KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay); *qempty = KPKTQ_EMPTY(&fq->fq_kpktq); return limit_reached; } #endif /* SKYWALK */ static boolean_t fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head, classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt, boolean_t *qempty, uint64_t now) { u_int32_t plen; pktsched_pkt_t pkt; boolean_t limit_reached = FALSE; struct ifclassq *ifq = fqs->fqs_ifq; struct ifnet *ifp = ifq->ifcq_ifp; while (fq->fq_deficit > 0 && limit_reached == FALSE && !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) { _PKTSCHED_PKT_INIT(&pkt); fq_getq_flow(fqs, fq, &pkt, now); ASSERT(pkt.pktsched_ptype == QP_MBUF); plen = pktsched_get_pkt_len(&pkt); fq->fq_deficit -= plen; if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) { pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW; fq->fq_flags &= ~FQF_FRESH_FLOW; } if (head->cp_mbuf == NULL) { *head = pkt.pktsched_pkt; } else { ASSERT(tail->cp_mbuf != NULL); ASSERT(tail->cp_mbuf->m_nextpkt == NULL); tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf; } *tail = pkt.pktsched_pkt; tail->cp_mbuf->m_nextpkt = NULL; fq_cl->fcl_stat.fcl_dequeue++; fq_cl->fcl_stat.fcl_dequeue_bytes += plen; *pkt_cnt += 1; *byte_cnt += plen; ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt); /* Check if the limit is reached */ if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) { limit_reached = TRUE; } } 
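	/*
	 * At this point the per-flow dequeue loop above has stopped for one
	 * of four reasons: the flow's DRR deficit is spent, the caller's
	 * packet/byte limit was reached, the flow queue drained, or the
	 * flow's next transmit time (pacing) has not arrived yet. The trace
	 * below records the flow's state at that boundary.
	 */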
KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay); *qempty = MBUFQ_EMPTY(&fq->fq_mbufq); return limit_reached; } static void fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1) { #pragma unused(arg1) struct ifnet* ifp = (struct ifnet*)arg0; ASSERT(ifp != NULL); ifnet_start_ignore_delay(ifp); } fq_if_t * fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype) { fq_if_t *fqs; ASSERT(ifq->ifcq_ifp != NULL); fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO); fqs->fqs_ifq = ifq; fqs->fqs_ptype = ptype; /* Configure packet drop limit across all queues */ fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq); STAILQ_INIT(&fqs->fqs_fclist); TAILQ_INIT(&fqs->fqs_empty_list); TAILQ_INIT(&fqs->fqs_combined_grp_list); fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall, (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); ASSERT(fqs->fqs_pacemaker_tcall != NULL); return fqs; } void fq_if_destroy(fq_if_t *fqs) { struct ifnet *ifp = fqs->fqs_ifq->ifcq_ifp; thread_call_t tcall = fqs->fqs_pacemaker_tcall; VERIFY(ifp != NULL); ASSERT(tcall != NULL); IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq); LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED); IFCQ_CONVERT_LOCK(fqs->fqs_ifq); /* * Since we are holding the IFCQ lock here, another thread cannot enter AQM * and schedule a pacemaker call. So we do not need a sleep wait loop here * cancel wait and free should succeed in one call. */ thread_call_cancel_wait(tcall); ASSERT(thread_call_free(tcall)); fq_if_purge(fqs); fq_if_destroy_grps(fqs); fqs->fqs_ifq = NULL; zfree(fq_if_zone, fqs); } static inline uint8_t fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc) { uint8_t pri; if (fqs->fqs_flags & FQS_DRIVER_MANAGED) { switch (svc) { case MBUF_SC_BK_SYS: case MBUF_SC_BK: pri = FQ_IF_BK_INDEX; break; case MBUF_SC_BE: case MBUF_SC_RD: case MBUF_SC_OAM: pri = FQ_IF_BE_INDEX; break; case MBUF_SC_AV: case MBUF_SC_RV: case MBUF_SC_VI: case MBUF_SC_SIG: pri = FQ_IF_VI_INDEX; break; case MBUF_SC_VO: case MBUF_SC_CTL: pri = FQ_IF_VO_INDEX; break; default: pri = FQ_IF_BE_INDEX; /* Use best effort by default */ break; } return pri; } /* scheduler is not managed by the driver */ switch (svc) { case MBUF_SC_BK_SYS: pri = FQ_IF_BK_SYS_INDEX; break; case MBUF_SC_BK: pri = FQ_IF_BK_INDEX; break; case MBUF_SC_BE: pri = FQ_IF_BE_INDEX; break; case MBUF_SC_RD: pri = FQ_IF_RD_INDEX; break; case MBUF_SC_OAM: pri = FQ_IF_OAM_INDEX; break; case MBUF_SC_AV: pri = FQ_IF_AV_INDEX; break; case MBUF_SC_RV: pri = FQ_IF_RV_INDEX; break; case MBUF_SC_VI: pri = FQ_IF_VI_INDEX; break; case MBUF_SC_SIG: pri = FQ_IF_SIG_INDEX; break; case MBUF_SC_VO: pri = FQ_IF_VO_INDEX; break; case MBUF_SC_CTL: pri = FQ_IF_CTL_INDEX; break; default: pri = FQ_IF_BE_INDEX; /* Use best effort by default */ break; } return pri; } void fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum, uint32_t drr_max, uint32_t svc_class) { fq_if_classq_t *fq_cl; VERIFY(pri < FQ_IF_MAX_CLASSES); fq_cl = &fqg->fqg_classq[pri]; VERIFY(fq_cl->fcl_quantum == 0); VERIFY(quantum != 0); fq_cl->fcl_quantum = quantum; fq_cl->fcl_pri = pri; fq_cl->fcl_drr_max = drr_max; fq_cl->fcl_service_class = svc_class; fq_cl->fcl_next_tx_time = 0; fq_cl->fcl_flags = 0; STAILQ_INIT(&fq_cl->fcl_new_flows); STAILQ_INIT(&fq_cl->fcl_old_flows); } int fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t 
*pdrop) { uint8_t pri, grp_idx = 0; fq_if_t *fqs; fq_if_classq_t *fq_cl; fq_if_group_t *fq_group; int ret; mbuf_svc_class_t svc; pktsched_pkt_t pkt; pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes); fqs = (fq_if_t *)ifq->ifcq_disc; svc = pktsched_get_pkt_svc(&pkt); #if SKYWALK if (head->cp_ptype == QP_PACKET) { grp_idx = head->cp_kpkt->pkt_qset_idx; } #endif /* SKYWALK */ pri = fq_if_service_to_priority(fqs, svc); VERIFY(pri < FQ_IF_MAX_CLASSES); IFCQ_LOCK_SPIN(ifq); fq_group = fq_if_find_grp(fqs, grp_idx); fq_cl = &fq_group->fqg_classq[pri]; if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) { IFCQ_UNLOCK(ifq); /* BK_SYS is currently throttled */ os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed); pktsched_free_pkt(&pkt); *pdrop = TRUE; ret = EQSUSPENDED; goto done; } ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype); ret = fq_addq(fqs, fq_group, &pkt, fq_cl); if (!FQ_IF_CLASSQ_IDLE(fq_cl)) { if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) & (1 << pri)) == 0) { /* * this group is not in ER or EB groups, * mark it as IB */ pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]); } } if (__improbable(ret != 0)) { if (ret == CLASSQEQ_SUCCESS_FC) { /* packet enqueued, return advisory feedback */ ret = EQFULL; *pdrop = FALSE; } else if (ret == CLASSQEQ_COMPRESSED) { ret = 0; *pdrop = FALSE; } else { IFCQ_UNLOCK(ifq); *pdrop = TRUE; pktsched_free_pkt(&pkt); switch (ret) { case CLASSQEQ_DROP: ret = ENOBUFS; goto done; case CLASSQEQ_DROP_FC: ret = EQFULL; goto done; case CLASSQEQ_DROP_SP: ret = EQSUSPENDED; goto done; default: VERIFY(0); /* NOTREACHED */ __builtin_unreachable(); } /* NOTREACHED */ __builtin_unreachable(); } } else { *pdrop = FALSE; } IFCQ_ADD_LEN(ifq, cnt); IFCQ_INC_BYTES(ifq, bytes); FQS_GRP_ADD_LEN(fqs, grp_idx, cnt); FQS_GRP_INC_BYTES(fqs, grp_idx, bytes); IFCQ_UNLOCK(ifq); done: #if DEBUG || DEVELOPMENT if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) { ret = 0; } #endif /* DEBUG || DEVELOPMENT */ return ret; } void fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx) { (void) fq_if_dequeue_classq_multi(ifq, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx); } void fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc, classq_pkt_t *pkt, uint8_t grp_idx) { (void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx); } static inline void fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq) { ASSERT(fq->fq_dq_head.cp_mbuf == NULL); ASSERT(!fq->fq_in_dqlist); STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink); fq->fq_in_dqlist = true; } static inline void fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head, classq_pkt_t *tail, classq_pkt_type_t ptype) { ASSERT(fq->fq_in_dqlist); if (fq->fq_dq_head.cp_mbuf == NULL) { goto done; } if (head->cp_mbuf == NULL) { *head = fq->fq_dq_head; } else { ASSERT(tail->cp_mbuf != NULL); switch (ptype) { case QP_MBUF: ASSERT(tail->cp_mbuf->m_nextpkt == NULL); tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf; ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL); break; #if SKYWALK case QP_PACKET: ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL); tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt; ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL); break; #endif /* SKYWALK */ default: VERIFY(0); /* NOTREACHED */ __builtin_unreachable(); } } *tail = fq->fq_dq_tail; done: STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink); 
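	/*
	 * The flow's private dequeue chain has been spliced onto the
	 * caller's head/tail chain above; reset it below so the flow can be
	 * placed on a dequeue list again later.
	 */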
CLASSQ_PKT_INIT(&fq->fq_dq_head); CLASSQ_PKT_INIT(&fq->fq_dq_tail); fq->fq_in_dqlist = false; } static inline void fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head, classq_pkt_t *tail, classq_pkt_type_t ptype) { fq_t *fq, *tfq; STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) { fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype); } } static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state, fq_if_group_t **selected_grp) { #pragma unused(pri) fq_if_group_t *grp; uint32_t highest_pri = FQ_IF_MAX_CLASSES; int ret_pri = 0; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]); /* bitmap is empty in this case */ if (cur_pri == 0) { continue; } if (cur_pri <= highest_pri) { highest_pri = cur_pri; ret_pri = cur_pri; *selected_grp = grp; } } return ret_pri; } static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state) { #pragma unused(pri) fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { if (grp->fqg_bitmaps[state] != 0) { return FALSE; } } return TRUE; } static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state) { #pragma unused(pri) fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state]; } } static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state) { #pragma unused(pri) fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { grp->fqg_bitmaps[state] = 0; } } static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state) { #pragma unused(pri) fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state]; grp->fqg_bitmaps[src_state] = 0; } } static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state, fq_if_group_t **selected_grp) { fq_if_group_t *grp; int ret_pri = 0; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) { /* +1 to match the semantics of pktsched_ffs */ ret_pri = pri + 1; *selected_grp = grp; break; } } return ret_pri; } static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state) { fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) { return FALSE; } } return TRUE; } static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state) { fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state], &grp->fqg_bitmaps[src_state]); } } static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state) { fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]); } } static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state) { fq_if_group_t *grp; TAILQ_FOREACH(grp, grp_list, fqg_grp_link) { pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state], &grp->fqg_bitmaps[src_state]); pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]); } } /* * Pacemaker is only scheduled when no packet can be dequeued from AQM * due to pacing. Pacemaker will doorbell the driver when current >= next_tx_time. 
 * This only applies to L4S traffic at this moment.
 */
static void
fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
{
	uint64_t deadline = 0;

	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
		return;
	}
	ASSERT(next_tx_time != FQ_INVALID_TX_TS);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);
	ASSERT(now < next_tx_time);

	DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*,
	    fqs->fqs_ifq->ifcq_ifp, uint64_t, next_tx_time - now);
	KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index,
	    now, next_tx_time, next_tx_time - now);

	clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1,
	    &deadline);
	thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
}

static int
fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	uint32_t total_pktcnt = 0, total_bytecnt = 0;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_classq_t *fq_cl;
	fq_grp_tailq_t *grp_list, tmp_grp_list;
	fq_if_group_t *fq_grp = NULL;
	fq_if_t *fqs;
	uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
	int pri = 0, svc_pri = 0;
	bool all_paced = true;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	STAILQ_INIT(&fq_dqlist_head);

	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;
#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	now = fq_codel_get_time();

	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		svc_pri = fq_if_service_to_priority(fqs, svc);
	} else {
		VERIFY(svc == MBUF_SC_UNSPEC);
	}

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		grp_list = &fqs->fqs_combined_grp_list;
		VERIFY(!TAILQ_EMPTY(grp_list));
	} else {
		grp_list = &tmp_grp_list;
		fq_grp = fq_if_find_grp(fqs, grp_idx);
		TAILQ_INIT(grp_list);
		TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
	}

	for (;;) {
		uint32_t pktcnt = 0, bytecnt = 0;
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		bool fq_cl_all_paced = false;
		uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;

		if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
		    fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
			fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB,
			    FQ_IF_IB);
			fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
			if (fqs->grp_bitmaps_zeros(grp_list, svc_pri,
			    FQ_IF_EB)) {
				if (ifclassq_enable_pacing &&
				    ifclassq_enable_l4s) {
					/*
					 * Move fq_cl in IR back to ER, so that
					 * they will be inspected with priority
					 * the next time the driver dequeues
					 */
					fqs->grp_bitmaps_cpy(grp_list, svc_pri,
					    FQ_IF_ER, FQ_IF_IR);
					fqs->grp_bitmaps_clr(grp_list, svc_pri,
					    FQ_IF_IR);
				}
				break;
			}
		}
		pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER,
		    &fq_grp);
		if (pri == 0) {
			/*
			 * There are no ER flows, move the highest
			 * priority one from EB if there are any in that
			 * category
			 */
			pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri,
			    FQ_IF_EB, &fq_grp);
			VERIFY(pri > 0);
			VERIFY(fq_grp != NULL);
			pktsched_bit_clr((pri - 1),
			    &fq_grp->fqg_bitmaps[FQ_IF_EB]);
			pktsched_bit_set((pri - 1),
			    &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		}
		VERIFY(fq_grp != NULL);
		pri--; /* index starts at 0 */
		fq_cl = &fq_grp->fqg_classq[pri];

		if (fq_cl->fcl_budget <= 0) {
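			/*
			 * DRR budget refill: one quantum per active flow,
			 * capped at fcl_drr_max flows. With illustrative
			 * numbers (not taken from this file) of
			 * fcl_quantum == 1500 bytes and fcl_drr_max == 4,
			 * a class with 2 active flows would gain
			 * min(4, 2) * 1500 == 3000 bytes of budget, and a
			 * class with 10 active flows min(4, 10) * 1500 == 6000.
			 */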
			/* Update the budget */
			fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
			    fq_cl->fcl_stat.fcl_flows_cnt) *
			    fq_cl->fcl_quantum);
			if (fq_cl->fcl_budget <= 0) {
				goto state_change;
			}
		}
		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
		    &fq_cl_next_tx_time);
		if (head.cp_mbuf != NULL) {
			ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
			append_pkt(&last, &tmp);
		}
		if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			next_tx_time = fq_cl_next_tx_time;
		}
		fq_cl->fcl_budget -= bytecnt;
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		/*
		 * If the class has exceeded the budget but still has data
		 * to send, move it to IB
		 */
state_change:
		VERIFY(fq_grp != NULL);
		all_paced &= fq_cl_all_paced;
		if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
			if (fq_cl->fcl_budget <= 0) {
				pktsched_bit_set(pri,
				    &fq_grp->fqg_bitmaps[FQ_IF_IB]);
				pktsched_bit_clr(pri,
				    &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			} else if (fq_cl_all_paced) {
				if (ifclassq_enable_pacing &&
				    ifclassq_enable_l4s) {
					/*
					 * If a fq_cl still has budget but only
					 * paced queues, park it in IR so that
					 * we will not keep looping over it
					 */
					pktsched_bit_set(pri,
					    &fq_grp->fqg_bitmaps[FQ_IF_IR]);
					pktsched_bit_clr(pri,
					    &fq_grp->fqg_bitmaps[FQ_IF_ER]);
				}
			}
		} else {
			pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
			    fq_grp->fqg_bitmaps[FQ_IF_EB] |
			    fq_grp->fqg_bitmaps[FQ_IF_IB]) &
			    (1 << pri)) == 0);
			fq_cl->fcl_budget = 0;
		}
		if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
			if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
				/*
				 * Move fq_cl in IR back to ER, so that they
				 * will be inspected with priority the next
				 * time the driver dequeues
				 */
				fqs->grp_bitmaps_move(grp_list, svc_pri,
				    FQ_IF_ER, FQ_IF_IR);
			}
			break;
		}
	}

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
		VERIFY(TAILQ_EMPTY(grp_list));
	}

	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
	    fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}
	if (next_tx_time != FQ_INVALID_TX_TS) {
		ASSERT(next_tx_time > now);
		fq_if_schedule_pacemaker(fqs, now, next_tx_time);
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}

int
fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
    u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC,
	           maxpktcnt, maxbytecnt, first_packet, last_packet,
	           retpktcnt, retbytecnt, grp_idx);
}
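/*
 * A group in "combined" mode is scheduled together with all other combined
 * groups by the common path above, which walks fqs_combined_grp_list with
 * the bitmap scheduler. A separated group carries a single qset, so a
 * per-service-class dequeue can bypass the scheduler and drain one classq
 * directly (fq_if_dequeue_sc_classq_multi_separate below).
 */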
int
fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt,
		           maxbytecnt, first_packet, last_packet, retpktcnt,
		           retbytecnt, grp_idx);
	} else {
		/*
		 * take a shortcut here since there is no need to schedule
		 * one single service class.
		 */
		return fq_if_dequeue_sc_classq_multi_separate(ifq, svc,
		           maxpktcnt, maxbytecnt, first_packet, last_packet,
		           retpktcnt, retbytecnt, grp_idx);
	}
}

static int
fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq,
    mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
    classq_pkt_t *first_packet, classq_pkt_t *last_packet,
    u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
	uint8_t pri;
	u_int32_t total_pktcnt = 0, total_bytecnt = 0;
	fq_if_classq_t *fq_cl;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_group_t *fq_grp;
	uint64_t now;

	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;
#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	STAILQ_INIT(&fq_dqlist_head);
	now = fq_codel_get_time();
	pri = fq_if_service_to_priority(fqs, svc);
	fq_grp = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_grp->fqg_classq[pri];

	/*
	 * Now we have the queue for a particular service class. We need
	 * to dequeue as many packets as needed, first from the new flows
	 * and then from the old flows.
	 */
	while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
	    fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		u_int32_t pktcnt = 0, bytecnt = 0;
		bool all_paced = false;
		uint64_t next_tx_time = FQ_INVALID_TX_TS;

		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, false, now, &all_paced,
		    &next_tx_time);
		if (head.cp_mbuf != NULL) {
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
		}
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		if (next_tx_time != FQ_INVALID_TX_TS) {
			ASSERT(next_tx_time > now);
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			fq_if_schedule_pacemaker(fqs, now, next_tx_time);
			break;
		}
	}

	/*
	 * Mark the classq as IB if it's not idle, so that we can
	 * restart without re-initializing the bitmaps when it is switched
	 * to combined mode.
*/ if (!FQ_IF_CLASSQ_IDLE(fq_cl)) { pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]); pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]); pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]); } else { pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]); VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] | fq_grp->fqg_bitmaps[FQ_IF_EB] | fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0); } fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype); if (__probable(first_packet != NULL)) { *first_packet = first; } if (last_packet != NULL) { *last_packet = last; } if (retpktcnt != NULL) { *retpktcnt = total_pktcnt; } if (retbytecnt != NULL) { *retbytecnt = total_bytecnt; } IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt); fq_if_purge_empty_flow_list(fqs, now, false); return 0; } static void fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp, uint32_t *bytesp, uint64_t now) { fq_if_classq_t *fq_cl; u_int32_t pkts, bytes; pktsched_pkt_t pkt; fq_if_group_t *grp; fq_cl = &FQ_CLASSQ(fq); grp = FQ_GROUP(fq); pkts = bytes = 0; _PKTSCHED_PKT_INIT(&pkt); for (;;) { fq_getq_flow(fqs, fq, &pkt, now); if (pkt.pktsched_pkt_mbuf == NULL) { VERIFY(pkt.pktsched_ptype == QP_INVALID); break; } pkts++; bytes += pktsched_get_pkt_len(&pkt); pktsched_free_pkt(&pkt); } KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay); IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes); /* move through the flow queue states */ VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW))); if (fq->fq_flags & FQF_NEW_FLOW) { fq_if_empty_new_flow(fq, fq_cl); } if (fq->fq_flags & FQF_OLD_FLOW) { fq_if_empty_old_flow(fqs, fq_cl, fq, now); } if (fq->fq_flags & FQF_EMPTY_FLOW) { fq_if_purge_empty_flow(fqs, fq); fq = NULL; } if (FQ_IF_CLASSQ_IDLE(fq_cl)) { int i; for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) { pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]); } } if (pktsp != NULL) { *pktsp = pkts; } if (bytesp != NULL) { *bytesp = bytes; } } static void fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl) { fq_t *fq, *tfq; uint64_t now; now = fq_codel_get_time(); /* * Take each flow from new/old flow list and flush mbufs * in that flow */ STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) { fq_if_purge_flow(fqs, fq, NULL, NULL, now); } STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) { fq_if_purge_flow(fqs, fq, NULL, NULL, now); } VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows)); VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows)); STAILQ_INIT(&fq_cl->fcl_new_flows); STAILQ_INIT(&fq_cl->fcl_old_flows); fq_cl->fcl_budget = 0; } static void fq_if_purge(fq_if_t *fqs) { uint64_t now; fq_if_group_t *grp; int i; IFCQ_CONVERT_LOCK(fqs->fqs_ifq); for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) { if (fqs->fqs_classq_groups[grp_idx] == NULL) { continue; } grp = fq_if_find_grp(fqs, grp_idx); fq_if_purge_grp(fqs, grp); } now = fq_codel_get_time(); fq_if_purge_empty_flow_list(fqs, now, true); VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist)); VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list)); fqs->fqs_large_flow = NULL; for (i = 0; i < FQ_IF_HASH_TABLE_SIZE; i++) { VERIFY(SLIST_EMPTY(&fqs->fqs_flows[i])); } IFCQ_LEN(fqs->fqs_ifq) = 0; IFCQ_BYTES(fqs->fqs_ifq) = 0; } static void fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req) { fq_t *fq; uint64_t now; fq_if_group_t *grp; IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq); req->packets = req->bytes = 0; VERIFY(req->flow != 0); now = fq_codel_get_time(); for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; 
grp_idx++) { if (fqs->fqs_classq_groups[grp_idx] == NULL) { continue; } uint32_t bytes = 0, pkts = 0; grp = fq_if_find_grp(fqs, grp_idx); /* * Packet and traffic type are needed only if we want * to create a flow queue. */ fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C); if (fq != NULL) { fq_if_purge_flow(fqs, fq, &pkts, &bytes, now); req->bytes += bytes; req->packets += pkts; } } } static uint16_t fq_if_calc_quantum(struct ifnet *ifp) { uint16_t quantum; switch (ifp->if_family) { case IFNET_FAMILY_ETHERNET: VERIFY((ifp->if_mtu + ETHER_HDR_LEN) <= UINT16_MAX); quantum = (uint16_t)ifp->if_mtu + ETHER_HDR_LEN; break; case IFNET_FAMILY_CELLULAR: case IFNET_FAMILY_IPSEC: case IFNET_FAMILY_UTUN: VERIFY(ifp->if_mtu <= UINT16_MAX); quantum = (uint16_t)ifp->if_mtu; break; default: quantum = FQ_CODEL_DEFAULT_QUANTUM; break; } if ((ifp->if_hwassist & IFNET_TSOF) != 0) { VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX); VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX); quantum = (uint16_t)MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu); quantum = (quantum != 0) ? quantum : IF_MAXMTU; } quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum); #if DEBUG || DEVELOPMENT quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum; #endif /* DEBUG || DEVELOPMENT */ VERIFY(quantum != 0); return quantum; } static void fq_if_mtu_update(fq_if_t *fqs) { #define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q) \ (_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum = \ FQ_CODEL_QUANTUM_ ## _s(_q) \ uint32_t quantum; fq_if_group_t *grp; quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp); for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) { if (fqs->fqs_classq_groups[grp_idx] == NULL) { continue; } grp = fq_if_find_grp(fqs, grp_idx); if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) { _FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum); } else { _FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum); _FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum); } } #undef _FQ_CLASSQ_UPDATE_QUANTUM } static void fq_if_event(fq_if_t *fqs, cqev_t ev) { IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq); switch (ev) { case CLASSQ_EV_LINK_UP: case CLASSQ_EV_LINK_DOWN: fq_if_purge(fqs); break; case CLASSQ_EV_LINK_MTU: fq_if_mtu_update(fqs); break; default: break; } } static void fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl) { fq_if_purge_classq(fqs, fq_cl); fqs->fqs_throttle = 1; fq_cl->fcl_stat.fcl_throttle_on++; KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START, fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0); } static void fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl) { VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl)); fqs->fqs_throttle = 0; fq_cl->fcl_stat.fcl_throttle_off++; KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END, fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0); } static int fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr) { struct ifclassq *ifq = fqs->fqs_ifq; uint8_t index; fq_if_group_t *grp; #if !MACH_ASSERT #pragma unused(ifq) #endif IFCQ_LOCK_ASSERT_HELD(ifq); if (!tr->set) { tr->level = fqs->fqs_throttle; return 0; } if (tr->level == fqs->fqs_throttle) { return 
EALREADY; } /* Throttling is allowed on BK_SYS class only */ index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS); for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) { if (fqs->fqs_classq_groups[grp_idx] == NULL) { continue; } grp = fq_if_find_grp(fqs, grp_idx); switch (tr->level) { case IFNET_THROTTLE_OFF: fq_if_classq_resume(fqs, &grp->fqg_classq[index]); break; case IFNET_THROTTLE_OPPORTUNISTIC: fq_if_classq_suspend(fqs, &grp->fqg_classq[index]); break; default: break; } } return 0; } static inline boolean_t fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now) { if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) { return true; } fq_cl->fcl_flags &= ~FCL_PACED; fq_cl->fcl_next_tx_time = 0; return false; } static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now) { uint8_t pri; fq_if_classq_t *fq_cl; ASSERT(stat != NULL); pri = fq_if_service_to_priority(fqs, stat->sc); fq_cl = &grp->fqg_classq[pri]; stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt; stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt; if (ifclassq_enable_pacing && ifclassq_enable_l4s && fq_if_is_fq_cl_paced(fq_cl, now)) { stat->packets = 0; stat->bytes = 0; } } static boolean_t fq_if_is_grp_all_paced(fq_if_group_t *grp) { fq_if_classq_t *fq_cl; uint64_t now; if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) { return false; } now = fq_codel_get_time(); for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) { fq_cl = &grp->fqg_classq[fq_cl_idx]; if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) { continue; } if (!fq_if_is_fq_cl_paced(fq_cl, now)) { return false; } } return true; } boolean_t fq_if_is_all_paced(struct ifclassq *ifq) { fq_if_group_t *grp; fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; IFCQ_LOCK_ASSERT_HELD(ifq); if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) { return false; } for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) { grp = fqs->fqs_classq_groups[grp_idx]; if (grp == NULL || FQG_BYTES(grp) == 0) { continue; } if (!fq_if_is_grp_all_paced(grp)) { return false; } } return true; } void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat) { cqrq_stat_sc_t grp_sc_stat; fq_if_group_t *grp; uint64_t now = fq_codel_get_time(); if (stat == NULL) { return; } grp_sc_stat.sc = stat->sc; stat->packets = 0; stat->bytes = 0; if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) { if (stat->sc == MBUF_SC_UNSPEC) { if (!fq_if_is_all_paced(fqs->fqs_ifq)) { stat->packets = IFCQ_LEN(fqs->fqs_ifq); stat->bytes = IFCQ_BYTES(fqs->fqs_ifq); } } else { for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) { grp = fqs->fqs_classq_groups[grp_idx]; if (grp == NULL) { continue; } fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now); stat->packets += grp_sc_stat.packets; stat->bytes += grp_sc_stat.bytes; } } return; } if (stat->sc == MBUF_SC_UNSPEC) { if (fq_if_is_grp_combined(fqs, stat->grp_idx)) { TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) { if (fq_if_is_grp_all_paced(grp)) { continue; } stat->packets += FQG_LEN(grp); stat->bytes += FQG_BYTES(grp); } } else { grp = fq_if_find_grp(fqs, stat->grp_idx); if (!fq_if_is_grp_all_paced(grp)) { stat->packets = FQG_LEN(grp); stat->bytes = FQG_BYTES(grp); } } } else { if (fq_if_is_grp_combined(fqs, stat->grp_idx)) { TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) { if (fq_if_is_grp_all_paced(grp)) { continue; } fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now); stat->packets += grp_sc_stat.packets; stat->bytes += grp_sc_stat.bytes; } } else { grp = 
fq_if_find_grp(fqs, stat->grp_idx);
			fq_if_grp_stat_sc(fqs, grp, stat, now);
		}
	}
}

int
fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg)
{
	int err = 0;
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	/*
	 * These are usually slow operations, convert the lock ahead of time
	 */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	switch (rq) {
	case CLASSQRQ_PURGE:
		fq_if_purge(fqs);
		break;
	case CLASSQRQ_PURGE_SC:
		fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
		break;
	case CLASSQRQ_EVENT:
		fq_if_event(fqs, (cqev_t)arg);
		break;
	case CLASSQRQ_THROTTLE:
		fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
		break;
	case CLASSQRQ_STAT_SC:
		fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
		break;
	}
	return err;
}

int
fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	fq_if_t *fqs = NULL;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

	fqs = fq_if_alloc(ifq, ptype);
	if (fqs == NULL) {
		return ENOMEM;
	}
	if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
		fqs->fqs_flags |= FQS_DRIVER_MANAGED;
		fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
	} else {
		fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
	}
	err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
		    "failed to attach fq_if: %d\n", __func__, err);
		fq_if_destroy(fqs);
		return err;
	}

	/*
	 * Always create one group. If qset 0 is added later,
	 * this group will be updated.
	 */
	err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
		    "failed to create a fq group: %d\n", __func__, err);
		fq_if_destroy(fqs);
	}
	return err;
}

fq_t *
fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, u_int32_t flowid,
    mbuf_svc_class_t svc_class, u_int64_t now, bool create,
    fq_tfc_type_t tfc_type)
{
	fq_t *fq = NULL;
	flowq_list_t *fq_list;
	fq_if_classq_t *fq_cl;
	u_int8_t fqs_hash_id;
	u_int8_t scidx;

	scidx = fq_if_service_to_priority(fqs, svc_class);
	fqs_hash_id = FQ_IF_FLOW_HASH_ID(flowid);
	fq_list = &fqs->fqs_flows[fqs_hash_id];

	SLIST_FOREACH(fq, fq_list, fq_hashlink) {
		if (fq->fq_flowhash == flowid &&
		    fq->fq_sc_index == scidx &&
		    fq->fq_tfc_type == tfc_type &&
		    fq->fq_group == fq_grp) {
			break;
		}
	}
	if (fq == NULL && create) {
		/* If the flow is not already on the list, allocate it */
		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		fq = fq_alloc(fqs->fqs_ptype);
		if (fq != NULL) {
			fq->fq_flowhash = flowid;
			fq->fq_sc_index = scidx;
			fq->fq_group = fq_grp;
			fq->fq_tfc_type = tfc_type;
			fq_cl = &FQ_CLASSQ(fq);
			fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
			fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
			fq->fq_next_tx_time = FQ_INVALID_TX_TS;
			SLIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
			fq_cl->fcl_stat.fcl_flows_cnt++;
			/*
			 * Trace only when the allocation succeeded; fq is
			 * NULL otherwise and must not be dereferenced.
			 */
			KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
			    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
			    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
		}
	} else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
		fq_if_reuse_empty_flow(fqs, fq, now);
	}

	/*
	 * If getq time is not set because this is the first packet or after
	 * idle time, set it now so that we can detect a stall.
*/ if (fq != NULL && fq->fq_getqtime == 0) { fq->fq_getqtime = now; } return fq; } void fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq) { u_int8_t hash_id; ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0); hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash); SLIST_REMOVE(&fqs->fqs_flows[hash_id], fq, flowq, fq_hashlink); IFCQ_CONVERT_LOCK(fqs->fqs_ifq); if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) { fq_if_flow_feedback(fqs, fq, fq_cl); } KDBG(AQM_KTRACE_STATS_FLOW_DESTROY, fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0); fq_destroy(fq, fqs->fqs_ptype); } inline boolean_t fq_if_at_drop_limit(fq_if_t *fqs) { return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ? TRUE : FALSE; } inline boolean_t fq_if_almost_at_drop_limit(fq_if_t *fqs) { /* * Whether we are above 90% of the queue limit. This is used to tell if we * can stop flow controlling the largest flow. */ return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10; } static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now) { ASSERT(fq->fq_flags & FQF_EMPTY_FLOW); TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link); STAILQ_NEXT(fq, fq_actlink) = NULL; fq->fq_flags &= ~FQF_FLOW_STATE_MASK; fq->fq_empty_purge_time = 0; fq->fq_getqtime = 0; fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq); fqs->fqs_empty_list_cnt--; fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq); fq_cl->fcl_stat.fcl_flows_cnt++; } inline void fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, uint64_t now) { ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON)); fq->fq_empty_purge_time = now + fq_empty_purge_delay; TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link); fq->fq_flags |= FQF_EMPTY_FLOW; FQ_CLEAR_OVERWHELMING(fq); fqs->fqs_empty_list_cnt++; /* * fcl_flows_cnt is used in budget determination for the class. * empty flow shouldn't contribute to the budget. */ fq_cl->fcl_stat.fcl_flows_cnt--; } static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq) { fq_if_classq_t *fq_cl; fq_cl = &FQ_CLASSQ(fq); ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0); TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link); fq->fq_flags &= ~FQF_EMPTY_FLOW; fqs->fqs_empty_list_cnt--; /* Remove from the hash list and free the flow queue */ fq_if_destroy_flow(fqs, fq_cl, fq); } static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all) { fq_t *fq, *tmp; int i = 0; if (fqs->fqs_empty_list_cnt == 0) { ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list)); return; } TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) { if (!purge_all && ((now < fq->fq_empty_purge_time) || (i++ == FQ_EMPTY_PURGE_MAX))) { break; } fq_if_purge_empty_flow(fqs, fq); } if (__improbable(purge_all)) { VERIFY(fqs->fqs_empty_list_cnt == 0); VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list)); } } static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, uint64_t now) { /* * Remove the flow queue from the old flows list. 
*/ STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink); fq->fq_flags &= ~FQF_OLD_FLOW; fq_cl->fcl_stat.fcl_oldflows_cnt--; VERIFY(fq->fq_bytes == 0); /* release any flow control */ if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) { fq_if_flow_feedback(fqs, fq, fq_cl); } /* move the flow queue to empty flows list */ fq_if_move_to_empty_flow(fqs, fq_cl, fq, now); } static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl) { /* Move to the end of old queue list */ STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq, flowq, fq_actlink); fq->fq_flags &= ~FQF_NEW_FLOW; fq_cl->fcl_stat.fcl_newflows_cnt--; STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink); fq->fq_flags |= FQF_OLD_FLOW; fq_cl->fcl_stat.fcl_oldflows_cnt++; } inline void fq_if_drop_packet(fq_if_t *fqs, uint64_t now) { fq_t *fq = fqs->fqs_large_flow; fq_if_classq_t *fq_cl; pktsched_pkt_t pkt; volatile uint32_t *pkt_flags; uint64_t *pkt_timestamp; if (fq == NULL) { return; } /* queue can not be empty on the largest flow */ VERIFY(!fq_empty(fq, fqs->fqs_ptype)); fq_cl = &FQ_CLASSQ(fq); _PKTSCHED_PKT_INIT(&pkt); fq_getq_flow_internal(fqs, fq, &pkt); ASSERT(pkt.pktsched_ptype != QP_INVALID); pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL, NULL, NULL, NULL); IFCQ_CONVERT_LOCK(fqs->fqs_ifq); *pkt_timestamp = 0; switch (pkt.pktsched_ptype) { case QP_MBUF: *pkt_flags &= ~PKTF_PRIV_GUARDED; break; #if SKYWALK case QP_PACKET: /* sanity check */ ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0); break; #endif /* SKYWALK */ default: VERIFY(0); /* NOTREACHED */ __builtin_unreachable(); } if (fq_empty(fq, fqs->fqs_ptype)) { fqs->fqs_large_flow = NULL; if (fq->fq_flags & FQF_OLD_FLOW) { fq_if_empty_old_flow(fqs, fq_cl, fq, now); } else { VERIFY(fq->fq_flags & FQF_NEW_FLOW); fq_if_empty_new_flow(fq, fq_cl); } } IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt)); pktsched_free_pkt(&pkt); fq_cl->fcl_stat.fcl_drop_overflow++; } inline void fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq) { fq_t *prev_fq; if (fqs->fqs_large_flow != NULL && fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) { fqs->fqs_large_flow = NULL; } if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) { return; } prev_fq = fqs->fqs_large_flow; if (prev_fq == NULL) { if (!fq_empty(fq, fqs->fqs_ptype)) { fqs->fqs_large_flow = fq; } return; } else if (fq->fq_bytes > prev_fq->fq_bytes) { fqs->fqs_large_flow = fq; } } boolean_t fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc, fq_t *fq, fq_if_classq_t *fq_cl) { struct flowadv_fcentry *fce; #if DEBUG || DEVELOPMENT if (__improbable(ifclassq_flow_control_adv == 0)) { os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__); return TRUE; } #endif /* DEBUG || DEVELOPMENT */ STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) { if ((uint8_t)fce->fce_flowsrc_type == flowsrc && fce->fce_flowid == fq->fq_flowhash) { /* Already on flowcontrol list */ return TRUE; } } IFCQ_CONVERT_LOCK(fqs->fqs_ifq); fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK); if (fce != NULL) { /* XXX Add number of bytes in the queue */ STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link); fq_cl->fcl_stat.fcl_flow_control++; os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, " "flow: 0x%x, iface: %s, B:%u\n", __func__, fq_cl->fcl_stat.fcl_flow_control, fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes); KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START, fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, 
fq->fq_min_qdelay); } return (fce != NULL) ? TRUE : FALSE; } static void fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce) { STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link); STAILQ_NEXT(fce, fce_link) = NULL; flowadv_add_entry(fce); } void fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl) { struct flowadv_fcentry *fce = NULL; IFCQ_CONVERT_LOCK(fqs->fqs_ifq); STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) { if (fce->fce_flowid == fq->fq_flowhash) { break; } } if (fce != NULL) { fq_cl->fcl_stat.fcl_flow_feedback++; fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK; os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, " "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__, fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index, fce->fce_flowsrc_type, fce->fce_flowid, if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index, fq->fq_bytes); fq_if_remove_fcentry(fqs, fce); KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END, fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay); } fq->fq_flags &= ~FQF_FLOWCTL_ON; } boolean_t fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt, uint32_t pkt_cnt) { struct flowadv_fcentry *fce; #if DEBUG || DEVELOPMENT if (__improbable(ifclassq_flow_control_adv == 0)) { os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__); return TRUE; } #endif /* DEBUG || DEVELOPMENT */ IFCQ_CONVERT_LOCK(fqs->fqs_ifq); fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK); if (fce != NULL) { fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED; fce->fce_ce_cnt = ce_cnt; fce->fce_pkts_since_last_report = pkt_cnt; flowadv_add_entry(fce); } return (fce != NULL) ? TRUE : FALSE; } void fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit, int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom, uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist, bool budget_restricted, uint64_t now, bool *fq_cl_paced, uint64_t *next_tx_time) { fq_t *fq = NULL, *tfq = NULL; flowq_stailq_t temp_stailq; uint32_t pktcnt, bytecnt; boolean_t qempty, limit_reached = FALSE; bool all_paced = true; classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); fq_getq_flow_t fq_getq_flow_fn; classq_pkt_t *head, *tail; uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS; switch (fqs->fqs_ptype) { case QP_MBUF: fq_getq_flow_fn = fq_getq_flow_mbuf; break; #if SKYWALK case QP_PACKET: fq_getq_flow_fn = fq_getq_flow_kpkt; break; #endif /* SKYWALK */ default: VERIFY(0); /* NOTREACHED */ __builtin_unreachable(); } /* * maximum byte limit should not be greater than the budget for * this class */ if (bytelimit > fq_cl->fcl_budget && budget_restricted) { bytelimit = fq_cl->fcl_budget; } VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL); pktcnt = bytecnt = 0; STAILQ_INIT(&temp_stailq); STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) { ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == FQF_NEW_FLOW); uint64_t fq_tx_time; if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) { ASSERT(fq_tx_time != FQ_INVALID_TX_TS); if (fq_tx_time < fq_cl_tx_time) { fq_cl_tx_time = fq_tx_time; } continue; } all_paced = false; if (fq_dqlist != NULL) { if (!fq->fq_in_dqlist) { fq_dqlist_add(fq_dqlist, fq); } head = &fq->fq_dq_head; tail = &fq->fq_dq_tail; } else { ASSERT(!fq->fq_in_dqlist); head = top; tail = &last; } limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit, pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now); /* * From RFC 8290: * if that queue has a 
 * negative number of credits (i.e., it has already
 * dequeued at least a quantum of bytes), it is given an additional
 * quantum of credits, the queue is put onto _the end of_ the list of
 * old queues, and the routine selects the next queue and starts again.
 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}
		// TODO: add credit when it's now paced? so that the fq is
		// treated the same as empty
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (limit_reached) {
			goto done;
		}
	}

	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;
		uint64_t fq_tx_time;

		if (__improbable(!fq_tx_time_ready(fqs, fq, now,
		    &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}

	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}

void
fq_if_teardown_ifclassq(struct ifclassq *ifq)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL);
	fq_if_destroy(fqs);
	ifq->ifcq_disc = NULL;
	ifclassq_detach(ifq);
}

static void
fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
    struct fq_codel_flowstats *flowstat)
{
	bzero(flowstat, sizeof(*flowstat));
	flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
	flowstat->fqst_bytes = fq->fq_bytes;
	flowstat->fqst_flowhash = fq->fq_flowhash;
	if (fq->fq_flags & FQF_NEW_FLOW) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
	}
	if (fq->fq_flags & FQF_DELAY_HIGH) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
	}
	if (fq->fq_flags & FQF_FLOWCTL_ON) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
	}
	if (fqs->fqs_large_flow == fq) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
	}
}
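/*
 * Per-class stats export: fcls_flowstats is a fixed-size array of
 * FQ_IF_MAX_FLOWSTATS entries, so the snapshot below copies new flows
 * first but stops early to leave roughly half of the space for old flows
 * when both lists are long.
 */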
int
fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
    struct if_ifclassq_stats *ifqs)
{
	struct fq_codel_classstats *fcls;
	fq_if_classq_t *fq_cl;
	fq_if_t *fqs;
	fq_t *fq = NULL;
	fq_if_group_t *grp;
	u_int32_t i, flowstat_cnt;

	if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
		return EINVAL;
	}

	fqs = (fq_if_t *)ifq->ifcq_disc;
	if (fqs->fqs_classq_groups[gid] == NULL) {
		return ENXIO;
	}

	fcls = &ifqs->ifqs_fq_codel_stats;
	fq_cl = &FQS_CLASSQ(fqs, gid, qid);
	grp = fq_if_find_grp(fqs, gid);

	fcls->fcls_pri = fq_cl->fcl_pri;
	fcls->fcls_service_class = fq_cl->fcl_service_class;
	fcls->fcls_quantum = fq_cl->fcl_quantum;
	fcls->fcls_drr_max = fq_cl->fcl_drr_max;
	fcls->fcls_budget = fq_cl->fcl_budget;
	fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
	fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
	fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
	fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
	fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
	fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
	fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
	fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
	fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
	fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
	fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
	fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
	fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
	fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
	fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
	fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
	fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
	fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
	fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
	fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
	fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
	fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
	fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
	fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
	fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
	fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
	fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
	fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
	fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
	fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
	fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
	fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
	fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;

	/* Gather per flow stats */
	flowstat_cnt = min((fcls->fcls_newflows_cnt +
	    fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
	i = 0;
	STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
		if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
			break;
		}
		/* leave space for a few old flows */
		if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
		    i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
			break;
		}
		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
		i++;
	}
	STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
		if (i >= flowstat_cnt) {
			break;
		}
		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
		i++;
	}
	VERIFY(i <= flowstat_cnt);
	fcls->fcls_flowstats_cnt = i;
	return 0;
}
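/*
 * Annotation (not in the original): _FQ_CLASSQ_INIT below builds every
 * per-class identifier by token-pasting the service-class name; e.g.
 * _FQ_CLASSQ_INIT(grp, VI, quantum) expands to
 * fq_if_classq_init(grp, FQ_IF_VI_INDEX, FQ_CODEL_QUANTUM_VI(quantum),
 *     FQ_CODEL_DRR_MAX(VI), MBUF_SC_VI).
 */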
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
#define _FQ_CLASSQ_INIT(_grp, _s, _q) \
	fq_if_classq_init(_grp, FQ_IF_ ## _s ## _INDEX, \
	    FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s), \
	    MBUF_SC_ ## _s)

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);
	fqs = (fq_if_t *)ifcq->ifcq_disc;

	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}
	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}
	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares the same INDEX as VI */
		_CASSERT(SCIDX_SIG == SCIDX_VI);
		_CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}

fq_if_group_t *
fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
{
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	grp = fqs->fqs_classq_groups[grp_idx];
	VERIFY(grp != NULL);
	return grp;
}

static void
fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
{
	for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
		fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
	}
	bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
	grp->fqg_len = 0;
	grp->fqg_bytes = 0;
	fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
}

void
fq_if_destroy_grps(fq_if_t *fqs)
{
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		grp = fq_if_find_grp(fqs, grp_idx);
		fq_if_purge_grp(fqs, grp);
		zfree(fq_if_grp_zone, grp);
		fqs->fqs_classq_groups[grp_idx] = NULL;
	}
}

static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}
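/*
 * Sketch of the grouping model (annotation, inferred from the bitmap and
 * list usage here): a bit set in fqs_combined_grp_bitmap means the group is
 * linked on fqs_combined_grp_list and scheduled together with the other
 * combined groups, while a cleared bit means the group is dequeued
 * separately through the per-group (grp_idx) paths.
 */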
void
fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
{
	fq_if_t *fqs;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(ifcq);

	fqs = (fq_if_t *)ifcq->ifcq_disc;
	grp = fq_if_find_grp(fqs, grp_idx);

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		return;
	}

	/*
	 * We keep the current fq_deficit and fcl_budget when combining a
	 * group. That might disrupt the AQM but only for a moment.
	 */
	pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
	TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
}

void
fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
{
	fq_if_t *fqs;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(ifcq);

	fqs = (fq_if_t *)ifcq->ifcq_disc;
	grp = fq_if_find_grp(fqs, grp_idx);

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		return;
	}

	pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
	TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
}
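/*
 * Usage note (annotation): both toggles above are idempotent -- combining an
 * already-combined group or separating an already-separated one returns
 * early without touching the bitmap or the list, which is why
 * fq_if_create_grp() can apply the IF_DEFAULT_GRP choice unconditionally on
 * every update.
 */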