/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include
#include
#include
#include
#include
#include

#define KRING_EMPTY_TX(_kring, _index)	\
	((_kring)->ckr_rhead == (_index))

#define KRING_FULL_RX(_kring, _index)	\
	((_kring)->ckr_khead == SLOT_NEXT((_index), (_kring)->ckr_lim))

uint32_t
kern_channel_notify(const kern_channel_ring_t kring, uint32_t flags)
{
#pragma unused(flags)
	if (__improbable(KR_DROP(kring))) {
		return ENXIO;
	}

	return kring->ckr_na_notify(kring, kernproc, 0);
}

uint32_t
kern_channel_reclaim(const kern_channel_ring_t kring)
{
	return kr_reclaim(kring);
}

static inline uint32_t
_kern_channel_available_slot_count_tx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	ASSERT(kring->ckr_tx == NR_TX);

	if (kring->ckr_rhead < index) {
		return kring->ckr_num_slots + kring->ckr_rhead - index;
	}
	return kring->ckr_rhead - index;
}

static inline uint32_t
_kern_channel_available_slot_count_rx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	uint32_t busy;
	slot_idx_t lim = kring->ckr_lim;

	ASSERT(kring->ckr_tx == NR_RX);

	if (index < kring->ckr_khead) {
		busy = kring->ckr_num_slots + index - kring->ckr_khead;
	} else {
		busy = index - kring->ckr_khead;
	}
	ASSERT(lim >= busy);
	return lim - busy;
}

uint32_t
kern_channel_available_slot_count(const kern_channel_ring_t kring)
{
	if (kring->ckr_tx == NR_TX) {
		return _kern_channel_available_slot_count_tx(kring,
		           kring->ckr_khead);
	} else {
		return _kern_channel_available_slot_count_rx(kring,
		           kring->ckr_ktail);
	}
}
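/*
 * A minimal illustrative sketch (disabled): the availability helpers above
 * are plain circular-buffer arithmetic modulo the ring size.  The user-space
 * model below mirrors _kern_channel_available_slot_count_tx() with
 * hypothetical ring values; tx_available() and its numbers are assumptions
 * made purely for illustration.
 */
#if 0
#include <assert.h>
#include <stdint.h>

typedef uint32_t slot_idx_t;	/* stand-in for the kernel typedef */

/* same computation as _kern_channel_available_slot_count_tx() above */
static uint32_t
tx_available(uint32_t num_slots, slot_idx_t rhead, slot_idx_t khead)
{
	if (rhead < khead) {
		return num_slots + rhead - khead;	/* wrapped case */
	}
	return rhead - khead;				/* linear case */
}

static void
tx_available_example(void)
{
	/* 8-slot ring: kernel head at 6, user head (rhead) wrapped to 2 */
	assert(tx_available(8, 2, 6) == 4);
	/* same ring without wrap: kernel head at 2, user head at 6 */
	assert(tx_available(8, 6, 2) == 4);
	/* the RX variant instead subtracts the busy count from ckr_lim */
}
#endif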
kern_channel_slot_t
kern_channel_get_next_slot(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot0, struct kern_slot_prop *prop)
{
	kern_channel_slot_t slot;
	slot_idx_t slot_idx;

	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	if (__improbable(slot0 == NULL)) {
		if (kring->ckr_tx == NR_TX) {
			slot_idx = kring->ckr_khead;
		} else {
			slot_idx = kring->ckr_ktail;
		}
	} else {
		slot_idx = SLOT_NEXT(KR_SLOT_INDEX(kring, slot0),
		    kring->ckr_lim);
	}

	ASSERT(slot_idx < kring->ckr_num_slots);

	if (kring->ckr_tx == NR_TX) {
		if (__improbable(KRING_EMPTY_TX(kring, slot_idx))) {
			SK_DF(SK_VERB_SYNC | SK_VERB_TX,
			    "EMPTY_TX: na \"%s\" kr \"%s\" "
			    "i %u (kc %u kt %u kl %u | rh %u rt %u)",
			    KRNA(kring)->na_name, kring->ckr_name, slot_idx,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_klease, kring->ckr_rhead,
			    kring->ckr_rtail);
			slot = NULL;
		} else {
			slot = &kring->ckr_ksds[slot_idx];
		}
	} else {
		if (__improbable(KRING_FULL_RX(kring, slot_idx))) {
			SK_DF(SK_VERB_SYNC | SK_VERB_RX,
			    "FULL_RX: na \"%s\" kr \"%s\" "
			    "i %u (kc %u kt %u kl %u | rh %u rt %u)",
			    KRNA(kring)->na_name, kring->ckr_name, slot_idx,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_klease, kring->ckr_rhead,
			    kring->ckr_rtail);
			slot = NULL;
		} else {
			slot = &kring->ckr_ksds[slot_idx];
		}
	}

	if (prop != NULL) {
		bzero(prop, sizeof(*prop));
	}

	return slot;
}

static inline void
_kern_channel_advance_slot_tx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());
	kr_txkring_reclaim_and_refill(kring, index);
}

static inline void
_kern_channel_advance_slot_rx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	ASSERT(kring->ckr_tx == NR_RX || kring->ckr_tx == NR_EV);

	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());
	kring->ckr_ktail = SLOT_NEXT(index, kring->ckr_lim);
}

void
kern_channel_advance_slot(const kern_channel_ring_t kring,
    kern_channel_slot_t slot)
{
	slot_idx_t index = KR_SLOT_INDEX(kring, slot);
	ASSERT(index < kring->ckr_num_slots);

	if (kring->ckr_tx == NR_TX) {
		_kern_channel_advance_slot_tx(kring, index);
	} else {
		_kern_channel_advance_slot_rx(kring, index);
	}
}

void *
kern_channel_get_context(const kern_channel_t ch)
{
	return ch->ch_ctx;
}

void *
kern_channel_ring_get_context(const kern_channel_ring_t kring)
{
	return kring->ckr_ctx;
}

errno_t
kern_channel_ring_get_container(const kern_channel_ring_t kring,
    kern_packet_t **array, uint32_t *count)
{
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	if (array == NULL) {
		return EINVAL;
	}

	*array = kring->ckr_scratch;
	if (count != NULL) {
		*count = na_get_nslots(kring->ckr_na, kring->ckr_tx);
	}

	return 0;
}

void *
kern_channel_slot_get_context(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot)
{
	slot_idx_t i = KR_SLOT_INDEX(kring, slot);
	void *slot_ctx = NULL;

	if (kring->ckr_slot_ctxs != NULL) {
		slot_ctx = (void *)(kring->ckr_slot_ctxs[i].slot_ctx_arg);
	}

	return slot_ctx;
}

void
kern_channel_increment_ring_stats(kern_channel_ring_t kring,
    struct kern_channel_ring_stat_increment *stats)
{
	kr_update_stats(kring, stats->kcrsi_slots_transferred,
	    stats->kcrsi_bytes_transferred);
}

void
kern_channel_increment_ring_net_stats(kern_channel_ring_t kring,
    struct ifnet *ifp, struct kern_channel_ring_stat_increment *stats)
{
	if (kring->ckr_tx == NR_TX) {
		os_atomic_add(&ifp->if_data.ifi_opackets,
		    stats->kcrsi_slots_transferred, relaxed);
		os_atomic_add(&ifp->if_data.ifi_obytes,
		    stats->kcrsi_bytes_transferred, relaxed);
	} else {
		os_atomic_add(&ifp->if_data.ifi_ipackets,
		    stats->kcrsi_slots_transferred, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes,
		    stats->kcrsi_bytes_transferred, relaxed);
	}

	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	kr_update_stats(kring, stats->kcrsi_slots_transferred,
	    stats->kcrsi_bytes_transferred);
}
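/*
 * A minimal illustrative sketch (disabled): the typical consumer pattern for
 * the slot KPIs above, written as a hypothetical TX walk.  It assumes it is
 * called from a ring sync context (sk_is_sync_protected() holds), and
 * process_packet() is a made-up helper; only KPIs defined in this file are
 * relied upon.
 */
#if 0
static void
example_tx_sync_walk(const kern_channel_ring_t kring)
{
	kern_channel_slot_t slot, last = NULL;
	kern_packet_t ph;

	/* walk from the current head; NULL means no more work on the ring */
	for (slot = kern_channel_get_next_slot(kring, NULL, NULL);
	    slot != NULL;
	    slot = kern_channel_get_next_slot(kring, slot, NULL)) {
		ph = kern_channel_slot_get_packet(kring, slot);
		if (ph != 0) {
			process_packet(ph);	/* hypothetical helper */
		}
		last = slot;
	}

	/* report the last slot handled so the ring indices can move up */
	if (last != NULL) {
		kern_channel_advance_slot(kring, last);
	}
}
#endif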
kern_packet_t
kern_channel_slot_get_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot)
{
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
#pragma unused(kring)
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */
	struct __kern_quantum *kqum = ksd->sd_qum;

	if (__improbable(kqum == NULL ||
	    (kqum->qum_qflags & QUM_F_DROPPED) != 0)) {
		return 0;
	}

	return SD_GET_TAGGED_METADATA(ksd);
}

errno_t
kern_channel_slot_attach_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot, kern_packet_t ph)
{
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
#pragma unused(kring)
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */

	return KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
}

errno_t
kern_channel_slot_detach_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot, kern_packet_t ph)
{
#pragma unused(ph)
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */

	ASSERT(SK_PTR_ADDR_KQUM(ph) ==
	    SK_PTR_ADDR_KQUM(SD_GET_TAGGED_METADATA(ksd)));
	(void) KR_SLOT_DETACH_METADATA(kring, ksd);

	return 0;
}
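/*
 * A minimal illustrative sketch (disabled): how the attach/advance/notify
 * KPIs compose on the receive side.  A hypothetical provider holding a
 * finished packet `ph` could publish it like this; it assumes a sync
 * context (sk_is_sync_protected() holds) and keeps error handling to the
 * bare minimum for illustration.
 */
#if 0
static errno_t
example_rx_deliver_one(const kern_channel_ring_t rx_kring, kern_packet_t ph)
{
	kern_channel_slot_t slot;
	errno_t err;

	/* grab the next free RX slot; NULL means the ring is full */
	slot = kern_channel_get_next_slot(rx_kring, NULL, NULL);
	if (slot == NULL) {
		return ENOBUFS;
	}

	/* tag the slot with the packet's metadata */
	err = kern_channel_slot_attach_packet(rx_kring, slot, ph);
	if (err != 0) {
		return err;
	}

	/* make the slot visible to the channel and wake up the consumer */
	kern_channel_advance_slot(rx_kring, slot);
	(void) kern_channel_notify(rx_kring, 0);
	return 0;
}
#endif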
static errno_t
kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending, boolean_t canblock)
{
#pragma unused(tx_doorbell_ctxt)
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;

	KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_START), SK_KVA(hw_kring));

	VERIFY(hw_kring != NULL);
	hwna = KRNA(hw_kring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(hw_kring->ckr_tx == NR_TX);

	*pkts_pending = FALSE;

	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * if the ring is busy, it means another dequeue is in
	 * progress, so ignore this request and return success.
	 */
	if (kr_enter(hw_kring, canblock) != 0) {
		rc = 0;
		goto out;
	}

	if (__improbable(KR_DROP(hw_kring) ||
	    !NA_IS_ACTIVE(hw_kring->ckr_na))) {
		kr_exit(hw_kring);
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(hw_kring));
		rc = ENXIO;
		goto out;
	}

	/*
	 * Unlikely to get here, unless a channel is opened by
	 * a user process directly to the netif.  Issue a TX sync
	 * on the netif device TX ring.
	 */
	protect = sk_sync_protect();
	sync_err = hw_kring->ckr_na_sync(hw_kring, kernproc, NA_SYNCF_NETIF);
	sk_sync_unprotect(protect);
	kr_exit(hw_kring);

	if (rc == 0) {
		rc = sync_err;
	}

out:
	KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_END), SK_KVA(hw_kring),
	    rc, 0, 0);
	return rc;
}

errno_t
kern_channel_tx_refill(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending)
{
	if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
		return netif_ring_tx_refill(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
	} else {
		return kern_channel_tx_refill_common(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
	}
}

errno_t
kern_channel_tx_refill_canblock(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending)
{
	if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
		return netif_ring_tx_refill(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
	} else {
		return kern_channel_tx_refill_common(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
	}
}

errno_t
kern_channel_get_service_class(const kern_channel_ring_t kring,
    kern_packet_svc_class_t *svc)
{
	if ((KRNA(kring)->na_type != NA_NETIF_DEV) ||
	    (kring->ckr_tx == NR_RX) || (kring->ckr_svc == KPKT_SC_UNSPEC)) {
		return ENOTSUP;
	}

	*svc = kring->ckr_svc;
	return 0;
}
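/*
 * A minimal illustrative sketch (disabled): a hypothetical caller of
 * kern_channel_tx_refill(), e.g. from a driver start/doorbell path.  The
 * limits are arbitrary illustration values; note that a zero return does
 * not guarantee packets moved, since a busy ring is treated as success
 * (see kern_channel_tx_refill_common() above).
 */
#if 0
static void
example_tx_doorbell(const kern_channel_ring_t hw_ring)
{
	boolean_t pending = FALSE;
	errno_t err;

	/* ask for at most 64 packets / 64KB in this pass */
	err = kern_channel_tx_refill(hw_ring, 64, 64 * 1024,
	    FALSE, &pending);
	if (err == 0 && pending) {
		/* more work remained; a real caller would reschedule */
	}
}
#endif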
void
kern_channel_flowadv_clear(struct flowadv_fcentry *fce)
{
	const flowadv_token_t ch_token = fce->fce_flowsrc_token;
	const flowadv_token_t flow_token = fce->fce_flowid;
	const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
	struct ifnet *ifp = fce->fce_ifp;
	struct nexus_adapter *hwna;
	struct kern_nexus *fsw_nx;
	struct kern_channel *ch = NULL;
	struct nx_flowswitch *fsw;

	_CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));

	SK_LOCK();

	if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
		goto done;
	}

	hwna = &ifp->if_na->nifna_up;

	VERIFY((hwna->na_type == NA_NETIF_DEV) ||
	    (hwna->na_type == NA_NETIF_COMPAT_DEV));

	if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
		goto done;
	}

	fsw_nx = fsw->fsw_nx;
	VERIFY(fsw_nx != NULL);

	/* find the channel */
	STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
		if (ch_token == ch->ch_info->cinfo_ch_token) {
			break;
		}
	}

	if (ch != NULL) {
		if (ch->ch_na != NULL &&
		    na_flowadv_clear(ch, flow_fidx, flow_token)) {
			/* trigger flow advisory kevent */
			na_flowadv_event(
				&ch->ch_na->na_tx_rings[ch->ch_first[NR_TX]]);
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) notified of flow update",
			    ch->ch_name, ch->ch_pid);
		} else if (ch->ch_na == NULL) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) is closing (flow update ignored)",
			    ch->ch_name, ch->ch_pid);
		}
	} else {
		SK_ERR("channel token 0x%x fidx %u on %s not found",
		    ch_token, flow_fidx, ifp->if_xname);
	}

done:
	SK_UNLOCK();
}

void
kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce,
    uint32_t ce_cnt, uint32_t total_pkt_cnt)
{
	const flowadv_token_t ch_token = fce->fce_flowsrc_token;
	const flowadv_token_t flow_token = fce->fce_flowid;
	const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
	struct ifnet *ifp = fce->fce_ifp;
	struct nexus_adapter *hwna;
	struct kern_nexus *fsw_nx;
	struct kern_channel *ch = NULL;
	struct nx_flowswitch *fsw;

	_CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));

	SK_LOCK();

	if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
		goto done;
	}

	hwna = &ifp->if_na->nifna_up;

	VERIFY((hwna->na_type == NA_NETIF_DEV) ||
	    (hwna->na_type == NA_NETIF_COMPAT_DEV));

	if (!NA_IS_ACTIVE(hwna) ||
	    (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
		goto done;
	}

	fsw_nx = fsw->fsw_nx;
	VERIFY(fsw_nx != NULL);

	/* find the channel */
	STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
		if (ch_token == ch->ch_info->cinfo_ch_token) {
			break;
		}
	}

	if (ch != NULL) {
		if (ch->ch_na != NULL &&
		    na_flowadv_report_ce_event(ch, flow_fidx, flow_token,
		    ce_cnt, total_pkt_cnt)) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) notified of flow update",
			    ch->ch_name, ch->ch_pid);
		} else if (ch->ch_na == NULL) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) is closing (flow update ignored)",
			    ch->ch_name, ch->ch_pid);
		}
	} else {
		SK_ERR("channel token 0x%x fidx %u on %s not found",
		    ch_token, flow_fidx, ifp->if_xname);
	}

done:
	SK_UNLOCK();
}

void
kern_channel_memstatus(struct proc *p, uint32_t status,
    struct kern_channel *ch)
{
#pragma unused(p, status)
	SK_LOCK_ASSERT_NOTHELD();

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(proc_pid(p) == ch->ch_pid);

	/*
	 * If we're already draining, then bail.  Otherwise, check it
	 * again via na_drain() with the channel lock held.
	 */
	if (ch->ch_na->na_flags & NAF_DRAINING) {
		return;
	}

	SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b status %s",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
	    ch->ch_flags, CHANF_BITS, sk_memstatus2str(status));

	/* serialize accesses against channel syscalls */
	lck_mtx_lock(&ch->ch_lock);
	na_drain(ch->ch_na, TRUE);	/* purge caches */
	lck_mtx_unlock(&ch->ch_lock);
}

static bool
_kern_channel_defunct_eligible(struct kern_channel *ch)
{
	struct nexus_upipe_adapter *pna;

	if ((ch->ch_info->cinfo_ch_mode & CHMODE_DEFUNCT_OK) == 0) {
		return false;
	}
	if (ch->ch_na->na_type != NA_USER_PIPE) {
		return true;
	}
	pna = (struct nexus_upipe_adapter *)ch->ch_na;
	if ((pna->pna_parent->na_flags & NAF_DEFUNCT_OK) == 0) {
		return false;
	}
	return true;
}

void
kern_channel_defunct(struct proc *p, struct kern_channel *ch)
{
#pragma unused(p)
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;

	SK_LOCK_ASSERT_NOTHELD();

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(proc_pid(p) == ch->ch_pid);

	/*
	 * If the channel is eligible for defunct, mark it as such.
	 * Otherwise, set the draining flag which tells the reaper
	 * thread to purge any cached objects associated with it.
	 * That draining flag will be cleared then, which allows the
	 * channel to cache objects again once the process is resumed.
	 */
	if (_kern_channel_defunct_eligible(ch)) {
		struct kern_nexus *nx = ch->ch_nexus;
		struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
		boolean_t need_defunct;
		int err;

		/*
		 * This may be called often, so check first (without lock) if
		 * the trapdoor flag CHANF_DEFUNCT has been set and bail if so,
		 * for performance reasons.  This check is repeated below with
		 * the channel lock held.
		 */
		if (ch->ch_flags & CHANF_DEFUNCT) {
			return;
		}

		SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b",
		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
		    ch->ch_flags, CHANF_BITS);

		/* serialize accesses against channel syscalls */
		lck_mtx_lock(&ch->ch_lock);

		/*
		 * If opportunistic defunct is in effect, skip the rest of
		 * the defunct work based on two cases:
		 *
		 *   a) if the channel isn't using user packet pool; or
		 *   b) if the channel is using user packet pool and we
		 *      detect that there are outstanding allocations.
		 *
		 * Note that for case (a) above we essentially treat the
		 * channel as ineligible for defunct, and although it may
		 * be idle we'd leave the memory mapping intact.  This
		 * should not be a concern as the majority of channels are
		 * on flowswitches where user packet pool is mandatory.
		 *
		 * If skipping, mark the channel with CHANF_DEFUNCT_SKIP
		 * and increment the stats (for flowswitch only).
		 */
		if (sk_opp_defunct && (!(ch_mode & CHMODE_USER_PACKET_POOL) ||
		    !pp_isempty_upp(ch->ch_pp))) {
			if (ch->ch_na->na_type == NA_FLOWSWITCH_VP) {
				struct nx_flowswitch *fsw =
				    VPNA(ch->ch_na)->vpna_fsw;
				STATS_INC(&fsw->fsw_stats,
				    FSW_STATS_CHAN_DEFUNCT_SKIP);
			}
			os_atomic_or(&ch->ch_flags, CHANF_DEFUNCT_SKIP,
			    relaxed);
			/* skip defunct */
			lck_mtx_unlock(&ch->ch_lock);
			return;
		}
		os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);

		/*
		 * Proceed with the rest of the defunct work.
		 */
		if (os_atomic_or_orig(&ch->ch_flags, CHANF_DEFUNCT, relaxed) &
		    CHANF_DEFUNCT) {
			/* already defunct; nothing to do */
			lck_mtx_unlock(&ch->ch_lock);
			return;
		}

		/* mark this channel as inactive */
		ch_deactivate(ch);

		/*
		 * Redirect memory regions for the map; upon success, instruct
		 * the nexus to finalize the defunct and teardown the
		 * respective memory regions.  It's crucial that the
		 * redirection happens first before freeing the objects,
		 * since the page protection flags get inherited only from
		 * unfreed segments.  Freed ones will cause VM_PROT_NONE to
		 * be used for the segment span, to catch use-after-free
		 * cases.  For unfreed objects, doing so may cause an
		 * exception when the process is later resumed and touches
		 * an address within the span; hence the ordering.
		 */
		if ((err = skmem_arena_mredirect(ch->ch_na->na_arena,
		    &ch->ch_mmap, p, &need_defunct)) == 0 && need_defunct) {
			/*
			 * Let the domain provider handle the initial tasks of
			 * the defunct that are specific to this channel.  It
			 * may safely free objects as the redirection is done.
			 */
			nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
			    nx, ch, p);
			/*
			 * Let the domain provider complete the defunct;
			 * do this after dropping the channel lock, as
			 * the nexus may end up acquiring other locks
			 * that would otherwise violate lock ordering.
			 * The channel refcnt is still held by virtue
			 * of the caller holding the process's file
			 * table lock.
			 */
			lck_mtx_unlock(&ch->ch_lock);
			nxdom_prov->nxdom_prov_dom->nxdom_defunct_finalize(
				nxdom_prov, nx, ch, FALSE);
		} else if (err == 0) {
			/*
			 * Let the domain provider handle the initial tasks of
			 * the defunct that are specific to this channel.  It
			 * may safely free objects as the redirection is done.
			 */
			nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
			    nx, ch, p);
			lck_mtx_unlock(&ch->ch_lock);
		} else {
			/* already redirected; nothing to do */
			lck_mtx_unlock(&ch->ch_lock);
		}
	} else {
		lck_mtx_lock(&ch->ch_lock);
		na_drain(ch->ch_na, FALSE);	/* prune caches */
		lck_mtx_unlock(&ch->ch_lock);
	}
}
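/*
 * A minimal illustrative sketch (disabled): kern_channel_defunct() above
 * uses an atomic fetch-or (os_atomic_or_orig) so that exactly one caller
 * observes the CHANF_DEFUNCT transition and performs the teardown.  The
 * user-space model below expresses the same set-once idiom with C11
 * atomics; the flag value and helper name are made up for illustration.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define XFLAG_DEFUNCT	0x1u	/* hypothetical flag bit */

static bool
defunct_once(_Atomic uint32_t *flags)
{
	/* true only for the caller that set the bit first */
	uint32_t orig = atomic_fetch_or_explicit(flags, XFLAG_DEFUNCT,
	    memory_order_relaxed);
	return (orig & XFLAG_DEFUNCT) == 0;
}
#endif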