/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
#include
#include
#include
#include

#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

struct ch_event_result {
    uint32_t tx_data;
    uint32_t rx_data;
};

static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);

static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
static inline void csi_selwakeup_delayed(struct ch_selinfo *);
static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
    boolean_t, boolean_t, uint32_t);
static boolean_t csi_tcall_start(struct ch_selinfo *);
static void csi_tcall(thread_call_param_t, thread_call_param_t);
static uint64_t csi_tcall_update_interval(struct ch_selinfo *);
static void ch_redzone_init(void);
static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
    ring_id_t);
static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
    struct ch_ev_thresh *);
static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *, int, int *);
static void ch_disconnect(struct kern_channel *);
static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
    struct sockopt *);
static struct kern_channel *ch_alloc(zalloc_flags_t);
static void ch_free(struct kern_channel *);
static int ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt);

static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
static void filt_chrwdetach(struct knote *, boolean_t);
static void filt_chrdetach(struct knote *);
static void filt_chwdetach(struct knote *);
static int filt_chrw(struct knote *, long, int);
static int filt_chread(struct knote *, long);
static int filt_chwrite(struct knote *, long);
static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
static void filt_che_detach(struct knote *);
static int filt_che_event(struct knote *, long);
static int filt_che_touch(struct knote *, struct kevent_qos_s *);
static int filt_che_process(struct knote *, struct kevent_qos_s *);
static int filt_chan_extended_common(struct knote *, long);

static int ch_event(struct kern_channel *ch, int events, void *wql,
    struct proc *p, struct ch_event_result *, const boolean_t is_kevent,
    int *errno, const boolean_t);

const struct filterops skywalk_channel_rfiltops = {
    .f_isfd = 1,
    .f_attach = filt_chrwattach,
    .f_detach = filt_chrdetach,
    .f_event = filt_chread,
    .f_touch = filt_chrtouch,
    .f_process = filt_chrprocess,
};

const struct filterops skywalk_channel_wfiltops = {
    .f_isfd = 1,
    .f_attach = filt_chrwattach,
    .f_detach = filt_chwdetach,
    .f_event = filt_chwrite,
    .f_touch = filt_chwtouch,
    .f_process = filt_chwprocess,
};

const struct filterops skywalk_channel_efiltops = {
    .f_isfd = 1,
    .f_attach = filt_che_attach,
    .f_detach = filt_che_detach,
    .f_event = filt_che_event,
    .f_touch = filt_che_touch,
.f_process = filt_che_process, }; /* mitigation intervals in ns */ #define CH_MIT_IVAL_MIN NSEC_PER_USEC static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT; #if (DEVELOPMENT || DEBUG) SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters"); SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival, CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, ""); #endif /* !DEVELOPMENT && !DEBUG */ static SKMEM_TYPE_DEFINE(ch_zone, struct kern_channel); static SKMEM_TYPE_DEFINE(ch_info_zone, struct ch_info); static int __ch_inited = 0; /* * Global cookies to hold the random numbers used for verifying * user metadata red zone violations. */ uint64_t __ch_umd_redzone_cookie = 0; #define SKMEM_TAG_CH_KEY "com.apple.skywalk.channel.key" SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY); static void ch_redzone_init(void) { _CASSERT(sizeof(__ch_umd_redzone_cookie) == sizeof(((struct __metadata_preamble *)0)->mdp_redzone)); _CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble)); _CASSERT(sizeof(struct __slot_desc) == 8); /* Initialize random user red zone cookie values */ do { read_random(&__ch_umd_redzone_cookie, sizeof(__ch_umd_redzone_cookie)); } while (__ch_umd_redzone_cookie == 0); SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie); } int channel_init(void) { int error = 0; SK_LOCK_ASSERT_HELD(); ASSERT(!__ch_inited); _CASSERT(offsetof(struct __user_packet, pkt_qum) == 0); _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0); ch_redzone_init(); __ch_inited = 1; return error; } void channel_fini(void) { SK_LOCK_ASSERT_HELD(); if (__ch_inited) { __ch_umd_redzone_cookie = 0; __ch_inited = 0; } } void csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival) { csi->csi_flags = 0; csi->csi_pending = 0; if (mitigation) { csi->csi_interval = mit_ival; csi->csi_eff_interval = ch_mit_ival; /* global override */ os_atomic_or(&csi->csi_flags, CSI_MITIGATION, relaxed); csi->csi_tcall = thread_call_allocate_with_options(csi_tcall, csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); /* this must not fail */ VERIFY(csi->csi_tcall != NULL); } else { csi->csi_interval = 0; csi->csi_eff_interval = 0; csi->csi_tcall = NULL; } lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group, &channel_lock_attr); klist_init(&csi->csi_si.si_note); } void csi_destroy(struct ch_selinfo *csi) { /* check if not already destroyed, else do it now */ if ((os_atomic_or_orig(&csi->csi_flags, CSI_DESTROYED, relaxed) & CSI_DESTROYED) == 0) { CSI_LOCK(csi); /* must have been set by above atomic op */ VERIFY(csi->csi_flags & CSI_DESTROYED); if (csi->csi_flags & CSI_MITIGATION) { thread_call_t tcall = csi->csi_tcall; VERIFY(tcall != NULL); CSI_UNLOCK(csi); (void) thread_call_cancel_wait(tcall); if (!thread_call_free(tcall)) { boolean_t freed; (void) thread_call_cancel_wait(tcall); freed = thread_call_free(tcall); VERIFY(freed); } CSI_LOCK(csi); csi->csi_tcall = NULL; os_atomic_andnot(&csi->csi_flags, CSI_MITIGATION, relaxed); } csi->csi_pending = 0; CSI_UNLOCK(csi); selthreadclear(&csi->csi_si); /* now we don't need the mutex anymore */ lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group); } } /* * Called only for select(2). 
*/ __attribute__((always_inline)) static inline void csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql) { struct selinfo *si = &csi->csi_si; CSI_LOCK_ASSERT_HELD(csi); selrecord(p, si, wql); } void csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql) { struct ch_selinfo *csi = &kring->ckr_si; CSI_LOCK(csi); SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) " "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags); csi_selrecord(csi, p, wql); CSI_UNLOCK(csi); } void csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p, void *wql) { struct ch_selinfo *csi = &na->na_si[t]; CSI_LOCK(csi); SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x", (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na), SK_KVA(&csi->csi_si), csi->csi_si.si_flags); csi_selrecord(csi, p, wql); CSI_UNLOCK(csi); } /* * Called from na_post_event(). */ __attribute__((always_inline)) static inline void csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent, boolean_t selwake, uint32_t hint) { struct selinfo *si = &csi->csi_si; CSI_LOCK_ASSERT_HELD(csi); csi->csi_pending = 0; if (selwake) { selwakeup(si); } if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) { KNOTE(&si->si_note, hint); } } __attribute__((always_inline)) static inline void csi_selwakeup_delayed(struct ch_selinfo *csi) { CSI_LOCK_ASSERT_HELD(csi); ASSERT(csi->csi_flags & CSI_MITIGATION); ASSERT(csi->csi_tcall != NULL); if (thread_call_isactive(csi->csi_tcall)) { csi->csi_pending++; } else if (!csi_tcall_start(csi)) { csi_selwakeup(csi, FALSE, FALSE, 0); } } __attribute__((always_inline)) static inline void csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay, boolean_t within_kevent, boolean_t selwake, uint32_t hint) { CSI_LOCK_ASSERT_HELD(csi); if (nodelay || within_kevent || !selwake || hint != 0 || !(csi->csi_flags & CSI_MITIGATION)) { csi_selwakeup(csi, within_kevent, selwake, hint); } else { csi_selwakeup_delayed(csi); } } void csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay, boolean_t within_kevent, boolean_t selwake, uint32_t hint) { struct ch_selinfo *csi = &kring->ckr_si; CSI_LOCK(csi); SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) " "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b", (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay, within_kevent, selwake, hint, CHAN_FILT_HINT_BITS); csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint); CSI_UNLOCK(csi); } void csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay, boolean_t within_kevent, boolean_t selwake, uint32_t hint) { struct ch_selinfo *csi = &na->na_si[t]; CSI_LOCK(csi); SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx " "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b", (t == NR_TX) ? 
"W" : "R", na->na_name, SK_KVA(na), SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay, within_kevent, selwake, hint, CHAN_FILT_HINT_BITS); switch (t) { case NR_RX: if (!(na->na_flags & NAF_RX_MITIGATION)) { nodelay = TRUE; } break; case NR_TX: if (!(na->na_flags & NAF_TX_MITIGATION)) { nodelay = TRUE; } break; default: nodelay = TRUE; break; } csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint); CSI_UNLOCK(csi); } static boolean_t csi_tcall_start(struct ch_selinfo *csi) { uint64_t now, ival, deadline; CSI_LOCK_ASSERT_HELD(csi); ASSERT(csi->csi_flags & CSI_MITIGATION); ASSERT(csi->csi_tcall != NULL); /* pick up latest value */ ival = csi_tcall_update_interval(csi); /* if no mitigation, pass notification up now */ if (__improbable(ival == 0)) { return FALSE; } deadline = now = mach_absolute_time(); clock_deadline_for_periodic_event(ival, now, &deadline); (void) thread_call_enter_delayed(csi->csi_tcall, deadline); return TRUE; } static void csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1) { #pragma unused(arg1) struct ch_selinfo *csi = arg0; CSI_LOCK(csi); csi_selwakeup(csi, FALSE, FALSE, 0); CSI_UNLOCK(csi); CSI_LOCK(csi); if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 && csi->csi_pending != 0 && !csi_tcall_start(csi))) { csi_selwakeup(csi, FALSE, FALSE, 0); } CSI_UNLOCK(csi); } __attribute__((always_inline)) static inline uint64_t csi_tcall_update_interval(struct ch_selinfo *csi) { uint64_t i = ch_mit_ival; /* if global override was adjusted, update local copies */ if (__improbable(csi->csi_eff_interval != i)) { ASSERT(csi->csi_flags & CSI_MITIGATION); csi->csi_interval = csi->csi_eff_interval = ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN)); } return csi->csi_interval; } /* return EV_EOF if the channel is defunct */ static inline boolean_t ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn) { if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) { if (kn) { kn->kn_flags |= EV_EOF; } return TRUE; } return FALSE; } static void filt_chrwdetach(struct knote *kn, boolean_t write) { struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn); struct ch_selinfo *csi; struct selinfo *si; lck_mtx_lock(&ch->ch_lock); csi = ch->ch_si[write ? NR_TX : NR_RX]; si = &csi->csi_si; CSI_LOCK(csi); SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) " "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "", write ? "write" : "read", si->si_flags); if (KNOTE_DETACH(&si->si_note, kn)) { os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed); } CSI_UNLOCK(csi); lck_mtx_unlock(&ch->ch_lock); } static void filt_chrdetach(struct knote *kn) { ASSERT(kn->kn_filter == EVFILT_READ); filt_chrwdetach(kn, FALSE); } static void filt_chwdetach(struct knote *kn) { ASSERT(kn->kn_filter == EVFILT_WRITE); filt_chrwdetach(kn, TRUE); } /* * callback from notifies (generated externally). * This always marks the knote activated, so always * return 1. */ static int filt_chrw(struct knote *kn, long hint, int events) { #if SK_LOG struct kern_channel *ch = knote_kn_hook_get_raw(kn); #else #pragma unused(kn) #pragma unused(hint) #pragma unused(events) #endif SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx " "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "", (events == POLLOUT) ? 
"write" : "read", (uint32_t)hint); /* assume we are ready */ return 1; } static int filt_chread(struct knote *kn, long hint) { ASSERT(kn->kn_filter == EVFILT_READ); /* There is no hint for read/write event */ if (hint != 0) { return 0; } return filt_chrw(kn, hint, POLLIN); } static int filt_chwrite(struct knote *kn, long hint) { ASSERT(kn->kn_filter == EVFILT_WRITE); /* There is no hint for read/write event */ if (hint != 0) { return 0; } return filt_chrw(kn, hint, POLLOUT); } static int filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events) { #pragma unused(kev) struct kern_channel *ch = knote_kn_hook_get_raw(kn); int ev = kn->kn_filter; enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX; int event_error = 0; int revents; /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; lck_mtx_lock(&ch->ch_lock); if (__improbable(ch_filt_check_defunct(ch, kn))) { lck_mtx_unlock(&ch->ch_lock); return 1; } /* if a note-specific low watermark is given, validate it */ if (kn->kn_sfflags & NOTE_LOWAT) { struct ch_ev_thresh note_thresh = { .cet_unit = (dir == NR_TX) ? ch->ch_info->cinfo_tx_lowat.cet_unit : ch->ch_info->cinfo_rx_lowat.cet_unit, .cet_value = (uint32_t)kn->kn_sdata }; if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir, ¬e_thresh) != 0) { SK_ERR("invalid NOTE_LOWAT threshold %u", note_thresh.cet_value); knote_set_error(kn, EINVAL); lck_mtx_unlock(&ch->ch_lock); return 1; } } /* capture new state just so we can return it */ revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE, &event_error, FALSE); lck_mtx_unlock(&ch->ch_lock); if (revents & POLLERR) { ASSERT(event_error != 0); /* * Setting a knote error here will confuse libdispatch, so we * use EV_EOF instead. */ kn->kn_flags |= EV_EOF; return 1; } else { return (events & revents) != 0; } } static int filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev) { ASSERT(kn->kn_filter == EVFILT_READ); if (kev->flags & EV_ENABLE) { KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE), kn->kn_udata, kn->kn_status | (kn->kn_id << 32), kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM( ((struct kern_channel *)knote_kn_hook_get_raw(kn))->ch_na)); } return filt_chtouch(kn, kev, POLLIN); } static int filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev) { ASSERT(kn->kn_filter == EVFILT_WRITE); return filt_chtouch(kn, kev, POLLOUT); } /* * Called from kevent. We call ch_event(POLL[IN|OUT]) and * return 0/1 accordingly. */ static int filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events) { struct kern_channel *ch = knote_kn_hook_get_raw(kn); struct ch_event_result result; uint32_t lowat; int trigger_event = 1; int revents; int event_error; int64_t data; lck_mtx_lock(&ch->ch_lock); if (__improbable(ch_filt_check_defunct(ch, kn))) { knote_fill_kevent(kn, kev, 0); lck_mtx_unlock(&ch->ch_lock); return 1; } revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, &result, TRUE, &event_error, FALSE); if (revents & POLLERR) { ASSERT(event_error != 0); lck_mtx_unlock(&ch->ch_lock); /* * Setting a knote error here will confuse libdispatch, so we * use EV_EOF instead. 
*/ kn->kn_flags |= EV_EOF; knote_fill_kevent_with_sdata(kn, kev); return 1; } trigger_event = (events & revents) != 0; if (events == POLLOUT) { lowat = ch->ch_info->cinfo_tx_lowat.cet_value; if ((kn->kn_sfflags & NOTE_LOWAT) && kn->kn_sdata > lowat) { lowat = (uint32_t)kn->kn_sdata; } data = result.tx_data; if (result.tx_data < lowat) { trigger_event = 0; } } else { lowat = ch->ch_info->cinfo_rx_lowat.cet_value; if ((kn->kn_sfflags & NOTE_LOWAT) && kn->kn_sdata > lowat) { lowat = (uint32_t)kn->kn_sdata; } data = result.rx_data; if (result.rx_data < lowat) { trigger_event = 0; } } if (trigger_event) { knote_fill_kevent(kn, kev, data); } lck_mtx_unlock(&ch->ch_lock); return trigger_event; } static int filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev) { ASSERT(kn->kn_filter == EVFILT_READ); return filt_chprocess(kn, kev, POLLIN); } static int filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev) { ASSERT(kn->kn_filter == EVFILT_WRITE); return filt_chprocess(kn, kev, POLLOUT); } static int filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn); struct nexus_adapter *na; struct ch_selinfo *csi; int ev = kn->kn_filter; enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX; int revents; int events; int event_error = 0; ASSERT((kn->kn_filter == EVFILT_READ) || (kn->kn_filter == EVFILT_WRITE)); /* ch_kqfilter() should have acquired the lock */ LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); na = ch->ch_na; /* if a note-specific low watermark is given, validate it */ if (kn->kn_sfflags & NOTE_LOWAT) { struct ch_ev_thresh note_thresh = { .cet_unit = (dir == NR_TX) ? ch->ch_info->cinfo_tx_lowat.cet_unit : ch->ch_info->cinfo_rx_lowat.cet_unit, .cet_value = (uint32_t)kn->kn_sdata }; if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir, ¬e_thresh) != 0) { SK_ERR("invalid NOTE_LOWAT threshold %u", note_thresh.cet_value); knote_set_error(kn, EINVAL); return 0; } } /* the si is indicated in the channel */ csi = ch->ch_si[dir]; CSI_LOCK(csi); if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) { os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed); } CSI_UNLOCK(csi); SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)", na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "", (ev == EVFILT_WRITE) ? "write" : "read"); /* capture current state */ events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN; if (__improbable(ch_filt_check_defunct(ch, kn))) { revents = events; } else { /* filt_chprocess() will fill in the kn_sdata field */ revents = ch_event(ch, events, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE, &event_error, FALSE); } if (revents & POLLERR) { ASSERT(event_error != 0); kn->kn_flags |= EV_EOF; return 1; } else { return (events & revents) != 0; } } static int filt_chan_extended_common(struct knote *kn, long ev_hint) { /* * This function is not always called with the same set of locks held, * hence it is only allowed to manipulate kn_fflags, with atomics. * * the f_event / f_process functions may run concurrently. 
*/ uint32_t add_fflags = 0; if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) { add_fflags |= NOTE_FLOW_ADV_UPDATE; } if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) { add_fflags |= NOTE_CHANNEL_EVENT; } if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) { add_fflags |= NOTE_IF_ADV_UPD; } if (add_fflags) { /* Reset any events that are not requested on this knote */ add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK); os_atomic_or(&kn->kn_fflags, add_fflags, relaxed); return add_fflags != 0; } return os_atomic_load(&kn->kn_fflags, relaxed) != 0; } static inline void che_process_channel_event(struct kern_channel *ch, struct knote *kn, uint32_t fflags, long *hint) { int revents, event_error = 0; LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); *hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT; if (((ch->ch_flags & CHANF_EVENT_RING) != 0) && ((fflags & NOTE_CHANNEL_EVENT) != 0)) { /* capture new state to return */ revents = ch_event(ch, POLLIN, NULL, knote_get_kq(kn)->kq_p, NULL, TRUE, &event_error, TRUE); if (revents & POLLERR) { ASSERT(event_error != 0); /* * Setting a knote error here will confuse libdispatch, * so we use EV_EOF instead. */ kn->kn_flags |= EV_EOF; } else if ((revents & POLLIN) != 0) { *hint |= CHAN_FILT_HINT_CHANNEL_EVENT; } } /* * if the sync operation on event ring didn't find any events * then indicate that the channel event is not active. */ if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) { /* * Avoid a costly atomic when the bit is already cleared. */ uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed); if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) { os_atomic_andnot(&kn->kn_fflags, CHAN_FILT_HINT_CHANNEL_EVENT, relaxed); } } } static int filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn); struct ch_selinfo *csi; long hint = 0; _CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE); _CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT); _CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD); ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL); /* ch_kqfilter() should have acquired the lock */ LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); csi = ch->ch_si[NR_TX]; CSI_LOCK(csi); if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) { os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed); } CSI_UNLOCK(csi); if (__improbable(ch_filt_check_defunct(ch, kn))) { return 1; } if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) { os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed); } che_process_channel_event(ch, kn, kn->kn_sfflags, &hint); if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) { /* on registration force an event */ hint |= CHAN_FILT_HINT_FLOW_ADV_UPD; } SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), "EVFILT_NW_CHANNEL"); return filt_chan_extended_common(kn, hint); } static void filt_che_detach(struct knote *kn) { struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn); struct ch_selinfo *csi; ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL); lck_mtx_lock(&ch->ch_lock); if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) { os_atomic_andnot(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed); } csi = ch->ch_si[NR_TX]; CSI_LOCK(csi); if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) { os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed); } CSI_UNLOCK(csi); lck_mtx_unlock(&ch->ch_lock); SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)", 
ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), "EVFILT_NW_CHANNEL"); } static int filt_che_event(struct knote *kn, long hint) { struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn); ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL); if (hint == 0) { return 0; } if (__improbable(ch_filt_check_defunct(ch, NULL))) { return 1; } if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) { VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0); } SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint, CHAN_FILT_HINT_BITS); return filt_chan_extended_common(kn, hint); } static int filt_che_touch(struct knote *kn, struct kevent_qos_s *kev) { int ret; long hint = 0; struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn); ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL); /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; lck_mtx_lock(&ch->ch_lock); if (__improbable(ch_filt_check_defunct(ch, kn))) { ret = 1; goto done; } if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) { if (kev->flags & EV_ENABLE) { os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed); } else if (kev->flags & EV_DISABLE) { os_atomic_andnot(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed); } } che_process_channel_event(ch, kn, kn->kn_sfflags, &hint); ret = filt_chan_extended_common(kn, hint); done: lck_mtx_unlock(&ch->ch_lock); return ret; } static int filt_che_process(struct knote *kn, struct kevent_qos_s *kev) { int ret; long hint = 0; struct kern_channel *ch = knote_kn_hook_get_raw(kn); ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL); lck_mtx_lock(&ch->ch_lock); if (__improbable(ch_filt_check_defunct(ch, kn))) { ret = 1; goto done; } che_process_channel_event(ch, kn, kn->kn_sfflags, &hint); ret = filt_chan_extended_common(kn, hint); done: lck_mtx_unlock(&ch->ch_lock); if (ret != 0) { /* * This filter historically behaves like EV_CLEAR, * even when EV_CLEAR wasn't set. 
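 * (kn_fflags and the returned kevent data are reset below once the event
 * has been delivered, so the knote must be re-triggered by a new hint
 * before it fires again.)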
*/ knote_fill_kevent(kn, kev, 0); kn->kn_fflags = 0; } return ret; } int ch_kqfilter(struct kern_channel *ch, struct knote *kn, struct kevent_qos_s *kev) { int result; lck_mtx_lock(&ch->ch_lock); VERIFY(!(ch->ch_flags & CHANF_KERNEL)); if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) || na_reject_channel(ch, ch->ch_na))) { SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name, ch->ch_pid, ch->ch_flags, CHANF_BITS); knote_set_error(kn, ENXIO); lck_mtx_unlock(&ch->ch_lock); return 0; } switch (kn->kn_filter) { case EVFILT_READ: kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R; break; case EVFILT_WRITE: kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W; break; case EVFILT_NW_CHANNEL: kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E; break; default: lck_mtx_unlock(&ch->ch_lock); SK_ERR("%s(%d): bad filter request %d", ch->ch_name, ch->ch_pid, kn->kn_filter); knote_set_error(kn, EINVAL); return 0; } knote_kn_hook_set_raw(kn, ch); /* call the appropriate sub-filter attach with the channel lock held */ result = knote_fops(kn)->f_attach(kn, kev); lck_mtx_unlock(&ch->ch_lock); return result; } boolean_t ch_is_multiplex(struct kern_channel *ch, enum txrx t) { return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1); } int ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p) { int revents; int event_error = 0; lck_mtx_lock(&ch->ch_lock); revents = ch_event(ch, events, wql, p, NULL, FALSE, &event_error, FALSE); lck_mtx_unlock(&ch->ch_lock); ASSERT((revents & POLLERR) == 0 || event_error != 0); return revents; } #if SK_LOG /* Hoisted out of line to reduce kernel stack footprint */ SK_LOG_ATTRIBUTE static void ch_event_log(const char *prefix, const struct kern_channel *ch, struct proc *p, const struct nexus_adapter *na, int events, int revents) { SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) " "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na), SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(current_thread()), events, revents); } #endif /* SK_LOG */ /* * select(2), poll(2) and kevent(2) handlers for channels. * * Can be called for one or more rings. Return true the event mask * corresponding to ready events. If there are no ready events, do * a selrecord on either individual selinfo or on the global one. * Device-dependent parts (locking and sync of tx/rx rings) * are done through callbacks. */ static int ch_event(struct kern_channel *ch, int events, void *wql, struct proc *p, struct ch_event_result *result, const boolean_t is_kevent, int *errno, const boolean_t is_ch_event) { struct nexus_adapter *na; struct __kern_channel_ring *kring; uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0; uint32_t ready_tx_data = 0, ready_rx_data = 0; sk_protect_t protect = NULL; #define want_tx want[NR_TX] #define want_rx want[NR_RX] /* * In order to avoid nested locks, we need to "double check" * txsync and rxsync if we decide to do a selrecord(). * retry_tx (and retry_rx, later) prevent looping forever. 
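 * For kevent callers (is_kevent == TRUE) the selrecord() calls are skipped
 * entirely; those callers are notified through KNOTE() on the ring's
 * selinfo instead.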
*/ boolean_t retry_tx = TRUE, retry_rx = TRUE; int found, error = 0; int s; net_update_uptime(); LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); ASSERT(!(ch->ch_flags & CHANF_KERNEL)); *errno = 0; if (__improbable((ch->ch_flags & CHANF_DEFUNCT) || ch->ch_schema == NULL)) { SK_ERR("%s(%d): channel is defunct or no longer bound", ch->ch_name, ch->ch_pid); revents = POLLERR; *errno = ENXIO; goto done; } /* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */ if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) { os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed); } na = ch->ch_na; if (__improbable(na == NULL || !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) { SK_ERR("%s(%d): channel is non-permissive", ch->ch_name, ch->ch_pid); revents = POLLERR; *errno = ENXIO; goto done; } /* mark thread with sync-in-progress flag */ protect = sk_sync_protect(); /* update our work timestamp */ na->na_work_ts = _net_uptime; /* and make this channel eligible for draining again */ if (na->na_flags & NAF_DRAINING) { os_atomic_andnot(&na->na_flags, NAF_DRAINING, relaxed); } #if SK_LOG if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) { ch_event_log("enter", ch, p, na, events, revents); } #endif if (is_ch_event) { goto process_channel_event; } want_tx = (events & (POLLOUT | POLLWRNORM)); want_rx = (events & (POLLIN | POLLRDNORM)); /* * check_all_{tx|rx} are set if the channel has more than one ring * AND the file descriptor is bound to all of them. If so, we sleep * on the "global" selinfo, otherwise we sleep on individual selinfo * The interrupt routine in the driver wake one or the other (or both) * depending on which clients are active. * * rxsync() is only called if we run out of buffers on a POLLIN. * txsync() is called if we run out of buffers on POLLOUT. */ check_all_tx = ch_is_multiplex(ch, NR_TX); check_all_rx = ch_is_multiplex(ch, NR_RX); /* * If want_tx is still set, we must issue txsync calls * (on all rings, to avoid that the tx rings stall). * XXX should also check head != khead on the tx rings. */ if (want_tx) { ring_id_t first_tx = ch->ch_first[NR_TX]; ring_id_t last_tx = ch->ch_last[NR_TX]; channel_threshold_unit_t tx_unit = ch->ch_info->cinfo_tx_lowat.cet_unit; /* * The first round checks if anyone is ready, if not * do a selrecord and another round to handle races. * want_tx goes to 0 if any space is found, and is * used to skip rings with no pending transmissions. */ flush_tx: for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) { kring = &na->na_tx_rings[i]; if (!want_tx && kring->ckr_ring->ring_head == kring->ckr_khead) { continue; } /* only one thread does txsync */ s = kr_enter(kring, TRUE); ASSERT(s == 0); error = 0; DTRACE_SKYWALK2(pretxprologue, struct kern_channel *, ch, struct __kern_channel_ring *, kring); if (kr_txsync_prologue(ch, kring, p) >= kring->ckr_num_slots) { kr_log_bad_ring(kring); revents |= POLLERR; error = EFAULT; if (*errno == 0) { *errno = EFAULT; } } else { if (kring->ckr_na_sync(kring, p, 0)) { revents |= POLLERR; error = EIO; if (*errno == 0) { *errno = EIO; } } else { kr_txsync_finalize(ch, kring, p); } } DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *, ch, struct __kern_channel_ring *, kring, int, error); /* * If we found new slots, notify potential listeners on * the same ring. Since we just did a txsync, look at * the copies of cur,tail in the kring. 
*/ found = kring->ckr_rhead != kring->ckr_rtail; kr_exit(kring); if (found) { /* notify other listeners */ revents |= want_tx; want_tx = 0; (void) kring->ckr_na_notify(kring, p, (is_kevent ? NA_NOTEF_IN_KEVENT : 0)); } /* * Add this ring's free data to our running * tally for userspace. */ if (result != NULL) { switch (tx_unit) { case CHANNEL_THRESHOLD_UNIT_BYTES: ready_tx_data += kring->ckr_ready_bytes; break; case CHANNEL_THRESHOLD_UNIT_SLOTS: ready_tx_data += kring->ckr_ready_slots; break; } } } if (want_tx && retry_tx && !is_kevent) { if (check_all_tx) { csi_selrecord_all(na, NR_TX, p, wql); } else { csi_selrecord_one(&na->na_tx_rings[first_tx], p, wql); } retry_tx = FALSE; goto flush_tx; } } /* * If want_rx is still set scan receive rings. * Do it on all rings because otherwise we starve. */ if (want_rx) { ring_id_t first_rx = ch->ch_first[NR_RX]; ring_id_t last_rx = ch->ch_last[NR_RX]; channel_threshold_unit_t rx_unit = ch->ch_info->cinfo_rx_lowat.cet_unit; /* two rounds here for race avoidance */ do_retry_rx: for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) { kring = &na->na_rx_rings[i]; /* only one thread does rxsync */ s = kr_enter(kring, TRUE); ASSERT(s == 0); error = 0; DTRACE_SKYWALK2(prerxprologue, struct kern_channel *, ch, struct __kern_channel_ring *, kring); if (kr_rxsync_prologue(ch, kring, p) >= kring->ckr_num_slots) { kr_log_bad_ring(kring); revents |= POLLERR; error = EFAULT; if (*errno == 0) { *errno = EFAULT; } } else { /* now we can use kring->rhead, rtail */ if (kring->ckr_na_sync(kring, p, 0)) { revents |= POLLERR; error = EIO; if (*errno == 0) { *errno = EIO; } } else { kr_rxsync_finalize(ch, kring, p); } } DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *, ch, struct __kern_channel_ring *, kring, int, error); found = kring->ckr_rhead != kring->ckr_rtail; kr_exit(kring); if (found) { revents |= want_rx; retry_rx = FALSE; (void) kring->ckr_na_notify(kring, p, (is_kevent ? NA_NOTEF_IN_KEVENT : 0)); } /* * Add this ring's readable data to our running * tally for userspace. */ if (result != NULL) { switch (rx_unit) { case CHANNEL_THRESHOLD_UNIT_BYTES: ready_rx_data += kring->ckr_ready_bytes; break; case CHANNEL_THRESHOLD_UNIT_SLOTS: ready_rx_data += kring->ckr_ready_slots; break; } } } if (retry_rx && !is_kevent) { if (check_all_rx) { csi_selrecord_all(na, NR_RX, p, wql); } else { csi_selrecord_one(&na->na_rx_rings[first_rx], p, wql); } } if (retry_rx) { retry_rx = FALSE; goto do_retry_rx; } } if (result != NULL) { result->tx_data = ready_tx_data; result->rx_data = ready_rx_data; } goto skip_channel_event; process_channel_event: /* * perform sync operation on the event ring to make the channel * events enqueued in the ring visible to user-space. 
*/ /* select() and poll() not supported for event ring */ ASSERT(is_kevent); VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1); kring = &na->na_event_rings[ch->ch_first[NR_EV]]; /* only one thread does the sync */ s = kr_enter(kring, TRUE); ASSERT(s == 0); if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) { kr_log_bad_ring(kring); revents |= POLLERR; if (*errno == 0) { *errno = EFAULT; } } else { if (kring->ckr_na_sync(kring, p, 0)) { revents |= POLLERR; if (*errno == 0) { *errno = EIO; } } else { kr_event_sync_finalize(ch, kring, p); } } found = (kring->ckr_rhead != kring->ckr_rtail); kr_exit(kring); if (found) { revents |= (events & POLLIN); } skip_channel_event: #if SK_LOG if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) { ch_event_log("exit", ch, p, na, events, revents); } #endif /* SK_LOG */ /* unmark thread with sync-in-progress flag */ sk_sync_unprotect(protect); done: ASSERT(!sk_is_sync_protected()); return revents; #undef want_tx #undef want_rx } static struct kern_channel * ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id) { struct kern_channel *ch; SK_LOCK_ASSERT_HELD(); STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) { struct ch_info *cinfo = ch->ch_info; /* see comments in ch_open() */ if (cinfo->cinfo_nx_port != port) { continue; } else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) { continue; } else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY && ring_id != cinfo->cinfo_ch_ring_id && ring_id != CHANNEL_RING_ID_ANY) { continue; } /* found a match */ break; } if (ch != NULL) { ch_retain_locked(ch); } return ch; } #if SK_LOG /* Hoisted out of line to reduce kernel stack footprint */ SK_LOG_ATTRIBUTE static void ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port) { uuid_string_t uuidstr; SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u", sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p), sk_uuid_unparse(p_uuid, uuidstr), port); } SK_LOG_ATTRIBUTE static void ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring, uint32_t mode, const char *mode_bits, int err) { SK_D("%s(%d) port %u ring %d mode 0x%b err %d", sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring, mode, mode_bits, err); } #endif /* SK_LOG */ struct kern_channel * ch_open(struct ch_init *init, struct proc *p, int fd, int *err) { uint32_t mode = init->ci_ch_mode; nexus_port_t port = init->ci_nx_port; ring_id_t ring = init->ci_ch_ring_id; struct kern_channel *ch = NULL, *ch0 = NULL; struct nxbind *nxb = NULL; struct kern_nexus *nx; struct chreq chr; uuid_t p_uuid; kauth_cred_t cred; cred = kauth_cred_get(); ASSERT(!uuid_is_null(init->ci_nx_uuid)); proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid)); *err = 0; /* make sure we don't allow userland to set kernel-only flags */ mode &= CHMODE_MASK; SK_LOCK(); nx = nx_find(init->ci_nx_uuid, TRUE); if (nx == NULL) { *err = ENOENT; goto done; } /* port (zero-based) must be within the domain's range */ if (port >= NXDOM_MAX(NX_DOM(nx), ports)) { *err = EDOM; goto done; } VERIFY(port != NEXUS_PORT_ANY); if (mode & CHMODE_LOW_LATENCY) { if ((*err = skywalk_priv_check_cred(p, cred, PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) { goto done; } } /* "no copy" is valid only when at least one tx/rx mon flag is set */ if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) { mode &= ~CHMODE_MONITOR_NO_COPY; } if (mode & CHMODE_MONITOR) { if ((*err = skywalk_priv_check_cred(p, cred, PRIV_SKYWALK_OBSERVE_ALL)) != 0) { goto done; } /* Don't allow non-root processes to monitor channels. 
*/ if (kauth_cred_issuser(cred) == 0) { *err = EPERM; goto done; } } /* * Check with the nexus to see if the port is bound; if so, prepare * our nxbind structure that we'll need to pass down to the nexus * for it compare. If the caller provides a key, we take it over * and will free it ourselves (as part of freeing nxbind.) * * If this is a monitor channel, skip this altogether since the check * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above. */ if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) { void *key = (void *)(init->ci_key); #if SK_LOG if (__improbable(sk_verbose != 0)) { ch_open_log1(p_uuid, p, port); } #endif /* SK_LOG */ nxb = nxb_alloc(Z_WAITOK); nxb->nxb_flags |= NXBF_MATCH_UNIQUEID; nxb->nxb_uniqueid = proc_uniqueid(p); nxb->nxb_pid = proc_pid(p); nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID; uuid_copy(nxb->nxb_exec_uuid, p_uuid); if (key != NULL) { nxb->nxb_flags |= NXBF_MATCH_KEY; nxb->nxb_key_len = init->ci_key_len; nxb->nxb_key = key; init->ci_key = USER_ADDR_NULL; /* take over */ } } /* * There can only be one owner of {port,ring_id} tuple. Once * owned, this can be made available among multiple monitors. * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over * all rings. Further attempts to own any or all of the rings * will be declined. * * Multiple monitors are allowed to exist. If a channel has been * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be * monitored. If an owning channel has been bound to an individual * ring, only that ring can be monitored, either by specifying the * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time. * * For example, assuming a 2-rings setup for port 'p': * * owner{p,-1} * will allow: * monitor{p,-1}, monitor{p,0}, monitor{p,1} * will not allow: * owner{p,-1}, owner{p,0}, owner{p,1} * * owner{p,0} * will allow: * owner{p,1}, monitor{p,-1}, monitor{p,0} * will not allow: * owner{p,-1}, owner{p,0}, monitor{p,1} */ if ((ch0 = ch_find(nx, port, ring)) != NULL) { SK_D("found ch0 0x%llx", SK_KVA(ch0)); /* * Unless this is a monitor channel, allow only at * most one owner of the {port,ring_id} tuple. */ if (!(mode & CHMODE_MONITOR)) { #if SK_LOG uuid_string_t uuidstr; char *na_name = (ch0->ch_na != NULL) ? 
ch0->ch_na->na_name : ""; SK_DSC(p, "ch %s flags (0x%x) exists on port %d on " "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port, sk_uuid_unparse(nx->nx_uuid, uuidstr), ch0->ch_name, ch0->ch_pid); #endif /* SK_LOG */ *err = EBUSY; goto done; } } else if (mode & CHMODE_MONITOR) { *err = ENXIO; goto done; } bzero(&chr, sizeof(chr)); chr.cr_tx_lowat = init->ci_tx_lowat; chr.cr_rx_lowat = init->ci_rx_lowat; chr.cr_port = port; chr.cr_mode = mode; chr.cr_ring_id = ring; /* upon success, returns a channel with reference held */ ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err); done: #if SK_LOG if (__improbable(sk_verbose != 0)) { ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err); } #endif /* SK_LOG */ if (ch0 != NULL) { (void) ch_release_locked(ch0); } if (nx != NULL) { (void) nx_release_locked(nx); } if (nxb != NULL) { nxb_free(nxb); } SK_UNLOCK(); return ch; } struct kern_channel * ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref, int *err) { struct kern_channel *ch = NULL; SK_LOCK_ASSERT_HELD(); *err = 0; ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0); ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0); ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0); ASSERT(!uuid_is_null(chr->cr_spec_uuid)); chr->cr_mode |= CHMODE_KERNEL; if (nonxref) { chr->cr_mode |= CHMODE_NO_NXREF; } else { chr->cr_mode &= ~CHMODE_NO_NXREF; } /* upon success, returns a channel with reference held */ ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err); if (ch != NULL) { /* * nonxref channels don't hold any reference to the nexus, * since otherwise we'll never be able to close them when * the last regular channel of the nexus is closed, as part * of the nexus's destructor operation. Release the nonxref * channel reference now, but make sure the nexus has at * least 3 refs: global list, provider list and the nonxref * channel itself, before doing that. */ if (nonxref) { ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF)); ASSERT(nx->nx_refcnt > 3); (void) nx_release_locked(nx); } } #if SK_LOG uuid_string_t uuidstr; SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, (ch != NULL ? ch->ch_na->na_name : ""), (int)chr->cr_port, (int)chr->cr_ring_id, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode, CHMODE_BITS, *err); #endif /* SK_LOG */ return ch; } static void ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special) { #pragma unused(special) #if SK_LOG uuid_string_t uuidstr; const char *na_name = (ch->ch_na != NULL) ? ch->ch_na->na_name : ""; const char *nxdom_name = (ch->ch_nexus != NULL) ? NX_DOM(ch->ch_nexus)->nxdom_name : ""; const char *nxdom_prov_name = (ch->ch_nexus != NULL) ? NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : ""; SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)", SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); SK_D(" UUID: %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id, uuidstr)); SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS); #endif /* SK_LOG */ struct kern_nexus *nx = ch->ch_nexus; if (!locked) { SK_LOCK(); } SK_LOCK_ASSERT_HELD(); /* * If the channel is participating in the interface advisory * notification, remove it from the nexus. * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock * is held in exclusive mode. 
*/ lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock); if ((ch->ch_flags & CHANF_IF_ADV) != 0) { STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel, ch_link_if_adv); os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed); if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) { nx_netif_config_interface_advisory(nx, false); } lck_rw_done(&nx->nx_ch_if_adv_lock); lck_mtx_lock(&ch->ch_lock); (void) ch_release_locked(ch); } else { lck_rw_done(&nx->nx_ch_if_adv_lock); lck_mtx_lock(&ch->ch_lock); } LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); /* * Mark the channel as closing to prevent further setopt requests; * this flag is set once here and never gets cleared. */ ASSERT(!(ch->ch_flags & CHANF_CLOSING)); os_atomic_or(&ch->ch_flags, CHANF_CLOSING, relaxed); if (special) { VERIFY(ch->ch_flags & CHANF_KERNEL); } else { VERIFY(!(ch->ch_flags & CHANF_KERNEL)); } ch->ch_fd = -1; /* may be called as part of failure cleanup, so check */ if (ch->ch_flags & CHANF_ATTACHED) { boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF); /* caller must hold an extra ref */ ASSERT(ch->ch_refcnt > 1); /* disconnect from nexus */ ch_disconnect(ch); /* * If this was the last regular channel and the nexus * has been closed, detach it and finish up the job. * If this was a nonxref channel, there is nothing * left to do; see comments in ch_open_special(). */ if (!nonxref) { STAILQ_REMOVE(&nx->nx_ch_head, ch, kern_channel, ch_link); nx->nx_ch_count--; if (STAILQ_EMPTY(&nx->nx_ch_head) && (nx->nx_flags & NXF_CLOSED)) { ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head)); nx_detach(nx); } (void) nx_release_locked(nx); } else { ASSERT(ch->ch_flags & CHANF_KERNEL); STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch, kern_channel, ch_link); } os_atomic_andnot(&ch->ch_flags, CHANF_ATTACHED, relaxed); ch->ch_nexus = NULL; (void) ch_release_locked(ch); /* for the list */ } lck_mtx_unlock(&ch->ch_lock); if (!locked) { SK_UNLOCK(); } } void ch_close(struct kern_channel *ch, boolean_t locked) { ch_close_common(ch, locked, FALSE); } void ch_close_special(struct kern_channel *ch) { ch_close_common(ch, TRUE, TRUE); } static int ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t, struct ch_ev_thresh *cet) { struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params; uint32_t bmin, bmax, smin, smax; int err = 0; if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES && cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) { err = EINVAL; goto done; } smin = 1; /* minimum 1 slot */ bmin = 1; /* minimum 1 byte */ if (t == NR_TX) { ASSERT(nxp->nxp_tx_slots > 0); smax = (nxp->nxp_tx_slots - 1); } else { ASSERT(nxp->nxp_rx_slots > 0); smax = (nxp->nxp_rx_slots - 1); } bmax = (smax * nxp->nxp_buf_size); switch (cet->cet_unit) { case CHANNEL_THRESHOLD_UNIT_BYTES: if (cet->cet_value < bmin) { cet->cet_value = bmin; } else if (cet->cet_value > bmax) { cet->cet_value = bmax; } break; case CHANNEL_THRESHOLD_UNIT_SLOTS: if (cet->cet_value < smin) { cet->cet_value = smin; } else if (cet->cet_value > smax) { cet->cet_value = smax; } break; } done: return err; } #if SK_LOG /* Hoisted out of line to reduce kernel stack footprint */ SK_LOG_ATTRIBUTE static void ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo, const struct chreq *chr, const struct kern_channel *ch, const struct kern_nexus_domain_provider *nxdom_prov, struct proc *p) { struct __user_channel_schema *ch_schema = ch->ch_schema; uuid_string_t uuidstr; unsigned int n; ring_id_t i, j; ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL)); if (ch_schema != NULL) { SK_D("channel_schema at 0x%llx", 
SK_KVA(ch_schema)); SK_D(" kern_name: \"%s\"", ch_schema->csm_kern_name); SK_D(" kern_uuid: %s", sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr)); SK_D(" flags: 0x%b", ch_schema->csm_flags, CSM_BITS); SK_D(" tx_rings: %u [%u,%u]", ch_schema->csm_tx_rings, cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring); SK_D(" rx_rings: %u [%u,%u]", ch_schema->csm_rx_rings, cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring); j = ch->ch_last[NR_TX]; for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) { SK_D(" tx_ring_%u_off: 0x%llx", i, (uint64_t)ch_schema->csm_ring_ofs[n].ring_off); SK_D(" tx_sd_%u_off: 0x%llx", i, (uint64_t)ch_schema->csm_ring_ofs[n].sd_off); } j = n; for (n = 0, i = ch->ch_first[NR_RX]; i < ch->ch_last[NR_RX]; n++, i++) { SK_D(" rx_ring_%u_off: 0x%llx", i, (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off); SK_D(" rx_sd_%u_off: 0x%llx", i, (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off); } SK_D(" md_type: %u", ch_schema->csm_md_type); SK_D(" md_subtype: %u", ch_schema->csm_md_subtype); SK_D(" stats_ofs: 0x%llx", ch_schema->csm_stats_ofs); SK_D(" stats_type: %u", ch_schema->csm_stats_type); SK_D(" flowadv_ofs: 0x%llx", ch_schema->csm_flowadv_ofs); SK_D(" flowadv_max: %u", ch_schema->csm_flowadv_max); SK_D(" nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs); } SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)", SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name, nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id); SK_D(" ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr)); SK_D(" nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr)); SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS); SK_D(" task: 0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask), sk_proc_name_address(p), sk_proc_pid(p)); SK_D(" txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value, ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ? "bytes" : "slots")); SK_D(" rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value, ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ? 
"bytes" : "slots")); SK_D(" mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref)); SK_D(" mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base); SK_D(" mapsize: 0x%llx (%llu KB)", (uint64_t)cinfo->cinfo_mem_map_size, (uint64_t)cinfo->cinfo_mem_map_size >> 10); SK_D(" memsize: 0x%llx (%llu KB)", (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10); SK_D(" offset: 0x%llx", (uint64_t)cinfo->cinfo_schema_offset); } SK_LOG_ATTRIBUTE static void ch_connect_log2(const struct kern_nexus *nx, int err) { uuid_string_t nx_uuidstr; SK_ERR("Error connecting to nexus UUID %s: %d", sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err); } #endif /* SK_LOG */ static struct kern_channel * ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0, struct nxbind *nxb, struct proc *p, int fd, int *err) { struct kern_nexus_domain_provider *nxdom_prov; struct kern_channel *ch = NULL; struct ch_info *cinfo = NULL; uint32_t ch_mode = chr->cr_mode; boolean_t config = FALSE; struct nxdom *nxdom; boolean_t reserved_port = FALSE; ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc); ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL)); SK_LOCK_ASSERT_HELD(); /* validate thresholds before we proceed any further */ if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 || (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) { goto done; } if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) { *err = ENOTSUP; goto done; } ch = ch_alloc(Z_WAITOK); lck_mtx_lock(&ch->ch_lock); uuid_generate_random(ch->ch_info->cinfo_ch_id); ch->ch_fd = fd; ch->ch_pid = proc_pid(p); (void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s", proc_name_address(p)); nxdom_prov = NX_DOM_PROV(nx); nxdom = NX_DOM(nx); if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) { /* * CHANF_KERNEL implies a channel opened by a kernel * subsystem, and is triggered by the CHMODE_KERNEL * flag which (only ever) set by ch_open_special(). * * CHANF_NONXREF can be optionally set based on the * CHMODE_NO_NXREF request flag. This must only be * set by ch_open_special() as well, hence we verify. */ ASSERT(p == kernproc); ASSERT(ch_mode & CHMODE_KERNEL); os_atomic_or(&ch->ch_flags, CHANF_KERNEL, relaxed); if (ch_mode & CHMODE_NO_NXREF) { os_atomic_or(&ch->ch_flags, CHANF_NONXREF, relaxed); } config = (ch_mode & CHMODE_CONFIG) != 0; if (chr->cr_port == NEXUS_PORT_ANY) { if (nxdom->nxdom_find_port == NULL) { *err = ENOTSUP; goto done; } /* * If ephemeral port request, find one for client; * we ask for the reserved port range if this is * a configuration request (CHMODE_CONFIG). */ if ((*err = nxdom->nxdom_find_port(nx, config, &chr->cr_port)) != 0) { goto done; } } } if (skywalk_check_platform_binary(p)) { os_atomic_or(&ch->ch_flags, CHANF_PLATFORM, relaxed); } ASSERT(chr->cr_port != NEXUS_PORT_ANY); reserved_port = (nxdom->nxdom_port_is_reserved != NULL && (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port)); if (!config && reserved_port) { *err = EDOM; goto done; } SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p), sk_proc_pid(p), reserved_port ? 
"[reserved] " : "", chr->cr_port); if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov, nx, ch, chr, ch0, nxb, p)) != 0) { goto done; } cinfo = ch->ch_info; uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid); /* for easy access to immutables */ bcopy((void *)nx->nx_prov->nxprov_params, (void *)&cinfo->cinfo_nxprov_params, sizeof(struct nxprov_params)); cinfo->cinfo_ch_mode = ch_mode; cinfo->cinfo_ch_ring_id = chr->cr_ring_id; cinfo->cinfo_nx_port = chr->cr_port; cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr; cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize; cinfo->cinfo_schema_offset = chr->cr_memoffset; cinfo->cinfo_num_bufs = PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt; /* * ch_last is really the number of rings, but we need to return * the actual zero-based ring ID to the client. Make sure that * is the case here and adjust last_{tx,rx}_ring accordingly. */ ASSERT((ch->ch_last[NR_TX] > 0) || (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV)); ASSERT((ch->ch_last[NR_RX] > 0) || (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST)); cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX]; cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1; cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX]; cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1; cinfo->cinfo_tx_lowat = chr->cr_tx_lowat; cinfo->cinfo_rx_lowat = chr->cr_rx_lowat; if (ch_mode & CHMODE_NO_NXREF) { ASSERT(ch_mode & CHMODE_KERNEL); STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link); } else { STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link); nx->nx_ch_count++; } os_atomic_or(&ch->ch_flags, CHANF_ATTACHED, relaxed); ch->ch_nexus = nx; nx_retain_locked(nx); /* hold a ref on the nexus */ ch_retain_locked(ch); /* one for being in the list */ ch_retain_locked(ch); /* one for the caller */ /* * Now that we've successfully created the nexus adapter, inform the * nexus provider about the rings and the slots within each ring. * This is a no-op for internal nexus providers. */ if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) { lck_mtx_unlock(&ch->ch_lock); /* gracefully close this fully-formed channel */ if (ch->ch_flags & CHANF_KERNEL) { ch_close_special(ch); } else { ch_close(ch, TRUE); } (void) ch_release_locked(ch); ch = NULL; goto done; } ASSERT(ch->ch_schema == NULL || (ch->ch_schema->csm_flags & CSM_ACTIVE)); #if SK_LOG if (__improbable(sk_verbose != 0)) { ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p); } #endif /* SK_LOG */ done: if (ch != NULL) { lck_mtx_unlock(&ch->ch_lock); } if (*err != 0) { #if SK_LOG if (__improbable(sk_verbose != 0)) { ch_connect_log2(nx, *err); } #endif /* SK_LOG */ if (ch != NULL) { ch_free(ch); ch = NULL; } } return ch; } static void ch_disconnect(struct kern_channel *ch) { struct kern_nexus *nx = ch->ch_nexus; struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx); SK_LOCK_ASSERT_HELD(); LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); /* * Inform the nexus provider that the channel has been quiesced * and disconnected from the nexus port. This is a no-op for * internal nexus providers. */ nxprov_advise_disconnect(nx, ch); /* Finally, let the domain provider tear down the instance */ nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch); } void ch_deactivate(struct kern_channel *ch) { /* * This is a trapdoor flag; once CSM_ACTIVE is cleared, * it will never be set again. Doing this will cause * os_channel_is_defunct() to indicate that the channel * is defunct and is no longer usable (thus should be * immediately closed). 
*/ if (ch->ch_schema != NULL && (ch->ch_schema->csm_flags & CSM_ACTIVE)) { os_atomic_andnot(__DECONST(uint32_t *, &ch->ch_schema->csm_flags), CSM_ACTIVE, relaxed); /* make this globally visible */ os_atomic_thread_fence(seq_cst); } } int ch_set_opt(struct kern_channel *ch, struct sockopt *sopt) { #pragma unused(ch) int err = 0; if (sopt->sopt_dir != SOPT_SET) { sopt->sopt_dir = SOPT_SET; } switch (sopt->sopt_name) { case CHOPT_TX_LOWAT_THRESH: err = ch_set_lowat_thresh(ch, NR_TX, sopt); break; case CHOPT_RX_LOWAT_THRESH: err = ch_set_lowat_thresh(ch, NR_RX, sopt); break; case CHOPT_IF_ADV_CONF: err = ch_configure_interface_advisory_event(ch, sopt); break; default: err = ENOPROTOOPT; break; } return err; } int ch_get_opt(struct kern_channel *ch, struct sockopt *sopt) { #pragma unused(ch) int err = 0; if (sopt->sopt_dir != SOPT_GET) { sopt->sopt_dir = SOPT_GET; } switch (sopt->sopt_name) { case CHOPT_TX_LOWAT_THRESH: err = ch_get_lowat_thresh(ch, NR_TX, sopt); break; case CHOPT_RX_LOWAT_THRESH: err = ch_get_lowat_thresh(ch, NR_RX, sopt); break; default: err = ENOPROTOOPT; break; } return err; } static int ch_configure_interface_advisory_event(struct kern_channel *ch, struct sockopt *sopt) { int err = 0; boolean_t enable = 0; struct kern_nexus *nx = ch->ch_nexus; LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); SK_LOCK_ASSERT_NOTHELD(); if (sopt->sopt_val == USER_ADDR_NULL) { return EINVAL; } if (nx->nx_adv.nxv_adv == NULL) { return ENOTSUP; } err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable)); if (err != 0) { return err; } /* * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to lock * ordering requirement; check if the channel is closing once ch_lock * is reacquired and bail if so. */ lck_mtx_unlock(&ch->ch_lock); SK_LOCK(); lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock); lck_mtx_lock(&ch->ch_lock); if (ch->ch_flags & CHANF_CLOSING) { err = ENXIO; goto done; } /* * if interface advisory reporting is enabled on the channel then * add the channel to the list of channels eligible for interface * advisory update on the nexus. If disabled, remove from the list. 
*/ if (enable) { if ((ch->ch_flags & CHANF_IF_ADV) != 0) { ASSERT(err == 0); goto done; } bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head); os_atomic_or(&ch->ch_flags, CHANF_IF_ADV, relaxed); STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv); if (enable_adv) { nx_netif_config_interface_advisory(nx, true); } ch_retain_locked(ch); /* for being in the IF ADV list */ } else { if ((ch->ch_flags & CHANF_IF_ADV) == 0) { ASSERT(err == 0); goto done; } STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel, ch_link_if_adv); os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed); if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) { nx_netif_config_interface_advisory(nx, false); } (void) ch_release_locked(ch); } done: lck_mtx_unlock(&ch->ch_lock); lck_rw_done(&nx->nx_ch_if_adv_lock); SK_UNLOCK(); lck_mtx_lock(&ch->ch_lock); return err; } static int ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t, struct sockopt *sopt) { struct ch_ev_thresh cet, *ocet; int err = 0; LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); if (sopt->sopt_val == USER_ADDR_NULL) { return EINVAL; } bzero(&cet, sizeof(cet)); err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet)); if (err == 0) { err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet); if (err == 0) { if (t == NR_TX) { ocet = &ch->ch_info->cinfo_tx_lowat; } else { ocet = &ch->ch_info->cinfo_rx_lowat; } /* if there is no change, we're done */ if (ocet->cet_unit == cet.cet_unit && ocet->cet_value == cet.cet_value) { return 0; } *ocet = cet; for_rx_tx(t) { ring_id_t qfirst = ch->ch_first[t]; ring_id_t qlast = ch->ch_last[t]; uint32_t i; for (i = qfirst; i < qlast; i++) { struct __kern_channel_ring *kring = &NAKR(ch->ch_na, t)[i]; (void) kring->ckr_na_notify(kring, sopt->sopt_p, 0); } } (void) sooptcopyout(sopt, &cet, sizeof(cet)); } } return err; } static int ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t, struct sockopt *sopt) { struct ch_ev_thresh cet; LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); if (sopt->sopt_val == USER_ADDR_NULL) { return EINVAL; } if (t == NR_TX) { cet = ch->ch_info->cinfo_tx_lowat; } else { cet = ch->ch_info->cinfo_rx_lowat; } return sooptcopyout(sopt, &cet, sizeof(cet)); } static struct kern_channel * ch_alloc(zalloc_flags_t how) { struct kern_channel *ch; ch = zalloc_flags(ch_zone, how | Z_ZERO); if (ch) { lck_mtx_init(&ch->ch_lock, &channel_lock_group, &channel_lock_attr); ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO); } return ch; } static void ch_free(struct kern_channel *ch) { ASSERT(ch->ch_refcnt == 0); ASSERT(ch->ch_pp == NULL); ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT | CHANF_IF_ADV))); lck_mtx_destroy(&ch->ch_lock, &channel_lock_group); SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch)); ASSERT(ch->ch_info != NULL); zfree(ch_info_zone, ch->ch_info); ch->ch_info = NULL; zfree(ch_zone, ch); } void ch_retain_locked(struct kern_channel *ch) { SK_LOCK_ASSERT_HELD(); ch->ch_refcnt++; VERIFY(ch->ch_refcnt != 0); } void ch_retain(struct kern_channel *ch) { SK_LOCK(); ch_retain_locked(ch); SK_UNLOCK(); } int ch_release_locked(struct kern_channel *ch) { int oldref = ch->ch_refcnt; SK_LOCK_ASSERT_HELD(); VERIFY(ch->ch_refcnt != 0); if (--ch->ch_refcnt == 0) { ch_free(ch); } return oldref == 1; } int ch_release(struct kern_channel *ch) { int lastref; SK_LOCK(); lastref = ch_release_locked(ch); SK_UNLOCK(); return lastref; } void ch_dtor(void *arg) { struct kern_channel *ch = arg; SK_LOCK(); ch_close(ch, TRUE); (void) ch_release_locked(ch); SK_UNLOCK(); }
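
/*
 * Illustrative sketch (kept out of the build): how the read/write filters
 * above are typically exercised from userspace via kqueue(2).  This assumes
 * `ch_fd` is a Skywalk channel file descriptor obtained elsewhere through
 * the private os_channel API; the descriptor and the threshold value are
 * assumptions for illustration.  The NOTE_LOWAT value is interpreted in the
 * channel's configured threshold unit (slots or bytes, cf. cinfo_rx_lowat).
 */
#if 0
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

static void
wait_for_rx(int ch_fd)
{
    int kq = kqueue();
    struct kevent kev, out;

    if (kq == -1) {
        err(1, "kqueue");
    }

    /*
     * EVFILT_READ on a channel fd is served by filt_chread()/
     * filt_chrprocess(); NOTE_LOWAT makes filt_chprocess() require at
     * least `data` units of readable data before the event fires.
     */
    EV_SET(&kev, ch_fd, EVFILT_READ, EV_ADD | EV_ENABLE, NOTE_LOWAT, 8, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
        err(1, "kevent(EV_ADD)");
    }

    /* blocks until at least 8 units are ready on the RX ring(s) */
    if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
        printf("rx ready, data=%lld\n", (long long)out.data);
    }
}
#endif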
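
/*
 * A similar sketch for the extended filter: EVFILT_NW_CHANNEL with
 * NOTE_FLOW_ADV_UPDATE and/or NOTE_CHANNEL_EVENT attaches to the TX selinfo
 * via filt_che_attach() above.  Again, `ch_fd` is an assumed, previously
 * opened channel descriptor, and the filter/flag names come from the
 * private <sys/event.h> definitions referenced by this file.
 */
#if 0
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

static void
wait_for_channel_event(int ch_fd)
{
    int kq = kqueue();
    struct kevent kev, out;

    if (kq == -1) {
        err(1, "kqueue");
    }

    EV_SET(&kev, ch_fd, EVFILT_NW_CHANNEL, EV_ADD | EV_ENABLE,
        NOTE_FLOW_ADV_UPDATE | NOTE_CHANNEL_EVENT, 0, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
        err(1, "kevent(EV_ADD)");
    }

    /*
     * fflags of the returned event carries the subset of requested NOTE_*
     * bits that fired (see filt_chan_extended_common()); the filter behaves
     * like EV_CLEAR, so fflags is reset after each delivery.
     */
    if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
        printf("channel event, fflags=0x%x\n", out.fflags);
    }
}
#endif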