gems-kernel/source/THIRDPARTY/xnu/bsd/skywalk/nexus/nexus.c

/*
* Copyright (c) 2015-2022 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <sys/sdt.h>
static uint32_t disable_nxctl_check = 0;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
#endif
LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);
static STAILQ_HEAD(, nxctl) nxctl_head =
STAILQ_HEAD_INITIALIZER(nxctl_head);
static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
STAILQ_HEAD_INITIALIZER(nxprov_head);
static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
RB_HEAD(kern_nexus_tree, kern_nexus);
RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
static struct kern_nexus_tree nx_head;
static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
static void nxctl_retain_locked(struct nxctl *);
static int nxctl_release_locked(struct nxctl *);
static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
static void nxctl_free(struct nxctl *);
static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
struct kern_nexus_domain_provider *, struct nxprov_reg *,
const struct kern_nexus_provider_init *init, int *);
static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
static void nxprov_retain_locked(struct kern_nexus_provider *);
static int nxprov_release_locked(struct kern_nexus_provider *);
static struct kern_nexus_provider *nxprov_alloc(
struct kern_nexus_domain_provider *, zalloc_flags_t);
static void nxprov_free(struct kern_nexus_provider *);
static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
static struct kern_nexus *nx_alloc(zalloc_flags_t);
static void nx_free(struct kern_nexus *);
static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);
static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);
static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);
static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);
static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);
static int __nx_inited = 0;
#define SKMEM_TAG_NX_KEY "com.apple.skywalk.nexus.key"
SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);
#define SKMEM_TAG_NX_MIB "com.apple.skywalk.nexus.mib"
static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);
#define SKMEM_TAG_NX_PORT "com.apple.skywalk.nexus.port"
SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);
#define SKMEM_TAG_NX_PORT_INFO "com.apple.skywalk.nexus.port.info"
SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);
/*
 * Special nexus controller handles for Skywalk internal use. Unlike all
 * other nexus controller handles, which are created by userland or kernel
 * clients, these never get closed or freed. They are also not part of
 * the global nxctl_head list.
 */
static struct nxctl _kernnxctl;
static struct nxctl _usernxctl;
struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };
int
nexus_init(void)
{
SK_LOCK_ASSERT_HELD();
ASSERT(!__nx_inited);
RB_INIT(&nx_head);
na_init();
/* attach system built-in domains and domain providers */
nxdom_attach_all();
/*
 * Initialize the private kernel and shared user nexus controller handles.
 *
 * The shared kernel controller is used internally for creating nexus
 * providers and nexus instances from within the Skywalk code
 * (e.g. netif_compat).
 *
 * The shared user controller is used by userspace clients (e.g. libnetcore)
 * that need to configure nexus instances they own indirectly (e.g. via
 * NECP), such as flow entries; the nexus then performs permission checks
 * based on other info (e.g. PID, UUID) and bypasses the nxctl check
 * (this nxctl has no credentials).
 */
nxctl_init(&_kernnxctl, kernproc, NULL);
nxctl_retain_locked(&_kernnxctl); /* one for us */
nxctl_init(&_usernxctl, kernproc, NULL);
nxctl_retain_locked(&_usernxctl); /* one for us */
nxctl_traffic_rule_init();
__nx_inited = 1;
return 0;
}
void
nexus_fini(void)
{
SK_LOCK_ASSERT_HELD();
if (__nx_inited) {
nxctl_traffic_rule_fini();
nxctl_release_locked(&_kernnxctl);
nxctl_release_locked(&_usernxctl);
/* tell all domains they're going away */
nxdom_detach_all();
ASSERT(RB_EMPTY(&nx_head));
na_fini();
__nx_inited = 0;
}
}
struct nxctl *
nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
int *err)
{
struct nxctl *nxctl = NULL;
ASSERT(!uuid_is_null(nxctl_uuid));
/* privilege checks would be done when performing nxctl operations */
SK_LOCK();
nxctl = nxctl_alloc(p, fp, Z_WAITOK);
STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
uuid_copy(nxctl->nxctl_uuid, nxctl_uuid);
nxctl_retain_locked(nxctl); /* one for being in the list */
nxctl_retain_locked(nxctl); /* one for the caller */
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
#endif /* SK_LOG */
SK_UNLOCK();
if (*err != 0) {
nxctl_free(nxctl);
nxctl = NULL;
}
return nxctl;
}
void
nxctl_close(struct nxctl *nxctl)
{
struct kern_nexus_provider *nxprov = NULL, *tnxprov;
lck_mtx_lock(&nxctl->nxctl_lock);
SK_LOCK();
ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
nxctl->nxctl_flags, NEXUSCTLF_BITS);
#endif /* SK_LOG */
if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
nxctl->nxctl_fp = NULL;
}
/* may be called as part of failure cleanup, so check */
if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
/* caller must hold an extra ref */
ASSERT(nxctl->nxctl_refcnt > 1);
(void) nxctl_release_locked(nxctl);
STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
}
repeat:
STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
/*
* Close provider only for those which are owned by
* this control instance. Note that if we close the
* provider, we need to repeat this search as the
* list might have been changed by another thread.
* That's possible since SK_UNLOCK() may be called
* as a result of calling nxprov_close().
*/
if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
nxprov->nxprov_ctl == nxctl) {
nxprov_retain_locked(nxprov);
(void) nxprov_close(nxprov, TRUE);
(void) nxprov_release_locked(nxprov);
goto repeat;
}
}
SK_UNLOCK();
lck_mtx_unlock(&nxctl->nxctl_lock);
nxctl_traffic_rule_clean(nxctl);
}
int
nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
{
#pragma unused(nxctl)
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
if (sopt->sopt_dir != SOPT_SET) {
sopt->sopt_dir = SOPT_SET;
}
switch (sopt->sopt_name) {
case NXOPT_NEXUS_BIND:
err = nxctl_nexus_bind(nxctl, sopt);
break;
case NXOPT_NEXUS_UNBIND:
err = nxctl_nexus_unbind(nxctl, sopt);
break;
case NXOPT_NEXUS_CONFIG:
err = nxctl_nexus_config(nxctl, sopt);
break;
default:
err = ENOPROTOOPT;
break;
}
return err;
}
int
nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
{
#pragma unused(nxctl)
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
if (sopt->sopt_dir != SOPT_GET) {
sopt->sopt_dir = SOPT_GET;
}
switch (sopt->sopt_name) {
case NXOPT_NEXUS_PROV_LIST:
err = nxctl_get_nexus_prov_list(nxctl, sopt);
break;
case NXOPT_NEXUS_PROV_ENTRY:
err = nxctl_get_nexus_prov_entry(nxctl, sopt);
break;
case NXOPT_NEXUS_LIST:
err = nxctl_get_nexus_list(nxctl, sopt);
break;
case NXOPT_CHANNEL_LIST:
err = nxctl_get_channel_list(nxctl, sopt);
break;
default:
err = ENOPROTOOPT;
break;
}
return err;
}
/* Upper bound on # of nrl_num_regs that we'd return to user space */
#define MAX_NUM_REG_ENTRIES 256
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
{
user_addr_t tmp_ptr = USER_ADDR_NULL;
struct nxprov_reg_ent *pnre, *nres = NULL;
struct nxprov_list_req nrlr;
struct kern_nexus_provider *nxprov = NULL;
uint32_t nregs = 0, ncregs = 0;
int err = 0, observeall;
size_t nres_sz;
NXCTL_LOCK_ASSERT_HELD(nxctl);
ASSERT(sopt->sopt_p != NULL);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
err = sooptcopyin(sopt, &nrlr, sizeof(nrlr), sizeof(nrlr));
if (err != 0) {
return err;
}
if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
}
/*
 * If the caller specified a buffer, copy out the Nexus provider
 * entries to the caller. We only copy out as many entries as the
 * caller asked for, but we always tell the caller how big the
 * buffer really needs to be.
 */
tmp_ptr = nrlr.nrl_regs;
if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
if (__improbable(nres == NULL)) {
return ENOBUFS;
}
}
observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
PRIV_SKYWALK_OBSERVE_ALL) == 0);
SK_LOCK();
/*
 * Count the number of providers. If buffer space exists and
 * remains, copy out the provider entries.
 */
nregs = nrlr.nrl_num_regs;
pnre = nres;
STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
/*
* Return only entries that are visible to the caller,
* unless it has PRIV_SKYWALK_OBSERVE_ALL.
*/
if (nxprov->nxprov_ctl != nxctl && !observeall) {
continue;
}
if (nres != NULL && nregs > 0) {
uuid_copy(pnre->npre_prov_uuid, nxprov->nxprov_uuid);
bcopy(nxprov->nxprov_params, &pnre->npre_prov_params,
sizeof(struct nxprov_params));
--nregs;
++pnre;
++ncregs;
}
}
SK_UNLOCK();
if (ncregs == 0) {
err = ENOENT;
}
if (nres != NULL) {
if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
if (sopt->sopt_p != kernproc) {
err = copyout(nres, tmp_ptr,
ncregs * sizeof(*nres));
} else {
bcopy(nres, CAST_DOWN(caddr_t, tmp_ptr),
ncregs * sizeof(*nres));
}
}
sk_free_data(nres, nres_sz);
nres = NULL;
}
if (err == 0) {
nrlr.nrl_num_regs = ncregs;
err = sooptcopyout(sopt, &nrlr, sizeof(nrlr));
}
return err;
}
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
{
struct nxprov_reg_ent nre;
struct kern_nexus_provider *nxprov = NULL;
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
ASSERT(sopt->sopt_p != NULL);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
bzero(&nre, sizeof(nre));
err = sooptcopyin(sopt, &nre, sizeof(nre), sizeof(nre));
if (err != 0) {
return err;
}
if (uuid_is_null(nre.npre_prov_uuid)) {
return EINVAL;
}
SK_LOCK();
STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
if (uuid_compare(nxprov->nxprov_uuid,
nre.npre_prov_uuid) == 0) {
/*
* Return only entries that are visible to the caller,
* unless it has PRIV_SKYWALK_OBSERVE_ALL.
*/
if (nxprov->nxprov_ctl != nxctl) {
if (skywalk_priv_check_cred(sopt->sopt_p,
nxctl->nxctl_cred,
PRIV_SKYWALK_OBSERVE_ALL) != 0) {
nxprov = NULL;
break;
}
}
bcopy(nxprov->nxprov_params, &nre.npre_prov_params,
sizeof(struct nxprov_params));
break;
}
}
SK_UNLOCK();
if (nxprov != NULL) {
err = sooptcopyout(sopt, &nre, sizeof(nre));
} else {
err = ENOENT;
}
return err;
}
/* Upper bound on # of nl_num_nx_uuids that we'd return to user space */
#define MAX_NUM_NX_UUIDS 4096
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
{
user_addr_t tmp_ptr = USER_ADDR_NULL;
uint32_t nuuids = 0, ncuuids = 0;
uuid_t *puuid, *uuids = NULL;
size_t uuids_sz;
struct nx_list_req nlr;
struct kern_nexus_provider *nxprov = NULL;
struct kern_nexus *nx = NULL;
int err = 0, observeall;
NXCTL_LOCK_ASSERT_HELD(nxctl);
ASSERT(sopt->sopt_p != NULL);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
err = sooptcopyin(sopt, &nlr, sizeof(nlr), sizeof(nlr));
if (err != 0) {
return err;
}
if (uuid_is_null(nlr.nl_prov_uuid)) {
return EINVAL;
} else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
}
/*
 * If the caller specified a buffer, copy out the Nexus UUIDs to
 * the caller. We only copy out as many UUIDs as the caller asked
 * for, but we always tell the caller how big the buffer really
 * needs to be.
 */
tmp_ptr = nlr.nl_nx_uuids;
if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
if (__improbable(uuids == NULL)) {
return ENOBUFS;
}
}
observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
PRIV_SKYWALK_OBSERVE_ALL) == 0);
SK_LOCK();
STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
/*
* Return only entries that are visible to the caller,
* unless it has PRIV_SKYWALK_OBSERVE_ALL.
*/
if (nxprov->nxprov_ctl != nxctl && !observeall) {
continue;
}
if (uuid_compare(nxprov->nxprov_uuid, nlr.nl_prov_uuid) == 0) {
break;
}
}
if (nxprov != NULL) {
/*
 * Count the number of nexus instances. If buffer space exists
 * and remains, copy out the Nexus UUIDs.
 */
nuuids = nlr.nl_num_nx_uuids;
puuid = uuids;
STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
++ncuuids;
if (uuids != NULL && nuuids > 0) {
uuid_copy(*puuid, nx->nx_uuid);
--nuuids;
++puuid;
}
}
} else {
err = ENOENT;
}
SK_UNLOCK();
if (uuids != NULL) {
if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
uintptr_t cnt_uuid;
/* Note: Pointer arithmetic */
cnt_uuid = (uintptr_t)(puuid - uuids);
if (cnt_uuid > 0) {
if (sopt->sopt_p != kernproc) {
err = copyout(uuids, tmp_ptr,
cnt_uuid * sizeof(uuid_t));
} else {
bcopy(uuids,
CAST_DOWN(caddr_t, tmp_ptr),
cnt_uuid * sizeof(uuid_t));
}
}
}
sk_free_data(uuids, uuids_sz);
uuids = NULL;
}
if (err == 0) {
nlr.nl_num_nx_uuids = ncuuids;
err = sooptcopyout(sopt, &nlr, sizeof(nlr));
}
return err;
}
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
{
boolean_t m_pid, m_exec_uuid, m_key;
struct nx_bind_req nbr;
struct proc *p = PROC_NULL;
struct nxbind *nxb = NULL;
uint64_t p_uniqueid = -1;
pid_t p_pid = -1;
struct kern_nexus *nx = NULL;
#if SK_LOG
uuid_string_t exec_uuidstr;
#endif /* SK_LOG */
uuid_t p_uuid;
void *key = NULL;
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
uuid_clear(p_uuid);
bzero(&nbr, sizeof(nbr));
err = sooptcopyin(sopt, &nbr, sizeof(nbr), sizeof(nbr));
if (err != 0) {
return err;
}
if (uuid_is_null(nbr.nb_nx_uuid)) {
err = EINVAL;
goto done_unlocked;
}
nbr.nb_flags &= NBR_MATCH_MASK;
if (nbr.nb_flags == 0) {
/* must choose one of the match criteria */
err = EINVAL;
goto done_unlocked;
}
m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);
if (m_pid || m_exec_uuid) {
/*
* Validate process ID. A valid PID is needed when we're
* asked to match by PID, or if asked to match by executable
* UUID with a NULL nb_exec_uuid supplied. The latter is
* to support the case when a userland Nexus provider isn't
* able to acquire its client's executable UUID, but is
* able to identify it via PID.
*/
if ((m_pid || uuid_is_null(nbr.nb_exec_uuid)) &&
(p = proc_find(nbr.nb_pid)) == PROC_NULL) {
err = ESRCH;
goto done_unlocked;
}
/* exclude kernel from the match criteria */
if (p == kernproc) {
err = EACCES;
goto done_unlocked;
} else if (p != PROC_NULL) {
proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
p_uniqueid = proc_uniqueid(p);
p_pid = proc_pid(p);
} else {
uuid_copy(p_uuid, nbr.nb_exec_uuid);
}
}
if (m_key) {
if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
nbr.nb_key == USER_ADDR_NULL) {
err = EINVAL;
goto done_unlocked;
}
key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
if (__improbable(key == NULL)) {
err = ENOMEM;
goto done_unlocked;
}
if (sopt->sopt_p != kernproc) {
err = copyin(nbr.nb_key, key, nbr.nb_key_len);
if (err != 0) {
goto done_unlocked;
}
} else {
bcopy((void *)nbr.nb_key, key, nbr.nb_key_len);
}
}
SK_LOCK();
nx = nx_find(nbr.nb_nx_uuid, TRUE);
if (nx == NULL || (disable_nxctl_check == 0 &&
nx->nx_prov->nxprov_ctl != nxctl &&
nxctl != &_kernnxctl)) { /* make exception for kernnxctl */
err = ENOENT;
goto done;
}
/* bind isn't applicable on anonymous nexus provider */
if (NX_ANONYMOUS_PROV(nx)) {
err = ENXIO;
goto done;
}
/* port must be within the domain's range */
if (nbr.nb_port != NEXUS_PORT_ANY &&
nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
err = EDOM;
goto done;
} else if (nbr.nb_port == NEXUS_PORT_ANY) {
/* for now, this is allowed only for kernel clients */
if (sopt->sopt_p != kernproc) {
err = EPERM;
goto done;
}
}
nxb = nxb_alloc(Z_WAITOK);
if (m_pid) {
nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
nxb->nxb_uniqueid = p_uniqueid;
nxb->nxb_pid = p_pid;
}
if (m_exec_uuid) {
nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
ASSERT(!uuid_is_null(p_uuid));
uuid_copy(nxb->nxb_exec_uuid, p_uuid);
}
if (m_key) {
nxb->nxb_flags |= NXBF_MATCH_KEY;
ASSERT(key != NULL);
nxb->nxb_key = key;
key = NULL; /* let nxb_free() free it */
ASSERT(nbr.nb_key_len != 0 &&
nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
nxb->nxb_key_len = nbr.nb_key_len;
}
/*
* Bind the creds to the nexus port. If client doesn't have a port,
* find one, claim it, and associate the creds to it. Upon success,
* the nexus may move the nxbind contents (including the key) to
* its own nxbind instance; in that case, nxb_free() below will not
* be freeing the key within.
*/
err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
if (err != 0) {
goto done;
}
ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
(void) sooptcopyout(sopt, &nbr, sizeof(nbr));
SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
"(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
(nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
nxb->nxb_key_len);
done:
if (nx != NULL) {
(void) nx_release_locked(nx);
nx = NULL;
}
SK_UNLOCK();
done_unlocked:
ASSERT(nx == NULL);
if (nxb != NULL) {
nxb_free(nxb);
nxb = NULL;
}
if (key != NULL) {
sk_free_data(key, nbr.nb_key_len);
key = NULL;
}
if (p != PROC_NULL) {
proc_rele(p);
}
return err;
}
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
{
struct nx_unbind_req nur;
struct kern_nexus *nx = NULL;
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
bzero(&nur, sizeof(nur));
err = sooptcopyin(sopt, &nur, sizeof(nur), sizeof(nur));
if (err != 0) {
return err;
}
if (uuid_is_null(nur.nu_nx_uuid)) {
return EINVAL;
}
SK_LOCK();
nx = nx_find(nur.nu_nx_uuid, TRUE);
if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
nxctl != &_kernnxctl)) { /* make exception for kernnxctl */
err = ENOENT;
goto done;
}
/* unbind isn't applicable on anonymous nexus provider */
if (NX_ANONYMOUS_PROV(nx)) {
err = ENXIO;
goto done;
}
if (nur.nu_port == NEXUS_PORT_ANY) {
err = EINVAL;
goto done;
}
err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);
done:
if (nx != NULL) {
(void) nx_release_locked(nx);
nx = NULL;
}
SK_UNLOCK();
return err;
}
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
{
struct kern_nexus *nx = NULL;
struct nx_cfg_req ncr;
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
bzero(&ncr, sizeof(ncr));
err = sooptcopyin(sopt, &ncr, sizeof(ncr), sizeof(ncr));
if (err != 0) {
return err;
}
if (uuid_is_null(ncr.nc_nx_uuid)) {
return EINVAL;
}
SK_LOCK();
nx = nx_find(ncr.nc_nx_uuid, TRUE);
if (nx == NULL || (disable_nxctl_check == 0 &&
nx->nx_prov->nxprov_ctl != nxctl &&
nxctl != &_kernnxctl && /* allow kernel/shared user nxctl */
nxctl != &_usernxctl)) {
err = ENOENT;
goto done;
}
if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
} else {
err = EPERM;
}
if (err == 0) {
(void) sooptcopyout(sopt, &ncr, sizeof(ncr));
}
done:
if (nx != NULL) {
(void) nx_release_locked(nx);
nx = NULL;
}
SK_UNLOCK();
return err;
}
struct nxbind *
nxb_alloc(zalloc_flags_t how)
{
struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
if (nxb) {
SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
}
return nxb;
}
void
nxb_free(struct nxbind *nxb)
{
SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
(nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);
if (nxb->nxb_key != NULL) {
sk_free_data(nxb->nxb_key, nxb->nxb_key_len);
nxb->nxb_key = NULL;
}
zfree(nxbind_zone, nxb);
}
/*
 * nxb0 is assumed to hold the authoritative binding; compare nxb1 against it.
 */
boolean_t
nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
{
ASSERT(nxb0 != NULL && nxb1 != NULL);
ASSERT(nxb0 != nxb1);
/* we always compare using uniqueid and not pid */
if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
return FALSE;
}
if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
uuid_compare(nxb1->nxb_exec_uuid, nxb0->nxb_exec_uuid) != 0) {
return FALSE;
}
ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
(nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
(nxb0->nxb_key_len != nxb1->nxb_key_len ||
nxb1->nxb_key == NULL || timingsafe_bcmp(nxb1->nxb_key, nxb0->nxb_key,
nxb1->nxb_key_len) != 0)) {
return FALSE;
}
return TRUE;
}
void
nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
{
ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
(snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));
/* in case the destination has a key attached, free it first */
if (dnxb->nxb_key != NULL) {
sk_free_data(dnxb->nxb_key, dnxb->nxb_key_len);
dnxb->nxb_key = NULL;
}
/* move everything from src to dst, and then wipe out src */
bcopy(snxb, dnxb, sizeof(*dnxb));
bzero(snxb, sizeof(*snxb));
}
/* Upper bound on # of cl_num_ch_uuids that we'd return to user space */
#define MAX_NUM_CH_UUIDS 4096
/* Hoisted out of line to reduce kernel stack footprint */
SK_NO_INLINE_ATTRIBUTE
static int
nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
{
user_addr_t tmp_ptr = USER_ADDR_NULL;
uint32_t nuuids = 0, ncuuids = 0;
uuid_t *puuid, *uuids = NULL;
size_t uuids_sz;
struct ch_list_req clr;
struct kern_channel *ch = NULL;
struct kern_nexus *nx = NULL;
struct kern_nexus find;
int err = 0, observeall;
NXCTL_LOCK_ASSERT_HELD(nxctl);
ASSERT(sopt->sopt_p != NULL);
if (sopt->sopt_val == USER_ADDR_NULL) {
return EINVAL;
}
err = sooptcopyin(sopt, &clr, sizeof(clr), sizeof(clr));
if (err != 0) {
return err;
}
if (uuid_is_null(clr.cl_nx_uuid)) {
return EINVAL;
} else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
}
/*
 * If the caller specified a buffer, copy out the Channel UUIDs to
 * the caller. We only copy out as many UUIDs as the caller asked
 * for, but we always tell the caller how big the buffer really
 * needs to be.
 */
tmp_ptr = clr.cl_ch_uuids;
if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
if (uuids == NULL) {
return ENOBUFS;
}
}
observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
PRIV_SKYWALK_OBSERVE_ALL) == 0);
SK_LOCK();
uuid_copy(find.nx_uuid, clr.cl_nx_uuid);
nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
/*
* Return only entries that are visible to the caller,
* unless it has PRIV_SKYWALK_OBSERVE_ALL.
*/
nx = NULL;
}
if (nx != NULL) {
/*
 * Count the number of channels. If buffer space exists
 * and remains, copy out the Channel UUIDs.
 */
nuuids = clr.cl_num_ch_uuids;
puuid = uuids;
STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
++ncuuids;
if (uuids != NULL && nuuids > 0) {
uuid_copy(*puuid, ch->ch_info->cinfo_ch_id);
--nuuids;
++puuid;
}
}
} else {
err = ENOENT;
}
SK_UNLOCK();
if (uuids != NULL) {
if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
uintptr_t cnt_uuid;
/* Note: Pointer arithmetic */
cnt_uuid = (uintptr_t)(puuid - uuids);
ASSERT(cnt_uuid > 0);
if (sopt->sopt_p != kernproc) {
err = copyout(uuids, tmp_ptr,
cnt_uuid * sizeof(uuid_t));
} else {
bcopy(uuids, CAST_DOWN(caddr_t, tmp_ptr),
cnt_uuid * sizeof(uuid_t));
}
}
sk_free_data(uuids, uuids_sz);
uuids = NULL;
}
if (err == 0) {
clr.cl_num_ch_uuids = ncuuids;
err = sooptcopyout(sopt, &clr, sizeof(clr));
}
return err;
}
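/*
 * Common initialization for a nexus controller handle. The shared
 * kernel and user controllers are recognized by address: _kernnxctl is
 * marked NEXUSCTLF_KERNEL, while _usernxctl carries no credentials so
 * that permission checks for its operations rely on other identifying
 * info (e.g. PID, UUID) rather than on the nxctl itself.
 */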
static void
nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
{
uuid_t p_uuid;
bzero(nxctl, sizeof(*nxctl));
proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
lck_mtx_init(&nxctl->nxctl_lock, &nexus_lock_group, &nexus_lock_attr);
uuid_copy(nxctl->nxctl_proc_uuid, p_uuid);
nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
nxctl->nxctl_cred = kauth_cred_proc_ref(p);
nxctl->nxctl_fp = fp;
if (nxctl == &_kernnxctl) {
ASSERT(p == kernproc);
nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
}
if (nxctl == &_usernxctl) {
ASSERT(p == kernproc);
nxctl->nxctl_cred = NULL;
}
if (fp == NULL) {
nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
}
}
static struct nxctl *
nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
{
struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
if (nxctl != NULL) {
nxctl_init(nxctl, p, fp);
}
return nxctl;
}
static void
nxctl_free(struct nxctl *nxctl)
{
ASSERT(nxctl->nxctl_refcnt == 0);
ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
kauth_cred_unref(&nxctl->nxctl_cred);
lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group);
SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
zfree(nxctl_zone, nxctl);
}
}
static void
nxctl_retain_locked(struct nxctl *nxctl)
{
SK_LOCK_ASSERT_HELD();
nxctl->nxctl_refcnt++;
ASSERT(nxctl->nxctl_refcnt != 0);
}
void
nxctl_retain(struct nxctl *nxctl)
{
SK_LOCK();
nxctl_retain_locked(nxctl);
SK_UNLOCK();
}
static int
nxctl_release_locked(struct nxctl *nxctl)
{
int oldref = nxctl->nxctl_refcnt;
SK_LOCK_ASSERT_HELD();
ASSERT(nxctl->nxctl_refcnt != 0);
if (--nxctl->nxctl_refcnt == 0) {
nxctl_free(nxctl);
}
return oldref == 1;
}
int
nxctl_release(struct nxctl *nxctl)
{
int lastref;
SK_LOCK();
lastref = nxctl_release_locked(nxctl);
SK_UNLOCK();
return lastref;
}
void
nxctl_dtor(void *arg)
{
struct nxctl *nxctl = arg;
nxctl_close(nxctl);
SK_LOCK();
(void) nxctl_release_locked(nxctl);
SK_UNLOCK();
}
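/*
 * Notify an external nexus provider that a channel is connecting.
 * The sequence is: pre_connect callback, ring/slot initialization
 * (or default queue setup for netif logical-link providers), then
 * the connected callback.  On failure, any partially-established
 * state is unwound via nxprov_advise_disconnect() before returning.
 */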
int
nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
struct proc *p)
{
struct kern_nexus_provider *nxprov = NX_PROV(nx);
int err = 0;
ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
ASSERT(ch->ch_ctx == NULL);
SK_LOCK_ASSERT_HELD();
LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
/* monitor channels aren't externally visible/usable, so ignore */
if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
(ch->ch_flags & CHANF_EXT_SKIP) ||
(nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
nxprov->nxprov_ext.nxpi_connected == NULL)) {
return 0;
}
ch_retain_locked(ch);
lck_mtx_unlock(&ch->ch_lock);
SK_UNLOCK();
lck_mtx_lock(&ch->ch_lock);
err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
if (err != 0) {
SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
"error %d", SK_KVA(ch), ch->ch_flags,
CHANF_BITS, SK_KVA(nx), err);
ch->ch_ctx = NULL;
goto done;
}
/*
* Upon ring/slot init failure, this is cleared
* by nxprov_advise_disconnect() below.
*/
os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
if (NXPROV_LLINK(nxprov)) {
err = nx_netif_llink_ext_init_default_queues(nx);
} else {
err = nx_init_rings(nx, ch);
}
if (err != 0) {
goto done;
}
ASSERT(err == 0);
ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);
err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
if (err != 0) {
SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
goto done;
}
os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
SK_D("ch 0x%llx flags %b nx 0x%llx connected",
SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));
done:
lck_mtx_unlock(&ch->ch_lock);
SK_LOCK();
lck_mtx_lock(&ch->ch_lock);
if ((err != 0) &&
(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
nxprov_advise_disconnect(nx, ch);
}
/* caller is expected to hold one, in addition to ourselves */
VERIFY(ch->ch_refcnt >= 2);
ch_release_locked(ch);
return err;
}
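/*
 * Undo the work of nxprov_advise_connect(): invoke pre_disconnect if
 * the channel was fully connected, tear down the rings/slots (or the
 * default queues for netif logical-link providers), then invoke the
 * disconnected callback and clear the channel context.
 */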
void
nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
{
struct kern_nexus_provider *nxprov = NX_PROV(nx);
SK_LOCK_ASSERT_HELD();
LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
/* check as we might be called in the error handling path */
if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
ch_retain_locked(ch);
lck_mtx_unlock(&ch->ch_lock);
SK_UNLOCK();
lck_mtx_lock(&ch->ch_lock);
ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
if (ch->ch_flags & CHANF_EXT_CONNECTED) {
nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
}
/*
* Inform the external domain provider that the rings
* and slots for this channel are no longer valid.
*/
if (NXPROV_LLINK(nxprov)) {
nx_netif_llink_ext_fini_default_queues(nx);
} else {
nx_fini_rings(nx, ch);
}
ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));
/* We're done with this channel */
ch->ch_ctx = NULL;
lck_mtx_unlock(&ch->ch_lock);
SK_LOCK();
lck_mtx_lock(&ch->ch_lock);
/* caller is expected to hold one, in addition to ourselves */
VERIFY(ch->ch_refcnt >= 2);
ch_release_locked(ch);
}
ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
ASSERT(ch->ch_ctx == NULL);
}
static struct kern_nexus_provider *
nxprov_create_common(struct nxctl *nxctl,
struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
const struct kern_nexus_provider_init *init, int *err)
{
struct skmem_region_params srp[SKMEM_REGIONS];
struct kern_nexus_provider *nxprov = NULL;
struct nxprov_params nxp;
uint32_t override = 0;
uint32_t pp_region_config_flags;
int i;
_CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
_CASSERT(sizeof(*init) >=
sizeof(struct kern_nexus_netif_provider_init));
SK_LOCK_ASSERT_HELD();
ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);
pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
PP_REGION_CONFIG_BUF_IODIR_BIDIR;
/*
* Special handling for external nexus providers; similar
* logic to what's done in kern_pbufpool_create().
*/
if (init != NULL) {
if (init->nxpi_flags & NXPIF_MONOLITHIC) {
pp_region_config_flags |=
PP_REGION_CONFIG_BUF_MONOLITHIC;
}
if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
pp_region_config_flags |=
PP_REGION_CONFIG_BUF_NOCACHE;
}
}
/*
* For network devices, set the packet metadata memory as persistent
* so that it is wired at segment creation. This allows us to access
* it with preemption disabled, as well as for rdar://problem/46511741.
*/
if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
}
/* process and validate provider parameters */
if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
&nxp, srp, override, pp_region_config_flags)) != 0) {
goto done;
}
nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);
STAILQ_INIT(&nxprov->nxprov_nx_head);
STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
nxprov->nxprov_flags |= NXPROVF_ATTACHED;
nxprov->nxprov_ctl = nxctl;
uuid_generate_random(nxprov->nxprov_uuid);
bcopy(&nxp, nxprov->nxprov_params, sizeof(struct nxprov_params));
if (init != NULL) {
if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
ASSERT(NXPROV_LLINK(nxprov));
bcopy(init, &nxprov->nxprov_netif_ext,
sizeof(nxprov->nxprov_netif_ext));
} else {
ASSERT(!NXPROV_LLINK(nxprov));
ASSERT(init->nxpi_version ==
KERN_NEXUS_PROVIDER_CURRENT_VERSION);
bcopy(init, &nxprov->nxprov_ext, sizeof(*init));
}
nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
}
/* store validated region parameters to the provider */
for (i = 0; i < SKMEM_REGIONS; i++) {
nxprov->nxprov_region_params[i] = srp[i];
}
if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;
if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
}
} else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
NEXUS_TYPE_NET_IF) {
/*
* Treat non-netif built-in nexus providers as those
* meant for inter-process communications, i.e. there
* is no actual networking hardware involved.
*/
nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
}
nxprov_retain_locked(nxprov); /* one for being in the list */
nxprov_retain_locked(nxprov); /* one for the caller */
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
#endif /* SK_LOG */
done:
return nxprov;
}
struct kern_nexus_provider *
nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
int *err)
{
struct nxprov_params *nxp = &reg->nxpreg_params;
struct kern_nexus_domain_provider *nxdom_prov = NULL;
struct kern_nexus_provider *nxprov = NULL;
NXCTL_LOCK_ASSERT_HELD(nxctl);
ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
*err = 0;
switch (nxp->nxp_type) {
case NEXUS_TYPE_USER_PIPE: /* only for userland */
*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
PRIV_SKYWALK_REGISTER_USER_PIPE);
break;
case NEXUS_TYPE_FLOW_SWITCH: /* allowed for userland */
*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
break;
case NEXUS_TYPE_NET_IF: /* allowed for userland */
*err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
PRIV_SKYWALK_REGISTER_NET_IF);
break;
case NEXUS_TYPE_KERNEL_PIPE: /* only for kernel */
case NEXUS_TYPE_MONITOR: /* invalid */
default:
*err = EINVAL;
goto done;
}
if (*err != 0) {
goto done;
}
ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
*err = ENXIO;
goto done;
}
#if CONFIG_NEXUS_NETIF
/* make sure netif_compat is the default here */
ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
strcmp(nxdom_prov->nxdom_prov_name,
NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
#endif /* CONFIG_NEXUS_NETIF */
SK_LOCK();
/* callee holds a reference for our caller upon success */
nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
SK_UNLOCK();
done:
return nxprov;
}
struct kern_nexus_provider *
nxprov_create_kern(struct nxctl *nxctl,
struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
const struct kern_nexus_provider_init *init, int *err)
{
struct nxprov_params *nxp = &reg->nxpreg_params;
struct kern_nexus_provider *nxprov = NULL;
NXCTL_LOCK_ASSERT_HELD(nxctl);
SK_LOCK_ASSERT_HELD();
ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
ASSERT(init == NULL ||
init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);
*err = 0;
switch (nxp->nxp_type) {
case NEXUS_TYPE_NET_IF:
break;
case NEXUS_TYPE_KERNEL_PIPE:
if (init == NULL) {
*err = EINVAL;
goto done;
}
break;
case NEXUS_TYPE_FLOW_SWITCH:
if (init != NULL) {
*err = EINVAL;
goto done;
}
break;
case NEXUS_TYPE_USER_PIPE: /* only for userland */
case NEXUS_TYPE_MONITOR: /* invalid */
default:
*err = EINVAL;
goto done;
}
/* callee holds a reference for our caller upon success */
nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);
done:
return nxprov;
}
int
nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
{
struct kern_nexus_provider *nxprov = NULL;
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
SK_LOCK();
STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
if (nxctl == nxprov->nxprov_ctl &&
uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
nxprov_retain_locked(nxprov);
break;
}
}
if (nxprov == NULL) {
err = ENOENT;
} else {
err = nxprov_close(nxprov, TRUE);
}
if (nxprov != NULL) {
(void) nxprov_release_locked(nxprov);
}
SK_UNLOCK();
return err;
}
int
nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
{
int err = 0;
if (!locked) {
SK_LOCK();
}
SK_LOCK_ASSERT_HELD();
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */
if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
err = EALREADY;
} else {
struct kern_nexus *nx, *tnx;
nxprov->nxprov_ctl = NULL;
STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
nx_prov_link, tnx) {
nx_retain_locked(nx);
(void) nx_close(nx, TRUE);
(void) nx_release_locked(nx);
}
if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
/* no nexus created on this, so detach now */
nxprov_detach(nxprov, TRUE);
} else {
/* detach when last nexus is destroyed */
ASSERT(nxprov->nxprov_refcnt > 1);
nxprov->nxprov_flags |= NXPROVF_CLOSED;
}
}
if (!locked) {
SK_UNLOCK();
}
return err;
}
static void
nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
{
if (!locked) {
SK_LOCK();
}
SK_LOCK_ASSERT_HELD();
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
nxprov->nxprov_flags, NXPROVF_BITS);
#endif /* SK_LOG */
ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;
/* caller must hold an extra ref */
ASSERT(nxprov->nxprov_refcnt > 1);
(void) nxprov_release_locked(nxprov);
if (!locked) {
SK_UNLOCK();
}
}
static struct kern_nexus_provider *
nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
{
struct kern_nexus_provider *nxprov;
struct nxprov_params *nxp;
ASSERT(nxdom_prov != NULL);
nxp = nxprov_params_alloc(how);
if (nxp == NULL) {
SK_ERR("Failed to allocate nxprov_params");
return NULL;
}
nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
if (nxprov == NULL) {
SK_ERR("Failed to allocate nxprov");
nxprov_params_free(nxp);
return NULL;
}
nxprov->nxprov_dom_prov = nxdom_prov;
nxprov->nxprov_params = nxp;
/* hold a reference for nxprov */
nxdom_prov_retain_locked(nxdom_prov);
return nxprov;
}
static void
nxprov_free(struct kern_nexus_provider *nxprov)
{
struct kern_nexus_domain_provider *nxdom_prov =
nxprov->nxprov_dom_prov;
SK_LOCK_ASSERT_HELD();
ASSERT(nxdom_prov != NULL);
(void) nxdom_prov_release_locked(nxdom_prov);
nxprov->nxprov_dom_prov = NULL;
ASSERT(nxprov->nxprov_params != NULL);
nxprov_params_free(nxprov->nxprov_params);
nxprov->nxprov_params = NULL;
ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
zfree(nxprov_zone, nxprov);
}
static void
nxprov_retain_locked(struct kern_nexus_provider *nxprov)
{
SK_LOCK_ASSERT_HELD();
nxprov->nxprov_refcnt++;
ASSERT(nxprov->nxprov_refcnt != 0);
}
void
nxprov_retain(struct kern_nexus_provider *nxprov)
{
SK_LOCK();
nxprov_retain_locked(nxprov);
SK_UNLOCK();
}
static int
nxprov_release_locked(struct kern_nexus_provider *nxprov)
{
int oldref = nxprov->nxprov_refcnt;
SK_LOCK_ASSERT_HELD();
ASSERT(nxprov->nxprov_refcnt != 0);
if (--nxprov->nxprov_refcnt == 0) {
nxprov_free(nxprov);
}
return oldref == 1;
}
int
nxprov_release(struct kern_nexus_provider *nxprov)
{
int lastref;
SK_LOCK();
lastref = nxprov_release_locked(nxprov);
SK_UNLOCK();
return lastref;
}
struct nxprov_params *
nxprov_params_alloc(zalloc_flags_t how)
{
return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
}
void
nxprov_params_free(struct nxprov_params *nxp)
{
SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
zfree(nxprov_params_zone, nxp);
}
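/*
 * Validate a caller-provided packet buffer pool against a nexus
 * provider: the pool must be external and not closed, its metadata
 * type/subtype must match the provider's domain, and its monolithic
 * buffer setting must agree with the provider's NXPIF_MONOLITHIC flag.
 */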
static int
nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
{
struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;
if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
return ENOTSUP;
}
/*
* Require that the nexus domain metadata type and the
* metadata type of the caller-provided pbufpool match.
*/
if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
pp->pp_md_type ||
nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
pp->pp_md_subtype) {
SK_ERR("Mismatch in metadata type/subtype "
"(%u/%u != %u/%u)", pp->pp_md_type,
nxdom_prov->nxdom_prov_dom->nxdom_md_type,
pp->pp_md_subtype,
nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
return EINVAL;
}
/*
* Require that the nexus provider memory configuration
* has the same impedance as the caller-provided one.
* Both need to be lacking or present; if one of them
* is set and the other isn't, then we bail.
*/
if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
!!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
SK_ERR("Memory config mismatch: monolithic mode");
return EINVAL;
}
return 0;
}
struct kern_nexus *
nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
const nexus_type_t dom_type, const void *nx_ctx,
nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
struct kern_pbufpool *rx_pp, int *err)
{
struct kern_nexus_domain_provider *nxdom_prov;
struct kern_nexus_provider *nxprov = NULL;
struct kern_nexus *nx = NULL;
#if SK_LOG
uuid_string_t uuidstr;
#endif /* SK_LOG */
NXCTL_LOCK_ASSERT_HELD(nxctl);
ASSERT(dom_type < NEXUS_TYPE_MAX);
ASSERT(!uuid_is_null(nxprov_uuid));
*err = 0;
SK_LOCK();
STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
if (nxctl == nxprov->nxprov_ctl &&
uuid_compare(nxprov_uuid, nxprov->nxprov_uuid) == 0) {
break;
}
}
if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
SK_ERR("Provider not found or has been closed");
*err = ENOENT;
goto done;
}
nxdom_prov = nxprov->nxprov_dom_prov;
if (dom_type != NEXUS_TYPE_UNDEFINED &&
(nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
nxdom_prov = NULL;
nxprov = NULL;
*err = ENODEV;
goto done;
}
if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
(!tx_pp || !rx_pp)) {
#if SK_LOG
SK_ERR("TX/RX packet pool is required for netif logical link "
"nexus provider UUID: %s",
sk_uuid_unparse(nxprov_uuid, uuidstr));
#endif /* SK_LOG */
nxdom_prov = NULL;
nxprov = NULL;
*err = EINVAL;
goto done;
}
if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, tx_pp)) != 0) ||
(rx_pp != NULL && (*err = nx_check_pp(nxprov, rx_pp)) != 0)) {
goto done;
}
nx = nx_alloc(Z_WAITOK);
STAILQ_INIT(&nx->nx_ch_head);
STAILQ_INIT(&nx->nx_ch_nonxref_head);
lck_rw_init(&nx->nx_ch_if_adv_lock, &nexus_lock_group,
&nexus_lock_attr);
STAILQ_INIT(&nx->nx_ch_if_adv_head);
uuid_generate_random(nx->nx_uuid);
nx->nx_prov = nxprov;
nx->nx_ctx = (void *)(uintptr_t)nx_ctx;
nx->nx_ctx_release = nx_ctx_release;
nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;
if (tx_pp != NULL) {
nx->nx_tx_pp = tx_pp;
pp_retain(tx_pp); /* released by nx_free */
}
if (rx_pp != NULL) {
nx->nx_rx_pp = rx_pp;
pp_retain(rx_pp); /* released by nx_free */
}
/* this nexus is alive; tell the nexus constructor to set it up */
if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
*err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
if (*err != 0) {
nx->nx_prov = NULL;
goto done;
}
}
nxprov_retain_locked(nxprov); /* hold a ref on the nexus reg */
STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
nxprov->nxprov_nx_count++;
RB_INSERT(kern_nexus_tree, &nx_head, nx);
os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);
nx_retain_locked(nx); /* one for the provider list */
nx_retain_locked(nx); /* one for the global list */
nx_retain_locked(nx); /* one for the caller */
#if SK_LOG
SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
nxdom_prov->nxdom_prov_dom->nxdom_name,
nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
#endif /* SK_LOG */
done:
SK_UNLOCK();
if (*err != 0) {
if (nx != NULL) {
nx_free(nx);
nx = NULL;
}
}
return nx;
}
int
nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
{
struct kern_nexus *nx = NULL;
struct kern_nexus find;
int err = 0;
NXCTL_LOCK_ASSERT_HELD(nxctl);
SK_LOCK();
uuid_copy(find.nx_uuid, nx_uuid);
nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
nx = NULL;
}
if (nx != NULL) {
nx_retain_locked(nx);
}
if (nx == NULL) {
err = ENOENT;
} else {
err = nx_close(nx, TRUE);
(void) nx_release_locked(nx);
}
SK_UNLOCK();
return err;
}
static inline int
nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
{
return uuid_compare(a->nx_uuid, b->nx_uuid);
}
struct kern_nexus *
nx_find(const uuid_t nx_uuid, boolean_t locked)
{
struct kern_nexus *nx = NULL;
struct kern_nexus find;
if (!locked) {
SK_LOCK();
}
SK_LOCK_ASSERT_HELD();
uuid_copy(find.nx_uuid, nx_uuid);
nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
nx = NULL;
}
/* return reference to caller */
if (nx != NULL) {
nx_retain_locked(nx);
}
if (!locked) {
SK_UNLOCK();
}
return nx;
}
int
nx_close(struct kern_nexus *nx, boolean_t locked)
{
int err = 0;
if (!locked) {
SK_LOCK();
}
SK_LOCK_ASSERT_HELD();
if (nx->nx_flags & NXF_CLOSED) {
err = EALREADY;
} else {
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
NXF_BITS);
#endif /* SK_LOG */
if (STAILQ_EMPTY(&nx->nx_ch_head)) {
/* no regular channels open to it, so detach now */
nx_detach(nx);
} else {
/* detach when the last channel closes */
ASSERT(nx->nx_refcnt > 3);
os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
}
}
if (!locked) {
SK_UNLOCK();
}
return err;
}
void
nx_stop(struct kern_nexus *nx)
{
struct kern_nexus_provider *nxprov = nx->nx_prov;
SK_LOCK_ASSERT_HELD();
/* send a stop message */
if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
}
}
void
nx_detach(struct kern_nexus *nx)
{
struct kern_nexus_provider *nxprov = nx->nx_prov;
SK_LOCK_ASSERT_HELD();
#if SK_LOG
uuid_string_t uuidstr;
SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
#endif /* SK_LOG */
/* Caller must hold extra refs, on top of the two in reg/global lists */
ASSERT(nx->nx_refcnt >= 3);
ASSERT(nx->nx_flags & NXF_ATTACHED);
/* this nexus is done; let the nexus destructor do final cleanups */
if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
}
ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
nxprov->nxprov_nx_count--;
RB_REMOVE(kern_nexus_tree, &nx_head, nx);
os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
nx->nx_prov = NULL;
if (nx->nx_ctx_release != NULL) {
nx->nx_ctx_release(nx->nx_ctx);
}
nx->nx_ctx = NULL;
(void) nx_release_locked(nx); /* one for the reg list */
(void) nx_release_locked(nx); /* one for the global list */
/*
 * If this was the last nexus and the provider has been closed,
 * detach the provider and finish up the postponed job.
 */
if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
(nxprov->nxprov_flags & NXPROVF_CLOSED)) {
nxprov_detach(nxprov, TRUE);
}
(void) nxprov_release_locked(nxprov);
}
int
nx_advisory_alloc(struct kern_nexus *nx, const char *name,
struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
{
struct __kern_nexus_adv_metadata *adv_md;
_CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
_CASSERT((sizeof(struct sk_nexusadv) +
sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
_CASSERT((sizeof(struct netif_nexus_advisory) +
sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
ASSERT(nx->nx_adv.nxv_reg == NULL);
ASSERT(nx->nx_adv.nxv_adv == NULL);
ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
type == NEXUS_ADVISORY_TYPE_NETIF);
if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
NULL, NULL, NULL)) == NULL) {
return ENOMEM;
}
nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, NULL,
NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC));
adv_md = nx->nx_adv.nxv_adv;
adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
adv_md->knam_type = type;
adv_md->__reserved = 0;
nx->nx_adv.nxv_adv_type = type;
nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
} else {
nx->nx_adv.netif_nxv_adv->nna_version =
NX_NETIF_ADVISORY_CURRENT_VERSION;
}
return 0;
}
void
nx_advisory_free(struct kern_nexus *nx)
{
if (nx->nx_adv.nxv_reg != NULL) {
ASSERT(nx->nx_adv.nxv_adv != NULL);
skmem_region_free(nx->nx_adv.nxv_reg,
nx->nx_adv.nxv_adv, NULL);
nx->nx_adv.nxv_adv = NULL;
nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
nx->nx_adv.flowswitch_nxv_adv = NULL;
skmem_region_release(nx->nx_adv.nxv_reg);
nx->nx_adv.nxv_reg = NULL;
}
ASSERT(nx->nx_adv.nxv_reg == NULL);
ASSERT(nx->nx_adv.nxv_adv == NULL);
ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
}
static struct kern_nexus *
nx_alloc(zalloc_flags_t how)
{
SK_LOCK_ASSERT_HELD();
return zalloc_flags(nx_zone, how | Z_ZERO);
}
static void
nx_free(struct kern_nexus *nx)
{
ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
nx_port_free_all(nx);
if (nx->nx_tx_pp != NULL) {
pp_release(nx->nx_tx_pp);
nx->nx_tx_pp = NULL;
}
if (nx->nx_rx_pp != NULL) {
pp_release(nx->nx_rx_pp);
nx->nx_rx_pp = NULL;
}
ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);
SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
zfree(nx_zone, nx);
}
void
nx_retain_locked(struct kern_nexus *nx)
{
SK_LOCK_ASSERT_HELD();
nx->nx_refcnt++;
VERIFY(nx->nx_refcnt > 0);
}
void
nx_retain(struct kern_nexus *nx)
{
SK_LOCK();
nx_retain_locked(nx);
SK_UNLOCK();
}
int
nx_release_locked(struct kern_nexus *nx)
{
int oldref = nx->nx_refcnt;
SK_LOCK_ASSERT_HELD();
VERIFY(nx->nx_refcnt > 0);
if (--nx->nx_refcnt == 0) {
nx_free(nx);
}
return oldref == 1;
}
int
nx_release(struct kern_nexus *nx)
{
int lastref;
SK_LOCK_ASSERT_NOTHELD();
SK_LOCK();
lastref = nx_release_locked(nx);
SK_UNLOCK();
return lastref;
}
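/*
 * Invoke the external provider's ring_init callback on each non-host
 * ring of the channel's adapter, followed by per-slot initialization
 * via nx_init_slots().  Rings that end up with initialized slots pin
 * the kernel slot descriptor region (sd_set_noidle) so that slot_fini
 * can still be invoked at defunct time; on error, everything done so
 * far is undone through nx_fini_rings().
 */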
static int
nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
struct kern_nexus_provider *nxprov = NX_PROV(nx);
struct nexus_adapter *na = ch->ch_na;
boolean_t undo = FALSE;
int ksd_retains = 0;
enum txrx t;
int err = 0;
ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
CHANF_EXT_PRECONNECT);
if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
return 0;
}
for_rx_tx(t) {
uint32_t i;
for (i = 0; i < na_get_nrings(na, t); i++) {
struct __kern_channel_ring *kring = &NAKR(na, t)[i];
/* skip host rings */
if (kring->ckr_flags & CKRF_HOST) {
continue;
}
if ((err = nxprov->nxprov_ext.nxpi_ring_init(
nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
&kring->ckr_ctx)) != 0) {
SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
"(0x%llx) krflags %b ring_init error %d",
SK_KVA(ch), ch->ch_flags, CHANF_BITS,
SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
kring->ckr_flags, CKRF_BITS, err);
kring->ckr_ctx = NULL;
undo = TRUE;
break;
}
kring->ckr_flags |= CKRF_EXT_RING_INITED;
if ((err = nx_init_slots(nx, kring)) != 0) {
undo = TRUE;
break;
}
if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
++ksd_retains;
}
}
if (undo) {
break;
}
}
/*
 * Note: retain the KSD region even in the error case, as we may have
 * set the CKRF_EXT_SLOTS_INITED flag for some of the rings;
 * nx_fini_rings() will take care of the release based on it.
 */
if (ksd_retains != 0) {
/*
* Mark the kernel slot descriptor region as busy; this
* prevents it from being torn-down at channel defunct
* time, as we need to invoke the slot_fini() callback
* for each slot and we need the descriptors until then.
*/
skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
ksd_retains);
}
if (err != 0) {
ASSERT(undo);
nx_fini_rings(nx, ch);
}
return err;
}
static void
nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
{
struct kern_nexus_provider *nxprov = NX_PROV(nx);
struct nexus_adapter *na = ch->ch_na;
int ksd_releases = 0;
enum txrx t;
for_rx_tx(t) {
uint32_t i;
for (i = 0; i < na_get_nrings(na, t); i++) {
struct __kern_channel_ring *kring = &NAKR(na, t)[i];
if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
continue;
}
ASSERT(!(kring->ckr_flags & CKRF_HOST));
ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
kring->ckr_flags &= ~CKRF_EXT_RING_INITED;
if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
++ksd_releases;
}
/*
* Undo the work done in nx_init_slots() and inform
* the external domain provider, if applicable, that
* the slots for this ring are no longer valid.
*/
nx_fini_slots(nx, kring);
kring->ckr_ctx = NULL;
}
}
if (ksd_releases != 0) {
/*
* Now that we've finished invoking the slot_fini()
* callbacks, release the busy retain counts held
* earlier in nx_init_rings(). This will allow the
* kernel slot descriptor region to be torn down.
*/
skmem_arena_nexus_sd_set_noidle(
skmem_arena_nexus(na->na_arena), -ksd_releases);
}
}
static int
nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
struct kern_nexus_provider *nxprov = NX_PROV(nx);
struct __slot_desc *slot = kring->ckr_ksds;
int err = 0;
uint32_t i;
/*
* If the slot init callback was not provided, or if the
* kring was not created to hold any slot contexts, don't
* go any further.
*/
if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
kring->ckr_slot_ctxs == NULL) {
return 0;
}
ASSERT(kring->ckr_slot_ctxs_set == 0);
ASSERT(slot != NULL);
for (i = 0; i < kring->ckr_num_slots; i++) {
struct kern_slot_prop *slot_ctx_prop = NULL;
void *slot_ctx_arg = NULL;
ASSERT(&slot[i] <= kring->ckr_ksds_last);
if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
&slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
"slot_init error %d", SK_KVA(nx), kring->ckr_name,
SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
break;
}
/* we don't want this to be used by the client, so verify here */
ASSERT(slot_ctx_prop == NULL);
kring->ckr_slot_ctxs[i].slot_ctx_arg =
(mach_vm_address_t)slot_ctx_arg;
kring->ckr_slot_ctxs_set++;
}
if (err != 0) {
nx_fini_slots(nx, kring);
} else {
kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
}
return err;
}
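/*
 * Undo nx_init_slots(): invoke the provider's slot_fini() callback, if
 * any, on each slot that was initialized, clear the recorded per-slot
 * context arguments, and mark the ring as no longer holding external
 * slot state.
 */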
static void
nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
{
struct kern_nexus_provider *nxprov = NX_PROV(nx);
struct __slot_desc *slot = kring->ckr_ksds;
uint32_t i;
ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
nxprov->nxprov_ext.nxpi_slot_fini != NULL);
ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));
for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
kring, &slot[i], i);
}
if (kring->ckr_slot_ctxs != NULL) {
kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
}
}
kring->ckr_slot_ctxs_set = 0;
/* We're done with this kring */
kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
}
/* 64-bit mask with range */
#define BMASK64(_beg, _end) \
((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
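/*
 * BMASK64(_beg, _end) yields a 64-bit mask with bits [_beg, _end]
 * (inclusive) set, assuming NX_PORT_CHUNK_FREE is the all-ones 64-bit
 * pattern; e.g. BMASK64(2, 5) == 0x3c.  nx_port_find() uses it to
 * restrict the free-bit scan of a chunk to the requested port range.
 */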
int
nx_port_find(struct kern_nexus *nx, nexus_port_t first,
nexus_port_t last, nexus_port_t *nx_port)
{
int err = 0;
ASSERT(first < last);
*nx_port = NEXUS_PORT_ANY;
if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
/*
* Left edge of the range is beyond the current map;
* let nx_port_alloc() handle the growing later.
*/
*nx_port = first;
} else {
nexus_port_size_t fc = (first / NX_PORT_CHUNK);
nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
nexus_port_size_t i, j;
bitmap_t *bmap;
/*
* The right edge of the range is either within or
* beyond the current map; scan thru the current
* map and find the first available port.
*/
for (i = fc; i <= lc; i++) {
bitmap_t mask;
nexus_port_size_t beg = 0, end = 63;
if (i == fc) {
beg = (first % NX_PORT_CHUNK);
}
if (i == (last / NX_PORT_CHUNK)) {
end = (last % NX_PORT_CHUNK);
}
if (i < lim) {
bmap = &nx->nx_ports_bmap[i];
mask = BMASK64(beg, end);
j = (nexus_port_size_t)ffsll((*bmap) & mask);
if (j == 0) {
continue;
}
--j;
*nx_port = (i * NX_PORT_CHUNK) + j;
}
break;
}
		/*
		 * If the requested range is within the current map and we
		 * couldn't find a port, return an error.  Otherwise, return
		 * the next port index to trigger growing later.
		 */
if (*nx_port == NEXUS_PORT_ANY) {
if (lc == (last / NX_PORT_CHUNK)) {
err = EBUSY;
SK_ERR("port unavail in [%u, %u)", first, last);
} else {
*nx_port = nx->nx_num_ports;
}
}
}
SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
(int)*nx_port, err);
return err;
}
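/*
 * Grow the nexus port table and its free-port bitmap by 'grow' ports
 * (a multiple of NX_PORT_CHUNK), up to the domain's port limit.  New
 * bitmap chunks start out with all bits set, i.e. all new ports free.
 */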
static int
nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
{
ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
struct nx_port_info *ports;
size_t limit;
nexus_port_size_t i, num_ports, old_num_ports;
bitmap_t *bmap;
ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
_CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
ASSERT(powerof2(dom_port_max));
ASSERT(dom_port_max % NX_PORT_CHUNK == 0);
old_num_ports = nx->nx_num_ports;
num_ports = nx->nx_num_ports + grow;
limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
if (num_ports > limit) {
SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
nx->nx_num_ports, grow, num_ports, limit);
return EDOM;
}
if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
(old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
(num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
Z_WAITOK, skmem_tag_nx_port)) == NULL) {
SK_ERR("bmap alloc failed, num_port %u", num_ports);
return ENOMEM;
}
nx->nx_ports_bmap = bmap;
if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
/* can't free bmap here, otherwise nexus won't work */
SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
return ENOMEM;
}
/* initialize the additional new ports */
bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));
nx->nx_ports = ports;
/* initialize new bitmaps (set all bits) */
for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
i < (num_ports / NX_PORT_CHUNK); i++) {
bmap[i] = NX_PORT_CHUNK_FREE;
}
nx->nx_num_ports = num_ports;
SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);
return 0;
}
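/*
 * Claim the given nexus port, growing the port table first if needed.
 * For a named (non-anonymous) nexus, a userland request must go
 * through binding; if the port is already bound, the caller's
 * credentials must match the binding.  If the caller supplies an
 * adapter via *na, it is associated with the port; passing na with
 * *na == NULL instead acts as a lookup, returning a retained reference
 * to the adapter already occupying the port, if any.
 */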
int
nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
struct nexus_adapter **na, struct proc *p)
{
struct nx_port_info *npi = NULL;
struct nxbind *nxb0;
size_t g;
uint32_t i, j;
bitmap_t *bmap;
bool refonly = false;
int err = 0;
ASSERT(nx_port != NEXUS_PORT_ANY);
ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
/* port is zero-based, so adjust here */
if ((nx_port + 1) > nx->nx_num_ports) {
g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
VERIFY(g <= NEXUS_PORT_MAX);
if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
goto done;
}
}
ASSERT(err == 0);
ASSERT(nx_port < nx->nx_num_ports);
npi = &nx->nx_ports[nx_port];
nxb0 = npi->npi_nxb;
i = nx_port / NX_PORT_CHUNK;
j = nx_port % NX_PORT_CHUNK;
bmap = &nx->nx_ports_bmap[i];
if (bit_test(*bmap, j)) {
/* port is not (yet) bound or allocated */
ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
/*
* If the port allocation is requested by userland
* and the nexus is non-anonymous, then fail the
* request.
*/
err = EACCES;
SK_ERR("user proc alloc on named nexus needs binding");
} else if (na != NULL && *na != NULL) {
			/*
			 * Otherwise claim it (clear bit) if the caller
			 * supplied an adapter for this port; else, it
			 * is just an existential check and so there's
			 * no action needed at this point (we'll skip
			 * the init below since no adapter was supplied).
			 */
bit_clear(*bmap, j);
}
} else {
/* if port is bound, check if credentials match */
if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
(nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
SK_ERR("nexus binding mismatch");
err = EACCES;
} else {
			/*
			 * If the port is already occupied by an adapter,
			 * see if the client is requesting a reference
			 * to it; if so, return that adapter.  Otherwise,
			 * if unoccupied and the caller supplied an
			 * adapter, associate it with this nexus port via
			 * the init below.
			 */
if (NPI_NA(npi) != NULL) {
if (na != NULL && *na == NULL) {
*na = NPI_NA(npi);
na_retain_locked(*na);
/* skip the init below */
refonly = true;
} else {
/*
* If the client supplied an adapter
* (regardless of its value) for a
* nexus port that's already occupied,
* then we fail the request.
*/
SK_ERR("nexus adapted exits");
err = EEXIST;
}
}
}
}
done:
/* initialize the nexus port and the adapter occupying it */
if (err == 0 && na != NULL && *na != NULL && !refonly) {
ASSERT(nx_port < nx->nx_num_ports);
ASSERT(npi->npi_nah == 0);
ASSERT(nx->nx_active_ports < nx->nx_num_ports);
ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
(nx_port % NX_PORT_CHUNK)));
nx->nx_active_ports++;
npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
(*na)->na_nx_port = nx_port;
}
SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
err);
return err;
}
void
nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
struct nx_port_info *npi = &nx->nx_ports[nx_port];
npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
NEXUS_PORT_STATE_DEFUNCT);
}
void
nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
{
struct nx_port_info *npi = NULL;
bitmap_t *bmap;
uint32_t i, j;
ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
ASSERT(nx->nx_active_ports != 0);
i = nx_port / NX_PORT_CHUNK;
j = nx_port % NX_PORT_CHUNK;
bmap = &nx->nx_ports_bmap[i];
ASSERT(!bit_test(*bmap, j));
npi = &nx->nx_ports[nx_port];
npi->npi_nah = 0;
if (npi->npi_nxb == NULL) {
/* it's vacant, release it (set bit) */
bit_set(*bmap, j);
}
nx->nx_active_ports--;
//XXX wshen0123@apple.com --- try to shrink bitmap & nx_ports ???
SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
}
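/*
 * Bind the given nexus port to the supplied credentials (nxbind) and
 * optional provider-specific info, claiming the port in the bitmap.
 * The port table is grown first if necessary.  Fails with EEXIST if
 * the port is already bound or occupied by an adapter.
 */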
int
nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
struct nxbind *nxb0, void *info)
{
struct nx_port_info *npi = NULL;
size_t g;
uint32_t i, j;
bitmap_t *bmap;
int err = 0;
ASSERT(nx_port != NEXUS_PORT_ANY);
ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
ASSERT(nxb0 != NULL);
if ((nx_port) + 1 > nx->nx_num_ports) {
g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
VERIFY(g <= NEXUS_PORT_MAX);
if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
goto done;
}
}
ASSERT(err == 0);
npi = &nx->nx_ports[nx_port];
i = nx_port / NX_PORT_CHUNK;
j = nx_port % NX_PORT_CHUNK;
bmap = &nx->nx_ports_bmap[i];
if (bit_test(*bmap, j)) {
/* port is not (yet) bound or allocated */
ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
struct nxbind *nxb = nxb_alloc(Z_WAITOK);
nxb_move(nxb0, nxb);
npi->npi_nxb = nxb;
npi->npi_info = info;
/* claim it (clear bit) */
bit_clear(*bmap, j);
ASSERT(err == 0);
} else {
/* port is already taken */
ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
err = EEXIST;
}
done:
SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
"+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
(int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
return err;
}
int
nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
{
return nx_port_bind_info(nx, nx_port, nxb0, NULL);
}
static int
nx_port_info_size(void *info, size_t *sz)
{
struct nx_port_info_header *hdr = info;
switch (hdr->ih_type) {
case NX_PORT_INFO_TYPE_NETIF:
break;
default:
return EINVAL;
}
*sz = hdr->ih_size;
return 0;
}
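/*
 * Remove the binding on a nexus port: free the nxbind and any attached
 * provider info.  The port itself is released back to the bitmap only
 * if no adapter currently occupies it.  Returns ENOENT if the port has
 * no binding, EDOM if it is out of range.
 */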
int
nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
{
struct nx_port_info *npi = NULL;
struct nxbind *nxb;
uint32_t i, j;
bitmap_t *bmap;
int err = 0;
ASSERT(nx_port != NEXUS_PORT_ANY);
if (nx_port >= nx->nx_num_ports) {
err = EDOM;
goto done;
}
npi = &nx->nx_ports[nx_port];
i = nx_port / NX_PORT_CHUNK;
j = nx_port % NX_PORT_CHUNK;
bmap = &nx->nx_ports_bmap[i];
if ((nxb = npi->npi_nxb) == NULL) {
/* must be either free or allocated */
ASSERT(NPI_NA(npi) == NULL ||
(!bit_test(*bmap, j) && nx->nx_active_ports > 0));
err = ENOENT;
} else {
nxb_free(nxb);
npi->npi_nxb = NULL;
if (npi->npi_info != NULL) {
size_t sz;
VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
sk_free_data(npi->npi_info, sz);
npi->npi_info = NULL;
}
ASSERT(!bit_test(*bmap, j));
if (NPI_NA(npi) == NULL) {
/* it's vacant, release it (set bit) */
bit_set(*bmap, j);
}
}
done:
SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
"--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
(int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
return err;
}
struct nexus_adapter *
nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
{
if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
return NPI_NA(&nx->nx_ports[nx_port]);
} else {
return NULL;
}
}
int
nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
nx_port_info_type_t type, void *info, uint32_t len)
{
struct nx_port_info *npi;
struct nx_port_info_header *hdr;
if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
return ENXIO;
}
npi = &nx->nx_ports[port];
hdr = npi->npi_info;
if (hdr == NULL) {
return ENOENT;
}
if (hdr->ih_type != type) {
return EINVAL;
}
bcopy(npi->npi_info, info, len);
return 0;
}
bool
nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
{
return nx_port < nx->nx_num_ports;
}
bool
nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
{
ASSERT(nx_port_is_valid(nx, nx_port));
return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
}
void
nx_port_free_all(struct kern_nexus *nx)
{
uint32_t num_ports;
/* uncrustify doesn't handle C blocks properly */
/* BEGIN IGNORE CODESTYLE */
nx_port_foreach(nx, ^(nexus_port_t p) {
struct nxbind *nxb;
void *info;
nxb = nx->nx_ports[p].npi_nxb;
info = nx->nx_ports[p].npi_info;
if (nxb != NULL) {
nxb_free(nxb);
nx->nx_ports[p].npi_nxb = NULL;
}
if (info != NULL) {
size_t sz;
VERIFY(nx_port_info_size(info, &sz) == 0);
skn_free_data(info, info, sz);
nx->nx_ports[p].npi_info = NULL;
}
});
/* END IGNORE CODESTYLE */
num_ports = nx->nx_num_ports;
nx->nx_num_ports = 0;
nx->nx_active_ports = 0;
skn_free_data(ports_bmap,
nx->nx_ports_bmap, (num_ports / NX_PORT_CHUNK) * sizeof(bitmap_t));
nx->nx_ports_bmap = NULL;
sk_free_type_array(struct nx_port_info, num_ports, nx->nx_ports);
nx->nx_ports = NULL;
}
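/*
 * Invoke the handler block on every claimed (in-use or bound) port;
 * fully free chunks are skipped without examining individual bits.
 */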
void
nx_port_foreach(struct kern_nexus *nx,
void (^port_handle)(nexus_port_t nx_port))
{
for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
bitmap_t bmap = nx->nx_ports_bmap[i];
if (bmap == NX_PORT_CHUNK_FREE) {
continue;
}
for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
if (bit_test(bmap, j)) {
continue;
}
port_handle((i * NX_PORT_CHUNK) + j);
}
}
}
/*
* sysctl interfaces
*/
static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");
SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");
SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
"A list of logical links");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
"Nexus inet flows with stats collected in kernel");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
"Nexus flow owners");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
"Nexus flow routes");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
"Nexus netif statistics collected in kernel");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
"Nexus flowswitch statistics collected in kernel");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
"Nexus userstack statistics counter");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
"Nexus flow advisory dump");
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
"A list of netif queue stats entries");
/*
* Provider list sysctl
*/
static void
nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
nexus_provider_info_t info)
{
struct kern_nexus *nx;
uuid_t *uuids;
SK_LOCK_ASSERT_HELD();
/* provider UUID + params */
uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
bcopy(nxprov->nxprov_params, &info->npi_prov_params,
sizeof(struct nxprov_params));
info->npi_instance_uuids_count = nxprov->nxprov_nx_count;
/* instance UUID list */
uuids = info->npi_instance_uuids;
STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
uuid_copy(*uuids, nx->nx_uuid);
uuids++;
}
}
static int
nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
size_t actual_space;
caddr_t buffer = NULL;
size_t buffer_space;
size_t allocated_space;
int out_error;
int error = 0;
struct kern_nexus_provider *nxprov;
caddr_t scan;
if (!kauth_cred_issuser(kauth_cred_get())) {
return EPERM;
}
net_update_uptime();
buffer_space = req->oldlen;
if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
buffer_space = SK_SYSCTL_ALLOC_MAX;
}
allocated_space = buffer_space;
buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
if (__improbable(buffer == NULL)) {
return ENOBUFS;
}
} else if (req->oldptr == USER_ADDR_NULL) {
buffer_space = 0;
}
actual_space = 0;
scan = buffer;
SK_LOCK();
STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
size_t info_size;
		info_size = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
if (scan != NULL) {
if (buffer_space < info_size) {
/* supplied buffer too small, stop copying */
error = ENOMEM;
break;
}
nexus_provider_info_populate(nxprov, (void *)scan);
scan += info_size;
buffer_space -= info_size;
}
actual_space += info_size;
}
SK_UNLOCK();
out_error = SYSCTL_OUT(req, buffer, actual_space);
if (out_error != 0) {
error = out_error;
}
if (buffer != NULL) {
sk_free_data(buffer, allocated_space);
}
return error;
}
/*
* Channel list sysctl
*/
static uint32_t
channel_ring_count(struct kern_channel *ch, enum txrx which)
{
return ch->ch_last[which] - ch->ch_first[which];
}
static void
populate_ring_entries(struct __kern_channel_ring *kring,
ring_id_t first, ring_id_t last, nexus_channel_ring_entry_t entries)
{
ring_id_t i;
nexus_channel_ring_entry_t scan;
struct __kern_channel_ring *ring;
scan = entries;
for (i = first; i < last; i++, scan++) {
ring = &kring[i];
DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
ring);
if (kr_stat_enable == 0) {
bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
bzero(&scan->ncre_user_stats,
sizeof(scan->ncre_user_stats));
} else {
scan->ncre_stats = ring->ckr_stats;
scan->ncre_user_stats = ring->ckr_usr_stats;
}
scan->ncre_error_stats = ring->ckr_err_stats;
scan->ncre_ring_id = i;
}
}
/* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
static uint32_t
nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
{
uint32_t flags = 0;
flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
return flags;
}
SK_NO_INLINE_ATTRIBUTE
static void
nexus_channel_entry_populate(struct kern_channel *ch,
nexus_channel_entry_t entry)
{
uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
uint32_t ch_flags = ch->ch_flags;
ring_id_t rx_first = ch->ch_first[NR_RX];
ring_id_t rx_last = ch->ch_last[NR_RX];
ring_id_t tx_last = ch->ch_last[NR_TX];
ring_id_t tx_first = ch->ch_first[NR_TX];
uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
entry->nce_port = ch->ch_info->cinfo_nx_port;
entry->nce_pid = ch->ch_pid;
entry->nce_fd = ch->ch_fd;
entry->nce_tx_rings = tx_last - tx_first;
entry->nce_rx_rings = rx_last - rx_first;
populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
entry->nce_ring_entries);
populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
entry->nce_ring_entries + entry->nce_tx_rings);
}
SK_NO_INLINE_ATTRIBUTE
static size_t
nexus_channel_info_populate(struct kern_nexus *nx,
nexus_channel_info_t info, size_t buffer_size)
{
struct kern_channel *ch = NULL;
size_t info_size;
caddr_t scan = NULL;
SK_LOCK_ASSERT_HELD();
info_size = sizeof(*info);
/* channel list */
if (info != NULL) {
if (buffer_size < info_size) {
return info_size;
}
/* instance UUID */
uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
info->nci_channel_entries_count = nx->nx_ch_count;
scan = (caddr_t)info->nci_channel_entries;
}
STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
size_t entry_size;
uint32_t ring_count;
ring_count = channel_ring_count(ch, NR_TX) +
channel_ring_count(ch, NR_RX);
entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
info_size += entry_size;
if (scan != NULL) {
if (buffer_size < info_size) {
return info_size;
}
nexus_channel_entry_populate(ch, (void *)scan);
scan += entry_size;
}
}
return info_size;
}
static int
nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
size_t actual_space;
caddr_t buffer = NULL;
size_t buffer_space;
size_t allocated_space;
int out_error;
struct kern_nexus *nx;
int error = 0;
caddr_t scan;
if (!kauth_cred_issuser(kauth_cred_get())) {
return EPERM;
}
net_update_uptime();
buffer_space = req->oldlen;
if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
buffer_space = SK_SYSCTL_ALLOC_MAX;
}
allocated_space = buffer_space;
buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
if (__improbable(buffer == NULL)) {
return ENOBUFS;
}
} else if (req->oldptr == USER_ADDR_NULL) {
buffer_space = 0;
}
actual_space = 0;
scan = buffer;
SK_LOCK();
RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
size_t info_size;
info_size = nexus_channel_info_populate(nx, (void *)scan,
buffer_space);
if (scan != NULL) {
if (buffer_space < info_size) {
/* supplied buffer too small, stop copying */
error = ENOMEM;
break;
}
scan += info_size;
buffer_space -= info_size;
}
actual_space += info_size;
}
SK_UNLOCK();
if (actual_space != 0) {
out_error = SYSCTL_OUT(req, buffer, actual_space);
if (out_error != 0) {
error = out_error;
}
}
if (buffer != NULL) {
sk_free_data(buffer, allocated_space);
}
return error;
}
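/*
 * MIB sysctl
 *
 * Shared handler for the NXMIB_* nodes declared above; it walks all
 * nexus instances and dispatches to each domain provider's
 * nxdom_prov_nx_mib_get() callback, passing along the optional
 * nexus_mib_filter supplied by the caller.
 */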
static int
nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
struct proc *p = req->p;
struct nexus_mib_filter filter;
int error = 0;
size_t actual_space;
caddr_t buffer = NULL;
size_t buffer_space;
size_t allocated_space;
int out_error;
struct kern_nexus *nx;
caddr_t scan;
/* Restrict protocol stats access to root user only (like netstat). */
if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
!kauth_cred_issuser(kauth_cred_get())) {
SK_ERR("mib request rejected, EPERM");
return EPERM;
}
if (req->newptr == USER_ADDR_NULL) {
/*
* For flow stats requests, non-root users need to provide a
* 5-tuple. Otherwise, we do not grant access.
*/
if (oidp->oid_arg2 == NXMIB_FLOW &&
!kauth_cred_issuser(kauth_cred_get())) {
SK_ERR("mib request rejected: tuple not provided");
return EPERM;
}
/* use subcommand for multiple nodes */
filter.nmf_type = oidp->oid_arg2;
filter.nmf_bitmap = 0x0;
} else if (req->newlen != sizeof(struct nexus_mib_filter)) {
SK_ERR("mis-matching newlen");
return EINVAL;
} else {
error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
if (error != 0) {
SK_ERR("SYSCTL_IN err %d", error);
return error;
}
if (filter.nmf_type != oidp->oid_arg2) {
SK_ERR("mis-matching nmf_type");
return EINVAL;
}
/*
* For flow stats requests, non-root users need to set the nexus
* mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
* grant access. This ensures that fsw_mib_get_flow looks for a
* flow entry that matches the given tuple of the non-root user.
*/
if (filter.nmf_type == NXMIB_FLOW &&
(filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
!kauth_cred_issuser(kauth_cred_get())) {
SK_ERR("mib request rejected: tuple filter not set");
return EPERM;
}
}
net_update_uptime();
buffer_space = req->oldlen;
if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
buffer_space = SK_SYSCTL_ALLOC_MAX;
}
allocated_space = buffer_space;
buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
if (__improbable(buffer == NULL)) {
return ENOBUFS;
}
} else if (req->oldptr == USER_ADDR_NULL) {
buffer_space = 0;
}
actual_space = 0;
scan = buffer;
SK_LOCK();
RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
continue;
}
size_t size;
struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);
size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
buffer_space, p);
if (scan != NULL) {
if (buffer_space < size) {
/* supplied buffer too small, stop copying */
error = ENOMEM;
break;
}
scan += size;
buffer_space -= size;
}
actual_space += size;
}
SK_UNLOCK();
if (actual_space != 0) {
out_error = SYSCTL_OUT(req, buffer, actual_space);
if (out_error != 0) {
error = out_error;
}
}
if (buffer != NULL) {
sk_free_data(buffer, allocated_space);
}
return error;
}
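/*
 * Walk all nexus instances, invoking the given callback on each one
 * while SK_LOCK is held.  Callers that already hold SK_LOCK pass
 * is_sk_locked == TRUE; otherwise the lock is taken and dropped here.
 */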
void
kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
boolean_t is_sk_locked)
{
struct kern_nexus *nx = NULL;
if (!is_sk_locked) {
SK_LOCK();
} else {
SK_LOCK_ASSERT_HELD();
}
RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
(*f)(nx, arg0);
}
if (!is_sk_locked) {
SK_UNLOCK();
}
}
errno_t
kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
struct kern_pbufpool_memory_info *rx_pool_info,
struct kern_pbufpool_memory_info *tx_pool_info)
{
struct kern_pbufpool *tpp, *rpp;
struct kern_nexus *nx;
errno_t err = 0;
nx = nx_find(nx_uuid, FALSE);
if (nx == NULL) {
err = ENOENT;
goto done;
}
if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
err = ENOTSUP;
goto done;
}
err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
if (err != 0) {
goto done;
}
if ((tpp == NULL) && (rpp == NULL)) {
err = ENOENT;
goto done;
}
if (tx_pool_info != NULL) {
bzero(tx_pool_info, sizeof(*tx_pool_info));
}
if (rx_pool_info != NULL) {
bzero(rx_pool_info, sizeof(*rx_pool_info));
}
if ((tx_pool_info != NULL) && (tpp != NULL)) {
err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
if (err != 0) {
goto done;
}
}
if ((rx_pool_info != NULL) && (rpp != NULL)) {
err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
}
done:
if (nx != NULL) {
(void) nx_release(nx);
nx = NULL;
}
return err;
}
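/*
 * Post a CHAN_FILT_HINT_IF_ADV_UPD event on the first TX ring of every
 * channel that has opted into interface advisory updates for this
 * nexus.  If the advisory lock cannot be acquired without blocking,
 * the update is dropped and accounted for in the netif/flowswitch
 * stats.
 */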
void
nx_interface_advisory_notify(struct kern_nexus *nx)
{
struct kern_channel *ch;
struct netif_stats *nifs;
struct fsw_stats *fsw_stats;
nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;
if (nxdom_type == NEXUS_TYPE_NET_IF) {
nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
} else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
} else {
VERIFY(0);
__builtin_unreachable();
}
if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
if (nxdom_type == NEXUS_TYPE_NET_IF) {
STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
} else {
STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
}
return;
}
	/*
	 * If the channel is on the "nx_ch_if_adv_head" list, then we can
	 * safely assume that it has not been closed yet.  In
	 * ch_close_common(), the channel is removed from that list while
	 * holding "nx_ch_if_adv_lock" in exclusive mode, prior to closing
	 * the channel.
	 */
STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
struct nexus_adapter *na = ch->ch_na;
ASSERT(na != NULL);
na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
if (nxdom_type == NEXUS_TYPE_NET_IF) {
STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
} else {
STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
}
}
lck_rw_done(&nx->nx_ch_if_adv_lock);
}