/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the flow switch for Skywalk.
 *
 * --- FLOW SWITCH ---
 *
 * For each switch, a lock protects deletion of ports. When configuring
 * a new port or deleting an existing one, the lock is acquired in
 * exclusive mode (after holding SK_LOCK). When forwarding, the lock is
 * acquired in shared mode (without SK_LOCK). The lock is held throughout
 * the entire forwarding cycle, during which the thread may incur a page
 * fault. Hence it is important that sleepable shared locks are used.
 *
 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
 * copied from source to destination, and then the lock is acquired again
 * and the receive ring is updated. (A similar thing is done on the tx
 * ring for NIC and host stack ports attached to the switch.)
 *
 * When a netif is attached to a flowswitch, two kernel channels are
 * opened: the device and host channels. The device channel provides the
 * device datapath. The host channel is not used in the datapath; it
 * exists only to provide callbacks for activating the hostna (e.g.
 * intercepting host packets).
 */
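
/*
 * A minimal sketch of the locking discipline described above, using the
 * FSW_RLOCK()/FSW_WLOCK() macros employed throughout this file (the
 * bodies below are illustrative placeholders, not code from this module):
 *
 *	// forwarding path: sleepable shared lock for the whole cycle
 *	FSW_RLOCK(fsw);
 *	// ...reserve slots, copy packets, update the receive ring...
 *	FSW_UNLOCK(fsw);
 *
 *	// control path: exclusive lock, taken after SK_LOCK
 *	SK_LOCK();
 *	FSW_WLOCK(fsw);
 *	// ...configure or delete the port...
 *	FSW_WUNLOCK(fsw);
 *	SK_UNLOCK();
 */
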
#include <net/bpf.h>
#include <netinet/tcp_seq.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/nexus_var.h>
#include <sys/protosw.h>
#include <sys/domain.h>

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");

static void nx_fsw_dom_init(struct nxdom *);
static void nx_fsw_dom_terminate(struct nxdom *);
static void nx_fsw_dom_fini(struct nxdom *);
static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
    const struct nxprov_params *, struct nxprov_adjusted_params *);
static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
    const uint32_t, const struct nxprov_params *, struct nxprov_params *,
    struct skmem_region_params[SKMEM_REGIONS], uint32_t);
static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nexus_adapter *);
static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
    kauth_cred_t);
static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
    struct nexus_mib_filter *, void *, size_t, struct proc *);

struct nxdom nx_flowswitch_dom_s = {
    .nxdom_prov_head =
        STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
    .nxdom_type = NEXUS_TYPE_FLOW_SWITCH,
    .nxdom_md_type = NEXUS_META_TYPE_PACKET,
    .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
    .nxdom_name = "flowswitch",
    .nxdom_ports = {
        .nb_def = NX_FSW_VP_MAX,
        .nb_min = NX_FSW_VP_MIN,
        .nb_max = NX_FSW_VP_MAX,
    },
    .nxdom_tx_rings = {
        .nb_def = 1,
        .nb_min = 1,
        .nb_max = NX_FSW_MAXRINGS,
    },
    .nxdom_rx_rings = {
        .nb_def = 1,
        .nb_min = 1,
        .nb_max = NX_FSW_MAXRINGS,
    },
    .nxdom_tx_slots = {
        .nb_def = NX_FSW_TXRINGSIZE,
        .nb_min = NX_FSW_MINSLOTS,
        .nb_max = NX_FSW_MAXSLOTS,
    },
    .nxdom_rx_slots = {
        .nb_def = NX_FSW_RXRINGSIZE,
        .nb_min = NX_FSW_MINSLOTS,
        .nb_max = NX_FSW_MAXSLOTS,
    },
    .nxdom_buf_size = {
        .nb_def = NX_FSW_BUFSIZE,
        .nb_min = NX_FSW_MINBUFSIZE,
        .nb_max = NX_FSW_MAXBUFSIZE,
    },
    .nxdom_large_buf_size = {
        .nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
        .nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
        .nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
    },
    .nxdom_meta_size = {
        .nb_def = NX_FSW_UMD_SIZE,
        .nb_min = NX_FSW_UMD_SIZE,
        .nb_max = NX_METADATA_USR_MAX_SZ,
    },
    .nxdom_stats_size = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_STATS_MAX_SZ,
    },
    .nxdom_pipes = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_UPIPE_MAXPIPES,
    },
    .nxdom_flowadv_max = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_FLOWADV_MAX,
    },
    .nxdom_nexusadv_size = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_NEXUSADV_MAX_SZ,
    },
    .nxdom_capabilities = {
        .nb_def = NXPCAP_USER_CHANNEL,
        .nb_min = 0,
        .nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
            NXPCAP_USER_CHANNEL),
    },
    .nxdom_qmap = {
        .nb_def = NEXUS_QMAP_TYPE_INVALID,
        .nb_min = NEXUS_QMAP_TYPE_INVALID,
        .nb_max = NEXUS_QMAP_TYPE_INVALID,
    },
    .nxdom_max_frags = {
        .nb_def = NX_PBUF_FRAGS_DEFAULT,
        .nb_min = NX_PBUF_FRAGS_MIN,
        .nb_max = NX_PBUF_FRAGS_MAX,
    },
    .nxdom_init = nx_fsw_dom_init,
    .nxdom_terminate = nx_fsw_dom_terminate,
    .nxdom_fini = nx_fsw_dom_fini,
    .nxdom_connect = nx_fsw_dom_connect,
    .nxdom_find_port = nx_fsw_dom_find_port,
    .nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
    .nxdom_bind_port = nx_fsw_dom_bind_port,
    .nxdom_unbind_port = nx_fsw_dom_unbind_port,
    .nxdom_disconnect = nx_fsw_dom_disconnect,
    .nxdom_defunct = nx_fsw_dom_defunct,
    .nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
};

struct kern_nexus_domain_provider nx_fsw_prov_s = {
    .nxdom_prov_name = NEXUS_PROVIDER_FLOW_SWITCH,
    .nxdom_prov_flags = NXDOMPROVF_DEFAULT,
    .nxdom_prov_cb = {
        .dp_cb_init = nx_fsw_prov_init,
        .dp_cb_fini = nx_fsw_prov_fini,
        .dp_cb_params = nx_fsw_prov_params,
        .dp_cb_mem_new = nx_fsw_prov_mem_new,
        .dp_cb_config = nx_fsw_prov_config,
        .dp_cb_nx_ctor = nx_fsw_prov_nx_ctor,
        .dp_cb_nx_dtor = nx_fsw_prov_nx_dtor,
        .dp_cb_nx_mem_info = NULL, /* not supported */
        .dp_cb_nx_mib_get = nx_fsw_prov_mib_get,
        .dp_cb_nx_stop = NULL,
    },
};

static void
nx_fsw_dom_init(struct nxdom *nxdom)
{
    SK_LOCK_ASSERT_HELD();
    ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

    /* Generic initialization */
    fsw_init();
    fsw_dp_init();

    (void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
}

static void
nx_fsw_dom_terminate(struct nxdom *nxdom)
{
    struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

    SK_LOCK_ASSERT_HELD();

    STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
        nxdom_prov_link, tnxdp) {
        (void) nxdom_prov_del(nxdom_prov);
    }

    fsw_dp_uninit();

    /* Generic uninitialization */
    fsw_uninit();
}

static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}

static int
nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
    SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
    return 0;
}

static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
#pragma unused(nxdom_prov, nxp)
    _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
    _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);

    *(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
    *(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
    VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
    *(adj->adj_flowadv_max) = sk_max_flows;
    *(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
    *(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
    if (sk_cksum_tx != 0) {
        *(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
    }
    *(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
        ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
        2 : 1;
    *(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
        NX_FSW_AFRINGSIZE;

    if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
        (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
        *(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
    }

    if (*(adj->adj_max_frags) > 1) {
        uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
            NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
        uint32_t magazine_max_objs;

        *(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
            sk_fsw_max_bufs : fsw_maxbufs;

        /*
         * Given that packet objects are the ones cached, use the
         * metadata size to determine the extra number of objects
         * at the magazine layer.
         */
        magazine_max_objs = skmem_cache_magazine_max(
            NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
            METADATA_PREAMBLE_SZ);

        /*
         * Adjust the max buffers to account for the increase
         * associated with per-CPU caching.
         */
        if (skmem_allow_magazines() &&
            magazine_max_objs < *(adj->adj_max_buffers)) {
            *(adj->adj_max_buffers) -= magazine_max_objs;
        }
    }
    if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
        (*(adj->adj_max_frags) <= 1)) {
        *(adj->adj_large_buf_size) = 0;
    }
    return 0;
}
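
/*
 * Worked example of the magazine adjustment above, with hypothetical
 * numbers (none of these values are taken from this file): if the
 * buffer cap resolves to 8192 and skmem_cache_magazine_max() reports
 * that per-CPU magazines may cache up to 512 packet objects of this
 * metadata size, the advertised maximum becomes 8192 - 512 = 7680, so
 * that the pool plus the magazine layer together never exceed the
 * intended cap.
 */
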
static int
nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
    const uint32_t req, const struct nxprov_params *nxp0,
    struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
    uint32_t pp_region_config_flags)
{
    struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;

    /* USD regions need to be writable to support user packet pool */
    srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
    srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;

    return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
        nxdom, nxdom, nxdom, pp_region_config_flags,
        nx_fsw_prov_params_adjust);
}

static void
fsw_vp_region_params_setup(struct nexus_adapter *na,
    struct skmem_region_params *srp0, struct skmem_region_params *srp)
{
    int i;
    uint32_t totalrings, nslots, afslots, evslots, lbaslots;

    /* copy default flowswitch parameters initialized in nxprov_params_adjust() */
    for (i = 0; i < SKMEM_REGIONS; i++) {
        srp[i] = srp0[i];
    }

    /* customize parameters that could vary across NAs */
    totalrings = na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_RX) +
        na_get_nrings(na, NR_A) + na_get_nrings(na, NR_F) +
        na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);

    srp[SKMEM_REGION_SCHEMA].srp_r_obj_size =
        (uint32_t)CHANNEL_SCHEMA_SIZE(totalrings);
    srp[SKMEM_REGION_SCHEMA].srp_r_obj_cnt = totalrings;
    skmem_region_params_config(&srp[SKMEM_REGION_SCHEMA]);

    srp[SKMEM_REGION_RING].srp_r_obj_size =
        sizeof(struct __user_channel_ring);
    srp[SKMEM_REGION_RING].srp_r_obj_cnt = totalrings;
    skmem_region_params_config(&srp[SKMEM_REGION_RING]);

    nslots = na_get_nslots(na, NR_TX);
    afslots = na_get_nslots(na, NR_A);
    evslots = na_get_nslots(na, NR_EV);
    lbaslots = na_get_nslots(na, NR_LBA);
    srp[SKMEM_REGION_TXAKSD].srp_r_obj_size =
        MAX(MAX(MAX(nslots, afslots), evslots), lbaslots) * SLOT_DESC_SZ;
    srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt =
        na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_A) +
        na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
    skmem_region_params_config(&srp[SKMEM_REGION_TXAKSD]);

    /* USD and KSD objects share the same size and count */
    srp[SKMEM_REGION_TXAUSD].srp_r_obj_size =
        srp[SKMEM_REGION_TXAKSD].srp_r_obj_size;
    srp[SKMEM_REGION_TXAUSD].srp_r_obj_cnt =
        srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt;
    skmem_region_params_config(&srp[SKMEM_REGION_TXAUSD]);
}
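
/*
 * Illustrative sizing example for the setup above (hypothetical ring
 * counts, not taken from this file): a VP adapter with 1 TX, 1 RX,
 * 1 alloc, 1 free, 1 event and 0 large-buf-alloc rings yields
 * totalrings = 5, so the schema region holds 5 objects of
 * CHANNEL_SCHEMA_SIZE(5) bytes each, and the TXAKSD region holds
 * 1 + 1 + 1 + 0 = 3 slot-descriptor arrays, each sized for the largest
 * of the TX/alloc/event/large-buf-alloc slot counts.
 */
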
static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
    int err = 0;
    struct skmem_region_params *srp0 = NX_PROV(nx)->nxprov_region_params;
    struct skmem_region_params srp[SKMEM_REGIONS];

    SK_DF(SK_VERB_FSW,
        "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
        NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
        SK_KVA(na));

    ASSERT(na->na_type == NA_FLOWSWITCH_VP);
    ASSERT(na->na_arena == NULL);
    ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);

    fsw_vp_region_params_setup(na, srp0, srp);
    /*
     * Ports in the flow switch are isolated from one another;
     * use NULL for the packet buffer pool references to indicate
     * this, since otherwise we'd be sharing the same pp for the
     * entire switch (maybe for a future, special use case?)
     *
     * This means that clients calling kern_nexus_get_pbufpool()
     * will get NULL, but this is fine based on the current design
     * of providing port isolation, and also since we don't expose
     * the flow switch to external kernel clients.
     */
    na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
        !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
    ASSERT(na->na_arena != NULL || err != 0);
    return err;
}

static int
nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
    struct sockopt sopt;
    int err = 0;

    SK_LOCK_ASSERT_HELD();

    if (ncr->nc_req == USER_ADDR_NULL) {
        err = EINVAL;
        goto done;
    }

    /* to make life easier for handling copies */
    bzero(&sopt, sizeof(sopt));
    sopt.sopt_dir = sopt_dir;
    sopt.sopt_val = ncr->nc_req;
    sopt.sopt_valsize = ncr->nc_req_len;
    sopt.sopt_p = p;

    /* avoid _MALLOCing at the cost of this ugly switch block */
    switch (ncr->nc_cmd) {
    case NXCFG_CMD_ATTACH:
    case NXCFG_CMD_DETACH: {
        /* proceed only if the client possesses flow switch entitlement */
        if (cred == NULL || (err = skywalk_priv_check_cred(p, cred,
            PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
            SK_ERR("missing nxctl credential");
            err = EPERM;
            goto done;
        }

        struct nx_spec_req nsr;
        bzero(&nsr, sizeof(nsr));
        err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
        if (err != 0) {
            goto done;
        }

        /*
         * Null-terminate in case this has an interface name;
         * the union is already large enough for uuid_t.
         */
        nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
        if (p != kernproc) {
            nsr.nsr_flags &= NXSPECREQ_MASK;
        }

        err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
        if (err != 0) {
            goto done;
        }

        err = sooptcopyout(&sopt, &nsr, sizeof(nsr));
        break;
    }

    case NXCFG_CMD_FLOW_ADD:
    case NXCFG_CMD_FLOW_DEL: {
        /* need to have owner nxctl or kernnxctl */
        if (cred == NULL) {
            SK_ERR("missing nxctl credential");
            err = EPERM;
            goto done;
        }
    } /* fall through */
    case NXCFG_CMD_FLOW_CONFIG: {
        /* checks flow PID ownership instead of nxctl credential */
        struct nx_flow_req nfr;
        bzero(&nfr, sizeof(nfr));
        err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
        if (err != 0) {
            goto done;
        }

        err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
        if (err != 0) {
            goto done;
        }

        err = sooptcopyout(&sopt, &nfr, sizeof(nfr));
        break;
    }

    case NXCFG_CMD_NETEM: {
        struct if_netem_params inp;

        bzero(&inp, sizeof(inp));
        err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
        if (err != 0) {
            goto done;
        }
        err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
        if (err != 0) {
            goto done;
        }
        break;
    }

    default:
        err = EINVAL;
        goto done;
    }

done:
    SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
        "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
        NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
    return err;
}
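
/*
 * Hypothetical sketch of what the dispatcher above expects from a
 * caller (the field names are the ones this function reads; everything
 * else is assumed for illustration): the request travels in a
 * struct nx_cfg_req, and sooptcopyin()/sooptcopyout() move the
 * command-specific payload across the user/kernel boundary.
 *
 *	struct nx_cfg_req ncr;
 *	bzero(&ncr, sizeof(ncr));
 *	ncr.nc_cmd = NXCFG_CMD_FLOW_ADD;	// one of the cases above
 *	ncr.nc_req = req_uaddr;	// address of a struct nx_flow_req
 *	ncr.nc_req_len = sizeof(struct nx_flow_req);
 *	// ...the framework then invokes nx_fsw_prov_config() with &ncr
 */
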
static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
    SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}

static int
nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
{
    struct nx_flowswitch *fsw;

    SK_LOCK_ASSERT_HELD();

    ASSERT(nx->nx_arg == NULL);

    SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);

    fsw = fsw_alloc(Z_WAITOK);
    nx->nx_arg = fsw;
    fsw->fsw_nx = nx;
    fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
    fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;

    FSW_WLOCK(fsw);

    fsw_dp_ctor(fsw);

    FSW_WUNLOCK(fsw);

    SK_D("created new fsw 0x%llx for nexus 0x%llx",
        SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));

    return 0;
}

static void
nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    int err;

    SK_LOCK_ASSERT_HELD();

    SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
        NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));

    err = fsw_ctl_detach(nx, current_proc(), NULL);
    ASSERT(err == 0);        /* this cannot fail */
    ASSERT(fsw->fsw_dev_ch == NULL);
    ASSERT(fsw->fsw_host_ch == NULL);

    SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
    fsw_free(fsw);
    nx->nx_arg = NULL;
}

static size_t
nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
    void *out, size_t len, struct proc *p)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

    /* this check doesn't require holding fsw_lock */
    if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
        (uuid_compare(filter->nmf_nx_uuid,
        fsw->fsw_nx->nx_uuid)) != 0) {
        return 0;
    }

    /* intercept NXMIB_FSW_STATS here since it's for the flowswitch */
    FSW_RLOCK(fsw);
    len = fsw_mib_get(fsw, filter, out, len, p);
    FSW_UNLOCK(fsw);

    return len;
}

boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
{
#pragma unused(nx)
    return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
}

static int
nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
    nexus_port_t *nx_port)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    nexus_port_t first, last, port;
    int error;

    ASSERT(nx_port != NULL);

    port = *nx_port;
    ASSERT(port == NEXUS_PORT_ANY);

    if (rsvd) {
        first = 0;
        last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
    } else {
        first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
        ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
        last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
    }
    ASSERT(first <= last);

    FSW_WLOCK(fsw);
    if (__improbable(first == last)) {
        error = ENOSPC;
    } else {
        error = nx_port_find(nx, first, last - 1, &port);
        ASSERT(error != 0 || (port >= first && port < last));
    }
    FSW_WUNLOCK(fsw);

    SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
        "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
        nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
        (int)port, first, (last - 1), error);

    if (error == 0) {
        *nx_port = port;
    }

    return error;
}
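
/*
 * Concrete reading of the ranges above, with C standing for
 * NEXUS_PORT_FLOW_SWITCH_CLIENT (its actual value is defined in the
 * headers): rsvd == TRUE searches the reserved ports [0, C - 1];
 * rsvd == FALSE searches the client ports [C, nb_max - 1], where
 * nb_max is NXDOM_MAX(NX_DOM(nx), ports). nx_port_find() takes an
 * inclusive upper bound, hence the "last - 1" argument.
 */
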
static int
nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
#pragma unused(info)
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    nexus_port_t first, last, port;
    int error;

    ASSERT(nx_port != NULL);
    ASSERT(nxb != NULL);

    port = *nx_port;

    /* can't bind reserved ports to client credentials */
    if (nx_fsw_dom_port_is_reserved(nx, port)) {
        return EDOM;
    }

    /*
     * Allow clients to bind to regular ports (non-reserved);
     * reserved ports aren't subject to bind/unbind, since
     * they are used for internal purposes.
     */
    first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
    ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
    last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
    ASSERT(first <= last);

    FSW_WLOCK(fsw);
    if (__improbable(first == last)) {
        error = ENOSPC;
    } else if (port != NEXUS_PORT_ANY) {
        error = nx_port_bind(nx, port, nxb);
    } else {
        error = nx_port_find(nx, first, last - 1, &port);
        ASSERT(error != 0 || (port >= first && port < last));
        if (error == 0) {
            error = nx_port_bind(nx, port, nxb);
        }
    }
    FSW_WUNLOCK(fsw);

    SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
        "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
        nx->nx_prov->nxprov_params->nxp_name, (int)port,
        first, (last - 1), error);

    ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
    if (error == 0) {
        *nx_port = port;
    }

    return error;
}

static int
nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    int error;

    FSW_WLOCK(fsw);
    error = nx_port_unbind(nx, nx_port);
    FSW_WUNLOCK(fsw);

    SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
        "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
        nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);

    return error;
}

static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
    nexus_port_t port = chr->cr_port;
    int err = 0;

    SK_LOCK_ASSERT_HELD();

    ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
        nxdom_prov->nxdom_prov_dom->nxdom_type &&
        nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
    ASSERT(!(ch->ch_flags & CHANF_HOST));
    ASSERT(!(ch->ch_flags & CHANF_KERNEL));

    if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
        err = EDOM;
        goto done;
    }

    chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
    ASSERT(port != NEXUS_PORT_ANY);
    (void) snprintf(chr->cr_name, sizeof(chr->cr_name),
        "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
    chr->cr_ring_set = RING_SET_DEFAULT;
    err = na_connect(nx, ch, chr, ch0, nxb, p);

done:
    return err;
}

static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
    SK_LOCK_ASSERT_HELD();

    SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
        SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
        ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

    if (ch->ch_flags & CHANF_KERNEL) {
        na_disconnect_spec(nx, ch);
    } else {
        na_disconnect(nx, ch);
    }
}

static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov)
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

    LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
    ASSERT(!(ch->ch_flags & CHANF_KERNEL));
    ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);

    /*
     * Hold the flowswitch lock as writer; this prevents all data path
     * accesses to the flowswitch, and allows us to mark the rings with
     * CKRF_DEFUNCT. Unlike some other nexus types, the flowswitch
     * doesn't utilize kr_{enter,exit} for serialization, at present.
     */
    FSW_WLOCK(fsw);
    na_ch_rings_defunct(ch, p);
    FSW_WUNLOCK(fsw);
}

static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    int err = 0;

    if (!locked) {
        SK_LOCK_ASSERT_NOTHELD();
        SK_LOCK();
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
    } else {
        SK_LOCK_ASSERT_HELD();
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
    }

    ASSERT(!(ch->ch_flags & CHANF_KERNEL));
    ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
    ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);

    err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));

    if (err == 0) {
        na_defunct(nx, ch, ch->ch_na, locked);
    }

    SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
        ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
        nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
        ch->ch_info->cinfo_nx_port,
        (int)ch->ch_info->cinfo_ch_ring_id, err);

    if (!locked) {
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
        SK_UNLOCK();
    } else {
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
        SK_LOCK_ASSERT_HELD();
    }
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
    uuid_string_t uuidstr;

    SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
        "ring_id %d ring_set %u ep_type %u:%u create %u%s",
        chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
        (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
        (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
        chr->cr_endpoint, create, (strncmp(chr->cr_name, NX_FSW_NAME,
        sizeof(NX_FSW_NAME) - 1) != 0) ? " (skipped)" : "");
}
#endif /* SK_LOG */

/*
 * Try to get a reference to a nexus adapter attached to a flow switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non-NULL pointer is returned in *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a flow switch
 * prefix mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_adapter **na, boolean_t create)
{
#pragma unused(ch)
    struct nexus_vp_adapter *vpna = NULL;
    char *cr_name = chr->cr_name;
    struct nx_flowswitch *fsw;
    int error = 0;

    SK_LOCK_ASSERT_HELD();
    *na = NULL;        /* default return value */

#if SK_LOG
    if (__improbable(sk_verbose != 0)) {
        nx_fsw_na_find_log(chr, create);
    }
#endif /* SK_LOG */

    /* first try to see if this is a flow switch port */
    if (strncmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
        return 0;        /* no error, but no flow switch prefix */
    }
    ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
    fsw = NX_FSW_PRIVATE(nx);
    ASSERT(fsw != NULL);

    if (!create) {
        return ENXIO;
    }

    /*
     * The flowswitch VP is only attachable from a user channel, so none
     * of these flags should be set.
     */
    ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
    error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
    ASSERT(vpna == NULL || error == 0);

    if (error == 0) {
        /* use the reference held by fsw_attach_vp() above */
        *na = &vpna->vpna_up;
        SK_DF(SK_VERB_FSW,
            "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
            (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
            cr_name, (int)vpna->vpna_nx_port);
    }

    return error;
}
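
/*
 * Illustrative caller pattern for the reference contract documented
 * above (a sketch under assumptions, not code from this file; the
 * release call na_release_locked() is assumed from the adapter
 * refcount API):
 *
 *	struct nexus_adapter *na = NULL;
 *	int err = nx_fsw_na_find(nx, ch, chr, nxb, p, &na, TRUE);
 *	if (err == 0 && na != NULL) {
 *		// caller owns a reference here
 *		// ...use the adapter...
 *		na_release_locked(na);	// drop it when done (SK_LOCK held)
 *	} else if (err == 0) {
 *		// name lacked the flow switch prefix; not an error
 *	}
 */
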
int
nx_fsw_netagent_add(struct kern_nexus *nx)
{
    return fsw_netagent_add_remove(nx, TRUE);
}

int
nx_fsw_netagent_remove(struct kern_nexus *nx)
{
    return fsw_netagent_add_remove(nx, FALSE);
}

void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
    fsw_netagent_update(nx);
}