/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * BSD LICENSE
 *
 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of NEC Europe Ltd. nor the names of
 *     its contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <kern/sched_prim.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <net/bpf.h>
#include <net/if_ports_used.h>
#include <net/pktap.h>
#include <net/pktsched/pktsched_netem.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/in_var.h>

extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX 256
#define FSW_ZONE_NAME "skywalk.nx.fsw"

static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

#define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ 13 /* some mersenne prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

#define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES 1
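
/*
 * With the default fsw_flow_reap_interval of 1 second, FSW_REAP_IVAL
 * evaluates to 1, so FSW_REAP_SK_THRES, FSW_REAP_IF_THRES and
 * FSW_DRAIN_CH_THRES all work out to (1 << 5) == 32 reap intervals,
 * i.e. roughly 32 seconds' worth of reaping before the corresponding
 * cache/channel maintenance is due (consumed by the reaper thread
 * declared further down in this file).
 */
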
#define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping */
uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
uint32_t fsw_gso_batch = 8;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
    "flowswitch GSO batch size");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
    "flowswitch channel reap threshold throughput (bytes/sec)");
#endif /* !DEVELOPMENT && !DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
    "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");

/*
 * IP reassembly
 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
 * enable/disable the reassembly routine regardless of whether the
 * transport netagent is enabled or not.
 *
 * 'fsw_ip_reass' is a tri-state:
 *   0 means force IP reassembly off
 *   1 means force IP reassembly on
 *   2 means don't force the value, use what's appropriate for this flowswitch
 */
#define FSW_IP_REASS_FORCE_OFF 0
#define FSW_IP_REASS_FORCE_ON 1
#define FSW_IP_REASS_NO_FORCE 2

uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;

static int
fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value > FSW_IP_REASS_NO_FORCE) {
			return EINVAL;
		}
		fsw_ip_reass = new_value;
	}
	return error;
}

SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, fsw_ip_reass_sysctl, "IU",
    "adjust flowswitch IP reassembly");
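
/*
 * For example, the tri-state above can be flipped at run time with
 * something like:
 *
 *	sysctl kern.skywalk.flowswitch.ip_reass=1	(force reassembly on)
 *	sysctl kern.skywalk.flowswitch.ip_reass=2	(back to automatic)
 *
 * fsw_ip_reass_sysctl() rejects any value greater than
 * FSW_IP_REASS_NO_FORCE with EINVAL.
 */
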
#if (DEVELOPMENT || DEBUG)
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	    &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
		SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
		if ((_f) != NULL) \
			(_f)(__VA_ARGS__); \
	} \
} while (0)

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */

static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
    struct flow_entry *);
static void fsw_reap_thread_func(void *, wait_result_t);
static void fsw_reap_thread_cont(void *, wait_result_t);
static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
static uint32_t fsw_process_deferred(struct nx_flowswitch *);
static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);

static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
    struct __kern_packet *);

static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
    uint32_t, uint32_t);

static int __fsw_dp_inited = 0;

int
fsw_dp_init(void)
{
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	ASSERT(!__fsw_dp_inited);

	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}

void
fsw_dp_uninit(void)
{
	if (__fsw_dp_inited) {
		flow_fini();
		flow_mgr_fini();

		__fsw_dp_inited = 0;
	}
}

static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}

#define dp_drop_pktq(fsw, pktq) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
		ASSERT(_len == 0); \
		return; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	dp_free_pktq(fsw, pktq); \
} while (0)
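
/*
 * Note that dp_drop_pktq() is a statement macro rather than a function:
 * the bare "return" taken on an empty queue returns from the *calling*
 * function, which effectively restricts its use to functions returning
 * void.
 */
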
SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		(void) strlcpy(proc_name_buf, fe->fe_proc_name,
		    sizeof(proc_name_buf));
		proc_name = proc_name_buf;
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
		    sizeof(eproc_name_buf));
		eproc_name = eproc_name_buf;
	}
	if (input) {
		KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET : PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}

#if (DEVELOPMENT || DEBUG)
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}

static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
{
	static boolean_t _err36_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err36_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err36_flag_modified) {
			return;
		}
		*ret = ENETUNREACH;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err36_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
#else /* !DEVELOPMENT && !DEBUG */
#define _fsw_error35_handler(...)
#define _fsw_error36_handler(...)
#endif /* DEVELOPMENT || DEBUG */

/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or if the source packet is
 * from a compat netif and the attached mbuf is larger than the MTU due to LRO.
 */
static inline boolean_t
validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
    uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
    uint32_t *copy_len)
{
	uint32_t tlen = 0;
	uint32_t splen = spkt->pkt_length - skip_l2hlen;

	if (l2hlen != 0) {
		VERIFY(skip_l2hlen == 0);
		tlen += l2hlen;
	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
		splen -= ETHER_CRC_LEN;
	}

	tlen += splen;
	*copy_len = splen;

	return tlen <= ((__packet_get_buflet_count(dph) *
	       PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
	       headroom);
}
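
/*
 * To illustrate the check above: a destination packet made of, say, four
 * buflets of a 2KB default buffer size and zero headroom can accept any
 * source payload up to 8KB; anything larger (e.g. an oversized LRO mbuf
 * from a compat netif) fails validation and is counted by the caller as
 * FSW_STATS_RX_COPY_BAD_LEN.  The buflet count and buffer size are
 * illustrative; the real values come from the destination packet pool.
 */
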
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */


static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to the destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		if (!FSW_NETAGENT_ENABLED(fsw) ||
		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
			return pkt;
		}
		break;
	}

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
		    (struct ip *)l3_hdr, &nfrags, &tlen);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
		    (struct ip6_hdr *)l3_hdr,
		    (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}

SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification.
	 * Ethernet(14) + IPv6 header(40) +
	 * + IPv6 fragment header(8) +
	 * TCP header with options(60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}

static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;

	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
		rx_prepare_packet_mbuf(fsw, pkt);
	}

	return pkt;
}

static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENTF_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
					struct flow_entry *parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}
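
/*
 * The prev_fe argument above acts as a single-entry lookup cache: a caller
 * that processes a batch of packets can offer back the flow entry found
 * for the previous packet, and a cheap 5-tuple key compare then lets
 * consecutive packets of the same flow skip the flow manager table lookup
 * entirely.
 */
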
SK_NO_INLINE_ATTRIBUTE
static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ifnet *ifp = fsw->fsw_ifp;
	struct in_ifaddr *ia = NULL;
	struct in_ifaddr *best_ia = NULL;
	struct in6_ifaddr *ia6 = NULL;
	struct in6_ifaddr *best_ia6 = NULL;
	struct ifnet *match_ifp = NULL;
	struct __flow *flow = pkt->pkt_flow;
	bool result = false;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (flow->flow_ip_ver == IPVERSION) {
		if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
			result = true;
			goto done;
		}

		/*
		 * Check for a match in the hash bucket.
		 */
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
			if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
				best_ia = ia;
				match_ifp = ia->ia_ifp;

				if (match_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact
				 * match with another interface
				 */
			}
		}

		if (best_ia != NULL) {
			if (match_ifp != ifp && ipforwarding == 0 &&
			    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    match_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict
				 * and forwarding is disabled
				 */
			} else {
				lck_rw_done(&in_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in_ifaddr_rwlock);

		if (ifp->if_flags & IFF_BROADCAST) {
			/*
			 * Check for broadcast addresses.
			 *
			 * Only accept broadcast packets that arrive via the matching
			 * interface. Reception of forwarded directed broadcasts would be
			 * handled via ip_forward() and ether_frameout() with the loopback
			 * into the stack for SIMPLEX interfaces handled by ether_frameout().
			 */
			struct ifaddr *ifa;

			ifnet_lock_shared(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				if (ifa->ifa_addr->sa_family != AF_INET) {
					continue;
				}
				ia = ifatoia(ifa);
				if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
				    ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
					ifnet_lock_done(ifp);
					result = true;
					goto done;
				}
			}
			ifnet_lock_done(ifp);
		}
	} else {
		if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
			result = true;
			goto done;
		}

		/*
		 * Check for exact addresses in the hash bucket.
		 */
		lck_rw_lock_shared(&in6_ifaddr_rwlock);
		TAILQ_FOREACH(ia6, IN6ADDR_HASH(&flow->flow_ipv6_dst), ia6_hash) {
			if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst, ia6->ia_ifp->if_index, ifp->if_index)) {
				if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
					continue;
				}
				best_ia6 = ia6;
				if (ia6->ia_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact
				 * match with another interface
				 */
			}
		}
		if (best_ia6 != NULL) {
			if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
			    (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict
				 * and forwarding is disabled
				 */
			} else {
				lck_rw_done(&in6_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in6_ifaddr_rwlock);
	}

	/*
	 * In forwarding mode, if the destination address
	 * of the packet does not match any interface
	 * address, it may be destined to the client device
	 */
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
	    "Rx flow does not match interface address");
done:
	return result;
}

static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *fe;

	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		return NULL;
	}

	if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
	    fe->fe_flags & FLOWENTF_LISTENER) &&
	    !pkt_is_for_listener(fe, pkt)) {
		FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
		flow_entry_release(&fe);
		return NULL;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down");
		flow_entry_release(&fe);
		fe = NULL;
	}

	return fe;
}

static inline void
rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}
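
/*
 * In both batching helpers above, only the first packet queued onto an
 * empty fe_rx_pktq/fe_tx_pktq keeps the flow entry reference (the entry is
 * then linked onto the caller's flow_entry_list); references obtained for
 * subsequent packets of the same flow are dropped immediately via
 * flow_entry_release(), so the entry appears to be held once per list
 * membership.
 */
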
static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}

/*
 * This is only for estimating how many packets each GSO packet will need.
 * The number does not need to be exact because any leftover packets allocated
 * will be freed.
 */
static uint32_t
estimate_gso_pkts(struct __kern_packet *pkt)
{
	packet_tso_flags_t tso_flags;
	uint16_t mss;
	uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;

	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
	mss = pkt->pkt_proto_seg_sz;

	if (tso_flags == PACKET_TSO_IPV4) {
		total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
	} else if (tso_flags == PACKET_TSO_IPV6) {
		total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
	if (total_hlen != 0 && mss != 0) {
		total_len = pkt->pkt_length;
		n_pkts = (uint32_t)
		    (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
	}
	DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
	    uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
	    uint32_t, n_pkts);
	return n_pkts;
}
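
/*
 * A worked example of the estimate: an IPv4 TSO packet with pkt_length
 * 4384 and an MSS of 1448 has total_hlen = 20 + 20 = 40, so
 * SK_ROUNDUP(4384 - 40, 1448) / 1448 == 3 segments.  Over-estimating is
 * harmless here; as noted above, leftover pre-allocated packets are freed.
 */
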
/*
 * This function retrieves a chain of packets of the same type only
 * (GSO or non-GSO).
 */
static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, uint32_t n_pkts_max,
    struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;
	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
	boolean_t gso_enabled, gso_required;
	uint32_t gso_pkts;

	gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max &&
	    (!gso_enabled || fsw_gso_batch == 0 ||
	    *gso_pkts_estimate < fsw_gso_batch) &&
	    idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			KR_SLOT_DETACH_METADATA(r, ksd);
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		if (gso_enabled) {
			gso_pkts = estimate_gso_pkts(pkt);

			/*
			 * We use the first packet to determine what
			 * type the subsequent ones need to be (GSO or
			 * non-GSO).
			 */
			if (n_pkts == 0) {
				gso_required = (gso_pkts != 0);
			} else {
				if (gso_required != (gso_pkts != 0)) {
					break;
				}
			}
			*gso_pkts_estimate += gso_pkts;
		}
		KR_SLOT_DETACH_METADATA(r, ksd);
		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
			    vpna->vpna_nx_port, vpna->vpna_gencnt);
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;
		KPKTQ_ENQUEUE(pktq, pkt);
	}
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
	DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
	    ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
	    uint32_t, *gso_pkts_estimate);
}
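
/*
 * When software GSO is enabled, the dequeue loop above is additionally
 * bounded by *gso_pkts_estimate (capped by fsw_gso_batch, 8 by default)
 * and stops early when it hits a packet of the other kind (GSO vs.
 * non-GSO), so every chain handed back to the caller is homogeneous.
 */
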
static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	os_atomic_thread_fence(seq_cst);

	r->ckr_ktail = idx_end;

	kr_exit(r);

	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}

static void
pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
{
	ASSERT(KPKTQ_EMPTY(pktq));

	for (uint32_t i = 0; i < n_pkts; i++) {
		struct __kern_packet *pkt = pkts[i];
		ASSERT(pkt->pkt_nextpkt == NULL);
		KPKTQ_ENQUEUE(pktq, pkt);
	}
}

/*
 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
 */
SK_NO_INLINE_ATTRIBUTE
static void
convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int num_segs = 1;
	struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *pkt;
	struct kern_pbufpool *pp;

	tot_cnt = KPKTQ_LEN(pktq);
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past. Note that we reset
	 * fsw_rx_largest_size below if we notice that we're under-utilizing
	 * the allocated buffers (thus disabling this batch allocation).
	 */
	largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
	if (__probable(largest != 0)) {
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else if (largest <= M16KCLBYTES * 2) {
			num_segs = 2;
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES * 2;
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
	max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;

	KPKTQ_FOREACH(pkt, pktq) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		if (__improbable(len > max_pkt_len || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(largest < tot_len)) {
			largest = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial
		 * success, we try an individual allocation here again.
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			num_segs = 1;
			if (tot_len > M16KCLBYTES) {
				num_segs = 0;
			}
			if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
			    &num_segs, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
			/*
			 * Clean up unused mbuf.
			 * Only needed when we pre-allocate 2x16K mbufs.
			 */
			if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
				ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
				struct mbuf *m_extra = m->m_next;
				ASSERT(m_extra != NULL);
				ASSERT(m_extra->m_len == 0);
				ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
				m->m_next = NULL;
				m_freem(m_extra);
				FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
			}
		}
		m->m_data += pad;
		m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

		/* don't include IP header from partial sum */
		if (__probable((pkt->pkt_qum_qflags &
		    QUM_F_FLOW_CLASSIFIED) != 0)) {
			iphlen = pkt->pkt_flow_ip_hlen;
			do_cksum_rx = sk_cksum_rx;
		} else {
			iphlen = 0;
			do_cksum_rx = FALSE;
		}

		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
		    llhlen + iphlen);

		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
		if (do_cksum_rx) {
			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
		}
#if DEBUG || DEVELOPMENT
		if (__improbable(pkt_trailers > 0)) {
			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
		}
#endif /* DEBUG || DEVELOPMENT */
		m_adj(m, llhlen);

		m->m_pkthdr.rcvif = fsw->fsw_ifp;
		if (__improbable((pkt->pkt_link_flags &
		    PKT_LINKF_ETHFCS) != 0)) {
			m->m_flags |= M_HASFCS;
		}
		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
		}
		ASSERT(m->m_nextpkt == NULL);
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		mcnt++;
		mbytes += m_pktlen(m);
	}
	/* free any leftovers */
	if (__improbable(mhead != NULL)) {
		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* reset if most packets (>50%) are smaller than our batch buffers */
	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
		    struct flow_entry *, NULL, uint32_t, mhead_waste,
		    uint32_t, tot_cnt);
		largest = 0;
	}

	if (largest != fsw->fsw_rx_largest_size) {
		os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
	}

	pp_free_pktq(pktq);
	*m_headp = head;
	*m_tailp = tail;
	*cnt = mcnt;
	*bytes = mbytes;
}
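
/*
 * The batching heuristic above in brief: fsw_rx_largest_size remembers the
 * largest recent Rx packet so that the whole batch can be pre-allocated
 * with mbuf clusters of a single size (2K/4K/16K/2x16K).  If more than
 * half of a batch lands in clusters at least twice as large as needed
 * (mhead_waste), the remembered size is reset to 0, which disables the
 * pre-allocation until a new size is learned from subsequent packets.
 */
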
/*
 * This function only extracts the mbuf from the packet. The caller frees
 * the packet.
 */
static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	struct mbuf *m;
	struct pkthdr *mhdr;
	uint16_t llhlen;

	m = pkt->pkt_mbuf;
	ASSERT(m != NULL);

	llhlen = pkt->pkt_l2_len;
	if (llhlen > pkt->pkt_length) {
		m_freem(m);
		KPKT_CLEAR_MBUF_DATA(pkt);
		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
		    struct __kern_packet *, pkt);
		FSW_STATS_INC(FSW_STATS_DROP);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return NULL;
	}
	mhdr = &m->m_pkthdr;
	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
	}
#if DEBUG || DEVELOPMENT
	uint32_t extra = 0;
	if (__improbable(pkt_trailers > 0)) {
		extra = pkt_add_trailers_mbuf(m, llhlen);
	}
#endif /* DEBUG || DEVELOPMENT */
	m_adj(m, llhlen);
	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
	KPKT_CLEAR_MBUF_DATA(pkt);
	return m;
}

SK_NO_INLINE_ATTRIBUTE
static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
{
	struct __kern_packet *pkt;
	struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t c = 0, b = 0;

	KPKTQ_FOREACH(pkt, pktq) {
		m = convert_compat_pkt_to_mbuf(fsw, pkt);
		if (__improbable(m == NULL)) {
			continue;
		}
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		c++;
		b += m_pktlen(m);
	}
	pp_free_pktq(pktq);
	*m_head = head;
	*m_tail = tail;
	*cnt = c;
	*bytes = b;
}

void
fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet_stat_increment_param s;

	bzero(&s, sizeof(s));
	s.packets_in = cnt;
	s.bytes_in = bytes;
	dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
}
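
/*
 * fsw_host_sendup() is the hand-off into the regular networking stack:
 * the converted mbuf chain goes to dlil_input_handler() along with an
 * ifnet_stat_increment_param carrying the packet and byte counts, so the
 * interface input statistics are updated as part of the same call.
 */
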
void
|
|
fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
|
|
{
|
|
struct mbuf *m_head = NULL, *m_tail = NULL;
|
|
uint32_t cnt = 0, bytes = 0;
|
|
ifnet_fsw_rx_cb_t cb;
|
|
void *cb_arg;
|
|
boolean_t compat;
|
|
|
|
ASSERT(!KPKTQ_EMPTY(pktq));
|
|
if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
|
|
ASSERT(cb != NULL);
|
|
ASSERT(cb_arg != NULL);
|
|
/* callback consumes packets */
|
|
(*cb)(cb_arg, pktq);
|
|
ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
|
|
return;
|
|
}
|
|
|
|
/* All packets in the pktq must have the same type */
|
|
compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
|
|
if (compat) {
|
|
convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
|
|
&bytes);
|
|
} else {
|
|
convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
|
|
&bytes);
|
|
}
|
|
if (__improbable(m_head == NULL)) {
|
|
DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
|
|
return;
|
|
}
|
|
fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
|
|
}
|
|
|
|
void
|
|
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
|
|
struct __kern_channel_ring *r, struct pktq *pktq)
|
|
{
|
|
fsw_ring_enqueue_pktq(fsw, r, pktq);
|
|
FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
|
|
dp_drop_pktq(fsw, pktq);
|
|
}
|
|
|
|
static struct nexus_adapter *
|
|
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
struct kern_nexus *nx = fsw->fsw_nx;
|
|
struct nexus_adapter *na = NULL;
|
|
nexus_port_t port = fe->fe_nx_port;
|
|
|
|
if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
|
|
SK_ERR("dev or host ports have no NA");
|
|
return NULL;
|
|
}
|
|
|
|
if (__improbable(!nx_port_is_valid(nx, port))) {
|
|
SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
|
|
if_name(fsw->fsw_ifp), port);
|
|
return NULL;
|
|
}
|
|
|
|
na = nx_port_get_na(nx, port);
|
|
if (__improbable(na == NULL)) {
|
|
FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
|
|
SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
|
|
if_name(fsw->fsw_ifp), port);
|
|
return NULL;
|
|
}
|
|
|
|
if (__improbable(!NA_IS_ACTIVE(na))) {
|
|
FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
|
|
SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
|
|
if_name(fsw->fsw_ifp), port);
|
|
return NULL;
|
|
}
|
|
|
|
if (__improbable(nx_port_is_defunct(nx, port))) {
|
|
FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
|
|
SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
|
|
if_name(fsw->fsw_ifp), port);
|
|
return NULL;
|
|
}
|
|
|
|
return na;
|
|
}
|
|
|
|
static inline struct __kern_channel_ring *
|
|
flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
|
|
{
|
|
struct nexus_vp_adapter *na = NULL;
|
|
struct __kern_channel_ring *r = NULL;
|
|
|
|
na = VPNA(flow_get_na(fsw, fe));
|
|
if (__improbable(na == NULL)) {
|
|
return NULL;
|
|
}
|
|
|
|
switch (txrx) {
|
|
case NR_RX:
|
|
r = &na->vpna_up.na_rx_rings[0];
|
|
break;
|
|
case NR_TX:
|
|
r = &na->vpna_up.na_tx_rings[0];
|
|
break;
|
|
default:
|
|
__builtin_unreachable();
|
|
VERIFY(0);
|
|
}
|
|
|
|
if (__improbable(KR_DROP(r))) {
|
|
FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
|
|
SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %0xllx %s drop mode",
|
|
r->ckr_name, SK_KVA(r));
|
|
return NULL;
|
|
}
|
|
|
|
ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
|
|
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (r != NULL) {
|
|
_FSW_INJECT_ERROR(4, r, NULL, null_func);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
|
|
return r;
|
|
}
|
|
|
|
struct __kern_channel_ring *
|
|
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
return flow_get_ring(fsw, fe, NR_RX);
|
|
}
|
|
|
|
static inline struct __kern_channel_ring *
|
|
fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
return flow_get_ring(fsw, fe, NR_TX);
|
|
}
|
|
|
|
static bool
|
|
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
struct flow_route *fr = fe->fe_route;
|
|
struct ifnet *ifp = fsw->fsw_ifp;
|
|
|
|
if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
|
|
!fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
|
|
fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
|
|
!flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
|
|
/*
|
|
* The source address is no longer around; we want this
|
|
* flow to be nonviable, but that requires holding the lock
|
|
* as writer (which isn't the case now.) Indicate that
|
|
* we need to finalize the nonviable later down below.
|
|
*
|
|
* We also request that the flow route be re-configured,
|
|
* if this is a connected mode flow.
|
|
*
|
|
*/
|
|
if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
|
|
/*
|
|
* fsw_pending_nonviable is a hint for reaper thread;
|
|
* due to the fact that setting fe_want_nonviable and
|
|
* incrementing fsw_pending_nonviable counter is not
|
|
* atomic, let the increment happen first, and the
|
|
* thread losing the CAS does decrement.
|
|
*/
|
|
os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
|
|
if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
|
|
fsw_reap_sched(fsw);
|
|
} else {
|
|
os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
|
|
}
|
|
}
|
|
if (fr != NULL) {
|
|
os_atomic_inc(&fr->fr_want_configure, relaxed);
|
|
}
|
|
}
|
|
|
|
/* if flow was (or is going to be) marked as nonviable, drop it */
|
|
if (__improbable(fe->fe_want_nonviable ||
|
|
(fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
|
|
SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
|
|
SK_KVA(fe));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
bool okay;
|
|
okay = dp_flow_route_process(fsw, fe);
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (okay) {
|
|
_FSW_INJECT_ERROR(5, okay, false, null_func);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
|
|
return okay;
|
|
}
|
|
|
|
void
|
|
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
|
|
uint32_t flags)
|
|
{
|
|
#pragma unused(flags)
|
|
struct pktq dpkts; /* dst pool alloc'ed packets */
|
|
struct pktq disposed_pkts; /* done src packets */
|
|
struct pktq dropped_pkts; /* dropped src packets */
|
|
struct pktq transferred_pkts; /* dst packet ready for ring */
|
|
struct __kern_packet *pkt, *tpkt;
|
|
struct kern_pbufpool *dpp;
|
|
uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
|
|
uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
|
|
uint16_t buf_array_iter = 0;
|
|
uint32_t cnt, buf_cnt = 0;
|
|
int err;
|
|
|
|
KPKTQ_INIT(&dpkts);
|
|
KPKTQ_INIT(&dropped_pkts);
|
|
KPKTQ_INIT(&disposed_pkts);
|
|
KPKTQ_INIT(&transferred_pkts);
|
|
|
|
if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
|
|
SK_ERR("Rx route bad");
|
|
fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
|
|
FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
|
|
goto done;
|
|
}
|
|
|
|
if (fe->fe_nx_port == FSW_VP_HOST) {
|
|
/*
|
|
* The host ring does not exist anymore so we can't take
|
|
* the enqueue path below. This path should only be hit
|
|
* for the rare tcp fragmentation case.
|
|
*/
|
|
fsw_host_rx(fsw, &fe->fe_rx_pktq);
|
|
return;
|
|
}
|
|
|
|
/* find the ring */
|
|
struct __kern_channel_ring *r;
|
|
r = fsw_flow_get_rx_ring(fsw, fe);
|
|
if (__improbable(r == NULL)) {
|
|
fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
|
|
goto done;
|
|
}
|
|
|
|
/* snoop before L2 is stripped */
|
|
if (__improbable(pktap_total_tap_count != 0)) {
|
|
fsw_snoop(fsw, fe, true);
|
|
}
|
|
|
|
dpp = r->ckr_pp;
|
|
/* batch allocate enough packets */
|
|
err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
|
|
SKMEM_NOSLEEP);
|
|
if (__improbable(err == ENOMEM)) {
|
|
ASSERT(KPKTQ_EMPTY(&dpkts));
|
|
KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
|
|
FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
|
|
SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts,
|
|
r->ckr_name, SK_KVA(r));
|
|
goto done;
|
|
}
|
|
|
|
/*
|
|
* estimate total number of buflets for the packet chain.
|
|
*/
|
|
cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp));
|
|
if (cnt > n_pkts) {
|
|
ASSERT(dpp->pp_max_frags > 1);
|
|
cnt -= n_pkts;
|
|
buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
|
|
err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
|
|
SKMEM_NOSLEEP, false);
|
|
if (__improbable(buf_cnt == 0)) {
|
|
KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
|
|
FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
|
|
SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
|
|
"0x%llu", cnt, err, r->ckr_name, SK_KVA(r));
|
|
goto done;
|
|
}
|
|
err = 0;
|
|
}
|
|
|
|
/* extra processing for user flow */
|
|
KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
|
|
err = 0;
|
|
KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
|
|
if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
|
|
fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
|
|
} else {
|
|
fe->fe_rx_pktq_bytes = 0;
|
|
}
|
|
err = flow_pkt_track(fe, pkt, true);
|
|
_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
|
|
if (__improbable(err != 0)) {
|
|
SK_ERR("flow_pkt_track failed (err %d)", err);
|
|
FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
|
|
/* if need to trigger RST */
|
|
if (err == ENETRESET) {
|
|
flow_track_abort_tcp(fe, pkt, NULL);
|
|
}
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
/* transfer to dpkt */
|
|
if (pkt->pkt_qum.qum_pp != dpp) {
|
|
struct __kern_buflet *bprev, *bnew;
|
|
struct __kern_packet *dpkt = NULL;
|
|
uint32_t n_bufs, i;
|
|
|
|
KPKTQ_DEQUEUE(&dpkts, dpkt);
|
|
if (__improbable(dpkt == NULL)) {
|
|
FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
|
|
n_bufs--;
|
|
for (i = 0; i < n_bufs; i++) {
|
|
if (__improbable(buf_cnt == 0)) {
|
|
ASSERT(dpp->pp_max_frags > 1);
|
|
buf_array_iter = 0;
|
|
cnt = howmany(fe->fe_rx_pktq_bytes,
|
|
PP_BUF_SIZE_DEF(dpp));
|
|
n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
|
|
if (cnt >= n_pkts) {
|
|
cnt -= n_pkts;
|
|
} else {
|
|
cnt = 0;
|
|
}
|
|
cnt += (n_bufs - i);
|
|
buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
|
|
cnt);
|
|
cnt = buf_cnt;
|
|
err = pp_alloc_buflet_batch(dpp,
|
|
buf_array, &buf_cnt,
|
|
SKMEM_NOSLEEP, false);
|
|
if (__improbable(buf_cnt == 0)) {
|
|
FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
|
|
KPKTQ_ENQUEUE(&dropped_pkts,
|
|
pkt);
|
|
pkt = NULL;
|
|
pp_free_packet_single(dpkt);
|
|
dpkt = NULL;
|
|
SK_ERR("failed to alloc %d "
|
|
"buflets (err %d) for "
|
|
"kr %s, 0x%llu", cnt, err,
|
|
r->ckr_name, SK_KVA(r));
|
|
break;
|
|
}
|
|
err = 0;
|
|
}
|
|
ASSERT(buf_cnt != 0);
|
|
if (i == 0) {
|
|
PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
|
|
}
|
|
bnew = (kern_buflet_t)buf_array[buf_array_iter];
|
|
buf_array[buf_array_iter] = 0;
|
|
buf_array_iter++;
|
|
buf_cnt--;
|
|
VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
|
|
bprev, bnew) == 0);
|
|
bprev = bnew;
|
|
}
|
|
if (__improbable(err != 0)) {
|
|
continue;
|
|
}
|
|
err = copy_packet_from_dev(fsw, pkt, dpkt);
|
|
_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
|
|
if (__improbable(err != 0)) {
|
|
SK_ERR("copy packet failed (err %d)", err);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
pp_free_packet_single(dpkt);
|
|
dpkt = NULL;
|
|
continue;
|
|
}
|
|
KPKTQ_ENQUEUE(&disposed_pkts, pkt);
|
|
pkt = dpkt;
|
|
}
|
|
_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
|
|
_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
|
|
pkt->pkt_policy_id = fe->fe_policy_id;
|
|
pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
|
|
pkt->pkt_transport_protocol = fe->fe_transport_protocol;
|
|
if (pkt->pkt_bufs_cnt > 1) {
|
|
pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
|
|
pkt->pkt_seg_cnt = 1;
|
|
}
|
|
KPKTQ_ENQUEUE(&transferred_pkts, pkt);
|
|
}
|
|
KPKTQ_FINI(&fe->fe_rx_pktq);
|
|
KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
|
|
KPKTQ_FINI(&transferred_pkts);
|
|
|
|
fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);
|
|
|
|
done:
|
|
/* Free unused buflets */
|
|
while (buf_cnt > 0) {
|
|
pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
|
|
buf_array[buf_array_iter] = 0;
|
|
buf_array_iter++;
|
|
buf_cnt--;
|
|
}
|
|
dp_free_pktq(fsw, &dpkts);
|
|
dp_free_pktq(fsw, &disposed_pkts);
|
|
dp_drop_pktq(fsw, &dropped_pkts);
|
|
}
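/*
 * Illustrative sketch (not part of the build): dp_flow_rx_process() sizes its
 * extra-buflet allocation with howmany(), i.e. a ceiling division of the bytes
 * still queued on the flow by the destination pool's buffer size, minus the
 * one buflet each destination packet already carries. The names below
 * (HOWMANY, extra_buflets) are hypothetical and only mirror that arithmetic.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* ceil(a / b), equivalent to the kernel's howmany() macro */
#define HOWMANY(a, b)   (((a) + (b) - 1) / (b))

static uint32_t
extra_buflets(uint32_t queued_bytes, uint32_t buf_size, uint32_t n_pkts)
{
	uint32_t total = HOWMANY(queued_bytes, buf_size);

	/* each allocated packet already owns one buflet */
	return (total > n_pkts) ? (total - n_pkts) : 0;
}

int
main(void)
{
	/* 9000 bytes queued, 2048-byte buflets, 3 packets -> 5 buflets, 2 extra */
	printf("%u\n", extra_buflets(9000, 2048, 3));
	return 0;
}
#endif /* illustrative sketch */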
|
|
|
|
static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    uint32_t flags)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
	    KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, fpd, etc.) */
	fe->fe_rx_process(fsw, fe, flags);

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);
}
|
|
|
|
static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong to the
	 * flowswitch, as wake packets for the host stack are handled by
	 * the host input function.
	 */
#if (DEBUG || DEVELOPMENT)
	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
		/*
		 * This is a one-shot command.
		 */
		fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;

		pkt->pkt_pflags |= PKT_F_WAKE_PKT;
	}
#endif /* (DEBUG || DEVELOPMENT) */
	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
	}
}
|
|
|
|
static void
|
|
_fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
|
|
{
|
|
struct __kern_packet *pkt, *tpkt;
|
|
struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
|
|
struct flow_entry *fe, *prev_fe;
|
|
sa_family_t af;
|
|
struct pktq host_pkts, dropped_pkts;
|
|
int err;
|
|
|
|
KPKTQ_INIT(&host_pkts);
|
|
KPKTQ_INIT(&dropped_pkts);
|
|
|
|
if (__improbable(FSW_QUIESCED(fsw))) {
|
|
DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
|
|
KPKTQ_CONCAT(&dropped_pkts, pktq);
|
|
goto done;
|
|
}
|
|
if (__improbable(fsw->fsw_demux == NULL)) {
|
|
KPKTQ_CONCAT(&dropped_pkts, pktq);
|
|
goto done;
|
|
}
|
|
|
|
prev_fe = NULL;
|
|
KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
|
|
if (__probable(tpkt)) {
|
|
void *baddr;
|
|
MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
|
|
SK_PREFETCH(baddr, 0);
|
|
/* prefetch L3 and L4 flow structs */
|
|
SK_PREFETCHW(tpkt->pkt_flow, 0);
|
|
SK_PREFETCHW(tpkt->pkt_flow, 128);
|
|
}
|
|
|
|
KPKTQ_REMOVE(pktq, pkt);
|
|
|
|
pkt = rx_prepare_packet(fsw, pkt);
|
|
|
|
af = fsw->fsw_demux(fsw, pkt);
|
|
if (__improbable(af == AF_UNSPEC)) {
|
|
KPKTQ_ENQUEUE(&host_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
|
|
_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
|
|
if (__improbable(err != 0)) {
|
|
FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
|
|
KPKTQ_ENQUEUE(&host_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
if (__improbable(pkt->pkt_flow_ip_is_frag)) {
|
|
pkt = rx_process_ip_frag(fsw, pkt);
|
|
if (pkt == NULL) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
|
|
if (__improbable(fe == NULL)) {
|
|
KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
|
|
|
|
dp_rx_process_wake_packet(fsw, pkt);
|
|
|
|
rx_flow_batch_packet(&fes, fe, pkt);
|
|
prev_fe = fe;
|
|
}
|
|
|
|
struct flow_entry *tfe = NULL;
|
|
TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
|
|
rx_flow_process(fsw, fe, 0);
|
|
TAILQ_REMOVE(&fes, fe, fe_rx_link);
|
|
fe->fe_rx_pktq_bytes = 0;
|
|
fe->fe_rx_frag_count = 0;
|
|
flow_entry_release(&fe);
|
|
}
|
|
|
|
if (!KPKTQ_EMPTY(&host_pkts)) {
|
|
fsw_host_rx(fsw, &host_pkts);
|
|
}
|
|
|
|
done:
|
|
dp_drop_pktq(fsw, &dropped_pkts);
|
|
}
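/*
 * Illustrative sketch (not part of the build): the Rx loop above passes the
 * previously matched flow entry (prev_fe) back into the lookup so that
 * back-to-back packets of the same flow can skip the full table walk. A
 * minimal user-space analogue of that "remember the last match" pattern,
 * with hypothetical names (full_lookup, lookup_with_hint):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

struct flow { uint64_t key; };

static struct flow flow_table[4] = {{1}, {2}, {3}, {4}};

/* stand-in for the real flow-table walk */
static struct flow *
full_lookup(uint64_t key)
{
	for (int i = 0; i < 4; i++) {
		if (flow_table[i].key == key) {
			return &flow_table[i];
		}
	}
	return NULL;
}

/* reuse the previous match when consecutive packets share a flow */
static struct flow *
lookup_with_hint(uint64_t key, struct flow *prev)
{
	if (prev != NULL && prev->key == key) {
		return prev;
	}
	return full_lookup(key);
}
#endif /* illustrative sketch */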
|
|
|
|
#if (DEVELOPMENT || DEBUG)
static void
fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
    struct __kern_packet *pkt)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];

	lck_mtx_lock_spin(&frt->frt_lock);
	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
	lck_mtx_unlock(&frt->frt_lock);
}
|
|
|
|
static void
fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];

	ASSERT(frt->frt_thread != THREAD_NULL);
	lck_mtx_lock_spin(&frt->frt_lock);
	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));

	frt->frt_requests++;
	if (!(frt->frt_flags & FRT_RUNNING)) {
		thread_wakeup((caddr_t)frt);
	}
	lck_mtx_unlock(&frt->frt_lock);
}
|
|
|
|
__attribute__((noreturn))
|
|
static void
|
|
fsw_rps_thread_cont(void *v, wait_result_t w)
|
|
{
|
|
struct fsw_rps_thread *frt = v;
|
|
struct nx_flowswitch *fsw = frt->frt_fsw;
|
|
|
|
lck_mtx_lock(&frt->frt_lock);
|
|
if (__improbable(w == THREAD_INTERRUPTIBLE ||
|
|
(frt->frt_flags & FRT_TERMINATING) != 0)) {
|
|
goto terminate;
|
|
}
|
|
if (KPKTQ_EMPTY(&frt->frt_pktq)) {
|
|
goto done;
|
|
}
|
|
frt->frt_flags |= FRT_RUNNING;
|
|
|
|
for (;;) {
|
|
uint32_t requests = frt->frt_requests;
|
|
struct pktq pkts;
|
|
|
|
KPKTQ_INIT(&pkts);
|
|
KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
|
|
lck_mtx_unlock(&frt->frt_lock);
|
|
|
|
sk_protect_t protect;
|
|
protect = sk_sync_protect();
|
|
FSW_RLOCK(fsw);
|
|
_fsw_receive_locked(fsw, &pkts);
|
|
FSW_RUNLOCK(fsw);
|
|
sk_sync_unprotect(protect);
|
|
|
|
lck_mtx_lock(&frt->frt_lock);
|
|
if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
|
|
requests == frt->frt_requests) {
|
|
frt->frt_requests = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
done:
|
|
lck_mtx_unlock(&frt->frt_lock);
|
|
if (!(frt->frt_flags & FRT_TERMINATING)) {
|
|
frt->frt_flags &= ~FRT_RUNNING;
|
|
assert_wait(frt, THREAD_UNINT);
|
|
thread_block_parameter(fsw_rps_thread_cont, frt);
|
|
__builtin_unreachable();
|
|
} else {
|
|
terminate:
|
|
LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
|
|
frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
|
|
frt->frt_flags |= FRT_TERMINATED;
|
|
|
|
if (frt->frt_flags & FRT_TERMINATEBLOCK) {
|
|
thread_wakeup((caddr_t)&frt);
|
|
}
|
|
lck_mtx_unlock(&frt->frt_lock);
|
|
|
|
SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
|
|
frt->frt_idx);
|
|
|
|
/* for the extra refcnt from kernel_thread_start() */
|
|
thread_deallocate(current_thread());
|
|
/* this is the end */
|
|
thread_terminate(current_thread());
|
|
/* NOTREACHED */
|
|
__builtin_unreachable();
|
|
}
|
|
|
|
/* must never get here */
|
|
VERIFY(0);
|
|
/* NOTREACHED */
|
|
__builtin_unreachable();
|
|
}
|
|
|
|
__attribute__((noreturn))
static void
fsw_rps_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct fsw_rps_thread *frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;

	char thread_name[MAXTHREADNAMESIZE];
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
	    if_name(fsw->fsw_ifp), frt->frt_idx);
	thread_set_thread_name(frt->frt_thread, thread_name);
	SK_D("%s spawned", thread_name);

	net_thread_marks_push(NET_THREAD_SYNC_RX);
	assert_wait(frt, THREAD_UNINT);
	(void) thread_block_parameter(fsw_rps_thread_cont, frt);

	__builtin_unreachable();
}
|
|
|
|
static void
|
|
fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
|
|
{
|
|
struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
|
|
uint64_t f = (1 * NSEC_PER_MSEC);
|
|
uint64_t s = (1000 * NSEC_PER_SEC);
|
|
uint32_t c = 0;
|
|
|
|
lck_mtx_lock(&frt->frt_lock);
|
|
frt->frt_flags |= FRT_TERMINATING;
|
|
|
|
while (!(frt->frt_flags & FRT_TERMINATED)) {
|
|
uint64_t t = 0;
|
|
nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
|
|
clock_absolutetime_interval_to_deadline(t, &t);
|
|
ASSERT(t != 0);
|
|
|
|
frt->frt_flags |= FRT_TERMINATEBLOCK;
|
|
if (!(frt->frt_flags & FRT_RUNNING)) {
|
|
thread_wakeup_one((caddr_t)frt);
|
|
}
|
|
(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
|
|
lck_mtx_unlock(&frt->frt_lock);
|
|
thread_block(THREAD_CONTINUE_NULL);
|
|
lck_mtx_lock(&frt->frt_lock);
|
|
frt->frt_flags &= ~FRT_TERMINATEBLOCK;
|
|
}
|
|
ASSERT(frt->frt_flags & FRT_TERMINATED);
|
|
lck_mtx_unlock(&frt->frt_lock);
|
|
frt->frt_thread = THREAD_NULL;
|
|
}
|
|
|
|
static void
fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
{
	kern_return_t error;
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
	frt->frt_idx = i;
	frt->frt_fsw = fsw;
	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
	ASSERT(!error);
	KPKTQ_INIT(&frt->frt_pktq);
}
|
|
|
|
int
|
|
fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
|
|
{
|
|
if (n > FSW_RPS_MAX_NTHREADS) {
|
|
SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
|
|
return EINVAL;
|
|
}
|
|
|
|
FSW_WLOCK(fsw);
|
|
if (n < fsw->fsw_rps_nthreads) {
|
|
for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
|
|
fsw_rps_thread_join(fsw, i);
|
|
}
|
|
fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
|
|
fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
|
|
Z_WAITOK | Z_ZERO | Z_NOFAIL);
|
|
} else if (n > fsw->fsw_rps_nthreads) {
|
|
fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
|
|
fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
|
|
Z_WAITOK | Z_ZERO | Z_NOFAIL);
|
|
for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) {
|
|
fsw_rps_thread_spawn(fsw, i);
|
|
}
|
|
}
|
|
fsw->fsw_rps_nthreads = n;
|
|
FSW_WUNLOCK(fsw);
|
|
return 0;
|
|
}
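/*
 * Illustrative sketch (not part of the build): fsw_rps_set_nthreads() joins
 * the excess threads before shrinking the array and spawns new ones after
 * growing it, so a live thread never sits outside the reallocated range. A
 * user-space analogue with hypothetical names (worker_join, worker_spawn,
 * resize_workers); the kernel uses krealloc_type() with Z_NOFAIL, so failure
 * handling here is simplified accordingly.
 */
#if 0
#include <stdlib.h>
#include <string.h>

struct worker { int idx; /* thread handle, queue, ... */ };

static void worker_join(struct worker *w) { (void)w; }
static void worker_spawn(struct worker *w, int idx) { w->idx = idx; }

static struct worker *
resize_workers(struct worker *ws, unsigned int old_n, unsigned int new_n)
{
	if (new_n < old_n) {
		/* stop the threads that fall off the end first */
		for (unsigned int i = new_n; i < old_n; i++) {
			worker_join(&ws[i]);
		}
	}
	ws = realloc(ws, new_n * sizeof(*ws));
	if (ws != NULL && new_n > old_n) {
		memset(&ws[old_n], 0, (new_n - old_n) * sizeof(*ws));
		for (unsigned int i = old_n; i < new_n; i++) {
			worker_spawn(&ws[i], (int)i);
		}
	}
	return ws;
}
#endif /* illustrative sketch */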
|
|
|
|
static uint32_t
get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	sa_family_t af = fsw->fsw_demux(fsw, pkt);
	if (__improbable(af == AF_UNSPEC)) {
		return 0;
	}

	flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);

	if (__improbable((pkt->pkt_qum_qflags &
	    QUM_F_FLOW_CLASSIFIED) == 0)) {
		return 0;
	}

	struct flow_key key;
	flow_pkt2key(pkt, true, &key);
	key.fk_mask = FKMASK_5TUPLE;

	uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;

	return id;
}
|
|
|
|
#endif /* !DEVELOPMENT && !DEBUG */
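/*
 * Illustrative sketch (not part of the build): get_rps_id() hashes the
 * packet's 5-tuple flow key and reduces it modulo the number of RPS threads,
 * so every packet of a given flow is steered to the same Rx thread. A
 * self-contained analogue using FNV-1a as a stand-in for flow_key_hash()
 * (hypothetical names; nthreads is assumed non-zero, as it is at the call
 * site above):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint32_t
fnv1a(uint32_t h, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len-- > 0) {
		h ^= *p++;
		h *= 16777619u;
	}
	return h;
}

static uint32_t
rps_thread_id(uint32_t saddr, uint32_t daddr, uint16_t sport,
    uint16_t dport, uint8_t proto, uint32_t nthreads)
{
	uint32_t h = 2166136261u;

	h = fnv1a(h, &saddr, sizeof(saddr));
	h = fnv1a(h, &daddr, sizeof(daddr));
	h = fnv1a(h, &sport, sizeof(sport));
	h = fnv1a(h, &dport, sizeof(dport));
	h = fnv1a(h, &proto, sizeof(proto));
	return h % nthreads;    /* same 5-tuple -> same Rx thread */
}
#endif /* illustrative sketch */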
|
|
|
|
void
|
|
fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
|
|
{
|
|
FSW_RLOCK(fsw);
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (fsw->fsw_rps_nthreads != 0) {
|
|
struct __kern_packet *pkt, *tpkt;
|
|
bitmap_t map = 0;
|
|
|
|
_CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
|
|
KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
|
|
uint32_t id = get_rps_id(fsw, pkt);
|
|
KPKTQ_REMOVE(pktq, pkt);
|
|
fsw_rps_rx(fsw, id, pkt);
|
|
bitmap_set(&map, id);
|
|
}
|
|
for (int i = bitmap_first(&map, 64); i >= 0;
|
|
i = bitmap_next(&map, i)) {
|
|
fsw_rps_thread_schedule(fsw, i);
|
|
}
|
|
} else
|
|
#endif /* !DEVELOPMENT && !DEBUG */
|
|
{
|
|
_fsw_receive_locked(fsw, pktq);
|
|
}
|
|
FSW_RUNLOCK(fsw);
|
|
}
|
|
|
|
int
|
|
fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts,
|
|
uint32_t n_pkts)
|
|
{
|
|
#pragma unused(handle)
|
|
struct nx_flowswitch *fsw = handle;
|
|
struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
|
|
struct pktq pktq;
|
|
sk_protect_t protect;
|
|
uint32_t i;
|
|
|
|
ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
|
|
|
|
for (i = 0; i < n_pkts; i++) {
|
|
ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
|
|
ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
|
|
kpkts[i] = pkts[i].pktsched_pkt_kpkt;
|
|
}
|
|
|
|
protect = sk_sync_protect();
|
|
KPKTQ_INIT(&pktq);
|
|
pkts_to_pktq(kpkts, n_pkts, &pktq);
|
|
|
|
fsw_receive(fsw, &pktq);
|
|
KPKTQ_FINI(&pktq);
|
|
sk_sync_unprotect(protect);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
{
	classq_pkt_t p;
	struct netem *ne;
	struct __kern_packet *pkt, *tpkt;

	ASSERT(fsw->fsw_ifp != NULL);
	ne = fsw->fsw_ifp->if_input_netem;
	ASSERT(ne != NULL);
	KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
		bool pdrop;
		KPKTQ_REMOVE(q, pkt);
		CLASSQ_PKT_INIT_PACKET(&p, pkt);
		netem_enqueue(ne, &p, &pdrop);
	}
}
|
|
|
|
void
|
|
fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
|
|
struct nexus_pkt_stats *out_stats)
|
|
{
|
|
struct __kern_packet *pkt = pkt_head, *next;
|
|
struct nx_flowswitch *fsw;
|
|
uint32_t n_bytes = 0, n_pkts = 0;
|
|
uint64_t total_pkts = 0, total_bytes = 0;
|
|
struct pktq q;
|
|
|
|
KPKTQ_INIT(&q);
|
|
if (__improbable(devna->na_ifp == NULL ||
|
|
(fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
|
|
SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
|
|
pp_free_packet_chain(pkt_head, NULL);
|
|
return;
|
|
}
|
|
while (pkt != NULL) {
|
|
if (__improbable(pkt->pkt_trace_id != 0)) {
|
|
KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
|
|
KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
|
|
}
|
|
next = pkt->pkt_nextpkt;
|
|
pkt->pkt_nextpkt = NULL;
|
|
|
|
if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
|
|
KPKTQ_ENQUEUE(&q, pkt);
|
|
n_bytes += pkt->pkt_length;
|
|
} else {
|
|
DTRACE_SKYWALK1(non__finalized__drop,
|
|
struct __kern_packet *, pkt);
|
|
FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
|
|
pp_free_packet_single(pkt);
|
|
pkt = NULL;
|
|
}
|
|
n_pkts = KPKTQ_LEN(&q);
|
|
if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
|
|
if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
|
|
fsw_dev_input_netem_enqueue(fsw, &q);
|
|
} else {
|
|
fsw_receive(fsw, &q);
|
|
}
|
|
total_pkts += n_pkts;
|
|
total_bytes += n_bytes;
|
|
n_pkts = 0;
|
|
n_bytes = 0;
|
|
KPKTQ_FINI(&q);
|
|
}
|
|
pkt = next;
|
|
}
|
|
ASSERT(KPKTQ_LEN(&q) == 0);
|
|
FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
|
|
if (out_stats != NULL) {
|
|
out_stats->nps_pkts = total_pkts;
|
|
out_stats->nps_bytes = total_bytes;
|
|
}
|
|
KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
|
|
}
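/*
 * Illustrative sketch (not part of the build): fsw_devna_rx() walks the
 * driver's singly linked packet chain and flushes the accumulated queue
 * either when it reaches the batch limit (fsw_rx_batch) or at the end of the
 * chain. A minimal analogue of that batching loop with hypothetical names
 * (flush_batch, rx_chain); batch_max is assumed to be at most 64 here.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct pkt {
	struct pkt *next;
	uint32_t len;
};

static void
flush_batch(struct pkt **batch, uint32_t n)
{
	(void)batch;
	printf("flush %u pkts\n", n);
}

static void
rx_chain(struct pkt *head, uint32_t batch_max)
{
	struct pkt *batch[64];
	uint32_t n = 0;

	for (struct pkt *p = head, *next; p != NULL; p = next) {
		next = p->next;
		p->next = NULL;        /* detach before handing off */
		batch[n++] = p;
		if (n == batch_max || next == NULL) {
			flush_batch(batch, n);
			n = 0;
		}
	}
}
#endif /* illustrative sketch */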
|
|
|
|
static int
|
|
dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
|
|
struct __kern_packet *dpkt)
|
|
{
|
|
struct mbuf *m = NULL;
|
|
uint32_t bdlen, bdlim, bdoff;
|
|
uint8_t *bdaddr;
|
|
unsigned int one = 1;
|
|
int err = 0;
|
|
|
|
err = mbuf_allocpacket(MBUF_DONTWAIT,
|
|
(fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (m != NULL) {
|
|
_FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
if (__improbable(m == NULL)) {
|
|
FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
|
|
err = ENOBUFS;
|
|
goto done;
|
|
}
|
|
|
|
MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
|
|
if (fsw->fsw_frame_headroom > bdlim) {
|
|
SK_ERR("not enough space in buffer for headroom");
|
|
err = EINVAL;
|
|
goto done;
|
|
}
|
|
|
|
dpkt->pkt_headroom = fsw->fsw_frame_headroom;
|
|
dpkt->pkt_mbuf = m;
|
|
dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
|
|
|
|
/* packet copy into mbuf */
|
|
fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
|
|
METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
|
|
fsw->fsw_frame_headroom, spkt->pkt_length,
|
|
PACKET_HAS_PARTIAL_CHECKSUM(spkt),
|
|
spkt->pkt_csum_tx_start_off);
|
|
FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
|
|
|
|
/* header copy into dpkt buffer for classification */
|
|
kern_packet_t sph = SK_PTR_ENCODE(spkt,
|
|
METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
|
|
kern_packet_t dph = SK_PTR_ENCODE(dpkt,
|
|
METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
|
|
uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
|
|
fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
|
|
sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
|
|
|
|
	/*
	 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data the
	 * same as the buflet baddr. m_data always points to the beginning of
	 * the packet and should represent the same location as
	 * baddr + headroom.
	 */
|
|
ASSERT((uintptr_t)m->m_data ==
|
|
((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
|
|
|
|
done:
|
|
return err;
|
|
}
|
|
|
|
static int
|
|
dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
|
|
struct __kern_packet *dpkt)
|
|
{
|
|
struct ifnet *ifp = fsw->fsw_ifp;
|
|
uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
|
|
|
|
if (headroom > UINT8_MAX) {
|
|
SK_ERR("headroom too large %d", headroom);
|
|
return ERANGE;
|
|
}
|
|
dpkt->pkt_headroom = (uint8_t)headroom;
|
|
ASSERT((dpkt->pkt_headroom & 0x7) == 0);
|
|
dpkt->pkt_l2_len = 0;
|
|
dpkt->pkt_link_flags = spkt->pkt_link_flags;
|
|
|
|
kern_packet_t sph = SK_PTR_ENCODE(spkt,
|
|
METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
|
|
kern_packet_t dph = SK_PTR_ENCODE(dpkt,
|
|
METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
|
|
fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
|
|
dpkt->pkt_headroom, sph, spkt->pkt_headroom,
|
|
spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
|
|
(spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
|
|
(spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
|
|
(spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
|
|
|
|
FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if SK_LOG
|
|
/* Hoisted out of line to reduce kernel stack footprint */
|
|
SK_LOG_ATTRIBUTE
|
|
static void
|
|
dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
|
|
struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
|
|
{
|
|
struct proc *p = current_proc();
|
|
struct ifnet *ifp = fsw->fsw_ifp;
|
|
uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
|
|
|
|
if (error == ERANGE) {
|
|
SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
|
|
"dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
|
|
(uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
|
|
(uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
|
|
} else if (error == ENOBUFS) {
|
|
SK_DF(logflags, "%s(%d) packet allocation failure",
|
|
sk_proc_name_address(p), sk_proc_pid(p));
|
|
} else if (error == 0) {
|
|
ASSERT(dpkt != NULL);
|
|
char *daddr;
|
|
MD_BUFLET_ADDR_ABS(dpkt, daddr);
|
|
SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
|
|
sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
|
|
dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
|
|
(uint32_t)fsw->fsw_frame_headroom,
|
|
(uint32_t)ifp->if_tx_headroom);
|
|
SK_DF(logflags | SK_VERB_DUMP, "%s",
|
|
sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
|
|
} else {
|
|
		SK_DF(logflags, "%s(%d) error %d",
		    sk_proc_name_address(p), sk_proc_pid(p), error);
|
|
}
|
|
}
|
|
#else
|
|
#define dp_copy_to_dev_log(...)
|
|
#endif /* SK_LOG */
|
|
|
|
static void
|
|
fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
|
|
{
|
|
ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
|
|
ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
|
|
|
|
SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
|
|
/* Copy packet metadata */
|
|
_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
|
|
_PKT_COPY(spkt, dpkt);
|
|
_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
|
|
ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
|
|
!PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
|
|
ASSERT(dpkt->pkt_mbuf == NULL);
|
|
|
|
/* Copy AQM metadata */
|
|
dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
|
|
dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
|
|
_CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
|
|
_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
|
|
_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
|
|
dpkt->pkt_policy_id = spkt->pkt_policy_id;
|
|
dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
|
|
}
|
|
|
|
static int
|
|
dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
|
|
struct __kern_packet *dpkt)
|
|
{
|
|
const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
|
|
struct ifnet *ifp = fsw->fsw_ifp;
|
|
uint32_t dev_pkt_len;
|
|
int err = 0;
|
|
|
|
fsw_pkt_copy_metadata(spkt, dpkt);
|
|
switch (fsw->fsw_classq_enq_ptype) {
|
|
case QP_MBUF:
|
|
err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
|
|
break;
|
|
|
|
case QP_PACKET:
|
|
dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
|
|
spkt->pkt_length;
|
|
if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
|
|
FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
|
|
err = ERANGE;
|
|
goto done;
|
|
}
|
|
err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
|
|
break;
|
|
|
|
default:
|
|
VERIFY(0);
|
|
__builtin_unreachable();
|
|
}
|
|
done:
|
|
dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
|
|
return err;
|
|
}
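/*
 * Illustrative sketch (not part of the build): when copying a flowswitch
 * packet into a native device packet, the combined frame and driver Tx
 * headroom must fit the 8-bit pkt_headroom field, and headroom plus payload
 * must fit within the device pool's buffer space, otherwise the copy fails
 * with ERANGE. A standalone restatement of those two bounds checks
 * (hypothetical helper name check_dev_fit):
 */
#if 0
#include <errno.h>
#include <stdint.h>

static int
check_dev_fit(uint32_t frame_hr, uint32_t tx_hr, uint32_t pkt_len,
    uint32_t max_frags, uint32_t buf_size)
{
	uint32_t headroom = frame_hr + tx_hr;

	if (headroom > UINT8_MAX) {
		return ERANGE;          /* headroom must fit in uint8_t */
	}
	if (headroom + pkt_len > max_frags * buf_size) {
		return ERANGE;          /* frame larger than the dev pool allows */
	}
	return 0;
}
#endif /* illustrative sketch */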
|
|
|
|
static int
|
|
dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
|
|
struct __kern_packet *dpkt)
|
|
{
|
|
uint8_t *sbaddr, *dbaddr;
|
|
uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
|
|
uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
|
|
|
|
fsw_pkt_copy_metadata(spkt, dpkt);
|
|
|
|
MD_BUFLET_ADDR_ABS(spkt, sbaddr);
|
|
ASSERT(sbaddr != NULL);
|
|
sbaddr += spkt->pkt_headroom;
|
|
|
|
MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
|
|
ASSERT(dbaddr != NULL);
|
|
dpkt->pkt_headroom = (uint8_t)headroom;
|
|
dbaddr += headroom;
|
|
|
|
pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
|
|
METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
|
|
|
|
/* packet length is set to the full length */
|
|
dpkt->pkt_length = spkt->pkt_length;
|
|
dpkt->pkt_pflags |= PKT_F_TRUNCATED;
|
|
return 0;
|
|
}
|
|
|
|
static struct mbuf *
|
|
convert_pkt_to_mbuf(struct __kern_packet *pkt)
|
|
{
|
|
ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
|
|
ASSERT(pkt->pkt_mbuf != NULL);
|
|
struct mbuf *m = pkt->pkt_mbuf;
|
|
|
|
/* pass additional metadata generated from flow parse/lookup */
|
|
_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
|
|
sizeof(pkt->pkt_flow_token));
|
|
_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
|
|
sizeof(pkt->pkt_flowsrc_token));
|
|
_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
|
|
sizeof(pkt->pkt_flowsrc_fidx));
|
|
m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
|
|
m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
|
|
m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
|
|
m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
|
|
m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
|
|
m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
|
|
m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
|
|
|
|
if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
|
|
m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
|
|
}
|
|
|
|
/* The packet should have a timestamp by the time we get here. */
|
|
m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
|
|
m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
|
|
|
|
m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
|
|
m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
|
|
/* set pkt_hdr so that AQM can find IP header and mark ECN bits */
|
|
m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;
|
|
|
|
if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
|
|
m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
|
|
}
|
|
KPKT_CLEAR_MBUF_DATA(pkt);
|
|
|
|
/* mbuf has been consumed, release packet as well */
|
|
ASSERT(pkt->pkt_qum.qum_ksd == NULL);
|
|
pp_free_packet_single(pkt);
|
|
return m;
|
|
}
|
|
|
|
static void
|
|
convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
|
|
struct mbuf **head, struct mbuf **tail,
|
|
uint32_t *cnt, uint32_t *bytes)
|
|
{
|
|
struct __kern_packet *pkt = pkt_list, *next;
|
|
struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL;
|
|
uint32_t c = 0, b = 0;
|
|
|
|
while (pkt != NULL) {
|
|
next = pkt->pkt_nextpkt;
|
|
pkt->pkt_nextpkt = NULL;
|
|
m = convert_pkt_to_mbuf(pkt);
|
|
ASSERT(m != NULL);
|
|
|
|
*m_tailp = m;
|
|
m_tailp = &m->m_nextpkt;
|
|
c++;
|
|
b += m_pktlen(m);
|
|
pkt = next;
|
|
}
|
|
if (head != NULL) {
|
|
*head = m_head;
|
|
}
|
|
if (tail != NULL) {
|
|
*tail = m;
|
|
}
|
|
if (cnt != NULL) {
|
|
*cnt = c;
|
|
}
|
|
if (bytes != NULL) {
|
|
*bytes = b;
|
|
}
|
|
}
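/*
 * Illustrative sketch (not part of the build): convert_pkt_to_mbuf_list()
 * builds its mbuf chain with a pointer-to-tail-pointer (m_tailp), which
 * appends in O(1) and needs no special case for the empty list. The same
 * pattern on a generic singly linked list (hypothetical names):
 */
#if 0
#include <stddef.h>

struct node {
	struct node *next;
	int val;
};

static struct node *
build_list(struct node *items, int n)
{
	struct node *head = NULL, **tailp = &head;

	for (int i = 0; i < n; i++) {
		items[i].next = NULL;
		*tailp = &items[i];     /* link into the list */
		tailp = &items[i].next; /* remember where the next link goes */
	}
	return head;
}
#endif /* illustrative sketch */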
|
|
|
|
SK_NO_INLINE_ATTRIBUTE
|
|
static int
|
|
classq_enqueue_flow_single(struct nx_flowswitch *fsw,
|
|
struct __kern_packet *pkt)
|
|
{
|
|
struct ifnet *ifp = fsw->fsw_ifp;
|
|
boolean_t pkt_drop = FALSE;
|
|
int err;
|
|
|
|
FSW_LOCK_ASSERT_HELD(fsw);
|
|
ASSERT(fsw->fsw_classq_enabled);
|
|
ASSERT(pkt->pkt_flow_token != 0);
|
|
fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
|
|
1, pkt->pkt_length);
|
|
|
|
if (__improbable(pkt->pkt_trace_id != 0)) {
|
|
KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
|
|
KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
|
|
}
|
|
|
|
switch (fsw->fsw_classq_enq_ptype) {
|
|
case QP_MBUF: { /* compat interface */
|
|
struct mbuf *m;
|
|
|
|
m = convert_pkt_to_mbuf(pkt);
|
|
ASSERT(m != NULL);
|
|
pkt = NULL;
|
|
|
|
/* ifnet_enqueue consumes mbuf */
|
|
err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
|
|
m = NULL;
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (__improbable(!pkt_drop)) {
|
|
_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
if (pkt_drop) {
|
|
FSW_STATS_INC(FSW_STATS_DROP);
|
|
FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
|
|
}
|
|
break;
|
|
}
|
|
case QP_PACKET: { /* native interface */
|
|
/* ifnet_enqueue consumes packet */
|
|
err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
|
|
pkt = NULL;
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (__improbable(!pkt_drop)) {
|
|
_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
if (pkt_drop) {
|
|
FSW_STATS_INC(FSW_STATS_DROP);
|
|
FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
err = EINVAL;
|
|
VERIFY(0);
|
|
/* NOTREACHED */
|
|
__builtin_unreachable();
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
static int
|
|
classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
|
|
struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
|
|
uint32_t cnt, uint32_t bytes)
|
|
{
|
|
struct ifnet *ifp = fsw->fsw_ifp;
|
|
boolean_t pkt_drop = FALSE;
|
|
uint32_t svc;
|
|
int err;
|
|
|
|
FSW_LOCK_ASSERT_HELD(fsw);
|
|
ASSERT(fsw->fsw_classq_enabled);
|
|
ASSERT(pkt_head->pkt_flow_token != 0);
|
|
|
|
/*
|
|
* All packets in the flow should have the same svc.
|
|
*/
|
|
svc = pkt_head->pkt_svc_class;
|
|
fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
|
|
|
|
switch (fsw->fsw_classq_enq_ptype) {
|
|
case QP_MBUF: { /* compat interface */
|
|
struct mbuf *m_head = NULL, *m_tail = NULL;
|
|
uint32_t c = 0, b = 0;
|
|
|
|
convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
|
|
ASSERT(m_head != NULL && m_tail != NULL);
|
|
ASSERT(c == cnt);
|
|
ASSERT(b == bytes);
|
|
pkt_head = NULL;
|
|
|
|
/* ifnet_enqueue consumes mbuf */
|
|
err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
|
|
bytes, FALSE, &pkt_drop);
|
|
m_head = NULL;
|
|
m_tail = NULL;
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (__improbable(!pkt_drop)) {
|
|
_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
if (pkt_drop) {
|
|
STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
|
|
STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
|
|
cnt);
|
|
}
|
|
break;
|
|
}
|
|
case QP_PACKET: { /* native interface */
|
|
/* ifnet_enqueue consumes packet */
|
|
err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
|
|
bytes, FALSE, &pkt_drop);
|
|
pkt_head = NULL;
|
|
#if (DEVELOPMENT || DEBUG)
|
|
if (__improbable(!pkt_drop)) {
|
|
_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
if (pkt_drop) {
|
|
STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
|
|
STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
|
|
cnt);
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
err = EINVAL;
|
|
VERIFY(0);
|
|
/* NOTREACHED */
|
|
__builtin_unreachable();
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
/*
|
|
* This code path needs to be kept for interfaces without logical link support.
|
|
*/
|
|
static void
|
|
classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
|
|
bool chain, uint32_t cnt, uint32_t bytes)
|
|
{
|
|
bool flowadv_is_set = false;
|
|
struct __kern_packet *pkt, *tail, *tpkt;
|
|
flowadv_idx_t flow_adv_idx;
|
|
bool flowadv_cap;
|
|
flowadv_token_t flow_adv_token;
|
|
int err;
|
|
|
|
SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
|
|
if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
|
|
|
|
if (chain) {
|
|
pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
|
|
tail = KPKTQ_LAST(&fe->fe_tx_pktq);
|
|
KPKTQ_INIT(&fe->fe_tx_pktq);
|
|
if (pkt == NULL) {
|
|
return;
|
|
}
|
|
flow_adv_idx = pkt->pkt_flowsrc_fidx;
|
|
flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
|
|
flow_adv_token = pkt->pkt_flow_token;
|
|
|
|
err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
|
|
|
|
/* set flow advisory if needed */
|
|
if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
|
|
flowadv_cap)) {
|
|
flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
|
|
flow_adv_idx, flow_adv_token);
|
|
}
|
|
DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes,
|
|
bool, flowadv_is_set);
|
|
} else {
|
|
uint32_t c = 0, b = 0;
|
|
|
|
KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
|
|
KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
|
|
|
|
flow_adv_idx = pkt->pkt_flowsrc_fidx;
|
|
flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
|
|
flow_adv_token = pkt->pkt_flow_token;
|
|
|
|
c++;
|
|
b += pkt->pkt_length;
|
|
err = classq_enqueue_flow_single(fsw, pkt);
|
|
|
|
/* set flow advisory if needed */
|
|
if (__improbable(!flowadv_is_set &&
|
|
((err == EQFULL || err == EQSUSPENDED) &&
|
|
flowadv_cap))) {
|
|
flowadv_is_set = na_flowadv_set(
|
|
flow_get_na(fsw, fe), flow_adv_idx,
|
|
flow_adv_token);
|
|
}
|
|
}
|
|
ASSERT(c == cnt);
|
|
ASSERT(b == bytes);
|
|
DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
|
|
bool, flowadv_is_set);
|
|
}
|
|
|
|
/* notify flow advisory event */
|
|
if (__improbable(flowadv_is_set)) {
|
|
struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
|
|
if (__probable(r)) {
|
|
na_flowadv_event(r);
|
|
SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
|
|
"%s(%d) notified of flow update",
|
|
sk_proc_name_address(current_proc()),
|
|
sk_proc_pid(current_proc()));
|
|
}
|
|
}
|
|
}
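/*
 * Illustrative sketch (not part of the build): when AQM reports the queue as
 * full or suspended and the flow is advisory-capable, the code above latches
 * a flow advisory once and then wakes the channel ring so the sender backs
 * off. A condensed restatement of that decision, with hypothetical stand-ins
 * for na_flowadv_set()/na_flowadv_event() and illustrative values for
 * EQFULL/EQSUSPENDED:
 */
#if 0
#include <stdbool.h>

#define EQFULL       1001       /* illustrative values only */
#define EQSUSPENDED  1002

static bool
mark_flow_throttled(unsigned int idx, unsigned int token)
{
	(void)idx; (void)token;
	return true;
}

static void
notify_channel(void)
{
}

static void
enqueue_feedback(int enq_err, bool adv_capable, unsigned int idx,
    unsigned int token)
{
	if ((enq_err == EQFULL || enq_err == EQSUSPENDED) && adv_capable) {
		if (mark_flow_throttled(idx, token)) {
			notify_channel();
		}
	}
}
#endif /* illustrative sketch */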
|
|
|
|
/*
|
|
* Logical link code path
|
|
*/
|
|
static void
|
|
classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
|
|
bool chain, uint32_t cnt, uint32_t bytes)
|
|
{
|
|
#pragma unused(chain)
|
|
struct __kern_packet *pkt, *tail;
|
|
flowadv_idx_t flow_adv_idx;
|
|
bool flowadv_is_set = false;
|
|
bool flowadv_cap;
|
|
flowadv_token_t flow_adv_token;
|
|
uint32_t flowctl = 0, dropped = 0;
|
|
int err;
|
|
|
|
SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
|
|
if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
|
|
|
|
pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
|
|
tail = KPKTQ_LAST(&fe->fe_tx_pktq);
|
|
KPKTQ_INIT(&fe->fe_tx_pktq);
|
|
if (pkt == NULL) {
|
|
return;
|
|
}
|
|
flow_adv_idx = pkt->pkt_flowsrc_fidx;
|
|
flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
|
|
flow_adv_token = pkt->pkt_flow_token;
|
|
|
|
err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
|
|
&flowctl, &dropped);
|
|
|
|
if (__improbable(err != 0)) {
|
|
/* set flow advisory if needed */
|
|
if (flowctl > 0 && flowadv_cap) {
|
|
flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
|
|
flow_adv_idx, flow_adv_token);
|
|
|
|
/* notify flow advisory event */
|
|
if (flowadv_is_set) {
|
|
struct __kern_channel_ring *r =
|
|
fsw_flow_get_tx_ring(fsw, fe);
|
|
if (__probable(r)) {
|
|
na_flowadv_event(r);
|
|
SK_DF(SK_VERB_FLOW_ADVISORY |
|
|
SK_VERB_TX,
|
|
"%s(%d) notified of flow update",
|
|
sk_proc_name_address(current_proc()),
|
|
sk_proc_pid(current_proc()));
|
|
}
|
|
}
|
|
}
|
|
if (dropped > 0) {
|
|
STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
|
|
STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
|
|
dropped);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
#pragma unused(fsw)
	/* finalize here; no more changes to buflets after classq */
	if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
		kern_packet_t ph = SK_PTR_ENCODE(pkt,
		    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
		int err = __packet_finalize(ph);
		VERIFY(err == 0);
	}
}
|
|
|
|
static bool
|
|
dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
struct flow_route *fr = fe->fe_route;
|
|
int err;
|
|
|
|
ASSERT(fr != NULL);
|
|
|
|
if (__improbable(!dp_flow_route_process(fsw, fe))) {
|
|
return false;
|
|
}
|
|
if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
|
|
flow_qset_select_dynamic(fsw, fe, TRUE);
|
|
}
|
|
|
|
_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
|
|
_fsw_error35_handler, 1, fr, NULL, NULL);
|
|
_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
|
|
_fsw_error36_handler, 1, fr, NULL);
|
|
|
|
/*
|
|
* See if we need to resolve the flow route; note the test against
|
|
* fr_flags here is done without any lock for performance. Thus
|
|
* it's possible that we race against the thread performing route
|
|
* event updates for a packet (which is OK). In any case we should
|
|
* not have any assertion on fr_flags value(s) due to the lack of
|
|
* serialization.
|
|
*/
|
|
if (fr->fr_flags & FLOWRTF_RESOLVED) {
|
|
goto frame;
|
|
}
|
|
|
|
struct __kern_packet *pkt, *tpkt;
|
|
KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
|
|
err = fsw->fsw_resolve(fsw, fr, pkt);
|
|
_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
|
|
_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
|
|
/*
|
|
* If resolver returns EJUSTRETURN then we drop the pkt as the
|
|
* resolver should have converted the pkt into mbuf (or
|
|
* detached the attached mbuf from pkt) and added it to the
|
|
* llinfo queue. If we do have a cached llinfo, then proceed
|
|
* to using it even though it may be stale (very unlikely)
|
|
* while the resolution is in progress.
|
|
* Otherwise, any other error results in dropping pkt.
|
|
*/
|
|
if (err == EJUSTRETURN) {
|
|
KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
|
|
pp_free_packet_single(pkt);
|
|
FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
|
|
continue;
|
|
} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
|
|
/* use existing llinfo */
|
|
FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
|
|
} else if (err != 0) {
|
|
KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
|
|
pp_free_packet_single(pkt);
|
|
FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
frame:
|
|
KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
|
|
if (fsw->fsw_frame != NULL) {
|
|
fsw->fsw_frame(fsw, fr, pkt);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
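/*
 * Illustrative sketch (not part of the build): the resolver loop above
 * encodes a small contract -- success transmits, EJUSTRETURN means the
 * resolver kept the packet itself (so the copy here is dropped as pending),
 * any other error falls back to cached link-layer info when available and
 * otherwise drops. A compact restatement with hypothetical names; the
 * EJUSTRETURN value is defined here only for the sketch.
 */
#if 0
#include <stdbool.h>

#ifndef EJUSTRETURN
#define EJUSTRETURN (-2)        /* kernel-internal errno, for the sketch */
#endif

enum resolve_action {
	RESOLVE_TX,             /* resolved, frame and transmit */
	RESOLVE_PENDING_DROP,   /* resolver queued it, drop our copy */
	RESOLVE_STALE_TX,       /* transmit with possibly stale llinfo */
	RESOLVE_FAIL_DROP       /* no llinfo at all, drop */
};

static enum resolve_action
classify_resolve_result(int err, bool has_cached_llinfo)
{
	if (err == 0) {
		return RESOLVE_TX;
	}
	if (err == EJUSTRETURN) {
		return RESOLVE_PENDING_DROP;
	}
	return has_cached_llinfo ? RESOLVE_STALE_TX : RESOLVE_FAIL_DROP;
}
#endif /* illustrative sketch */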
|
|
|
|
static void
|
|
dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
|
|
{
|
|
#pragma unused(fsw)
|
|
struct __kern_packet *pkt, *tpkt;
|
|
KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
|
|
KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
|
|
/* listener is only allowed TCP RST */
|
|
if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
|
|
(pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
|
|
flow_track_abort_tcp(fe, NULL, pkt);
|
|
} else {
|
|
char *addr;
|
|
MD_BUFLET_ADDR_ABS(pkt, addr);
|
|
SK_ERR("listener flow sends non-RST packet %s",
|
|
sk_dump(sk_proc_name_address(current_proc()),
|
|
addr, pkt->pkt_length, 128, NULL, 0));
|
|
}
|
|
pp_free_packet_single(pkt);
|
|
}
|
|
}
|
|
|
|
static void
|
|
fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
|
|
volatile uint64_t *rt_ts, ifnet_t ifp)
|
|
{
|
|
struct timespec now;
|
|
uint64_t now_nsec = 0;
|
|
|
|
if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
|
|
nanouptime(&now);
|
|
net_timernsec(&now, &now_nsec);
|
|
pkt->pkt_timestamp = now_nsec;
|
|
}
|
|
pkt->pkt_pflags &= ~PKT_F_TS_VALID;
|
|
|
|
/*
|
|
* If the packet service class is not background,
|
|
* update the timestamps on the interface, as well as
|
|
* the ones in nexus-wide advisory to indicate recent
|
|
* activity on a foreground flow.
|
|
*/
|
|
if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
|
|
ifp->if_fg_sendts = (uint32_t)_net_uptime;
|
|
if (fg_ts != NULL) {
|
|
*fg_ts = _net_uptime;
|
|
}
|
|
}
|
|
if (pkt->pkt_pflags & PKT_F_REALTIME) {
|
|
ifp->if_rt_sendts = (uint32_t)_net_uptime;
|
|
if (rt_ts != NULL) {
|
|
*rt_ts = _net_uptime;
|
|
}
|
|
}
|
|
}
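/*
 * Illustrative sketch (not part of the build): fsw_update_timestamps() stamps
 * a packet with monotonic uptime in nanoseconds only when it does not already
 * carry a valid timestamp. A user-space equivalent of that "stamp if missing"
 * step using clock_gettime() in place of nanouptime()/net_timernsec()
 * (hypothetical helper names):
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t
uptime_nsec(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + (uint64_t)ts.tv_nsec;
}

static void
stamp_if_missing(uint64_t *pkt_ts, bool ts_valid)
{
	if (!ts_valid || *pkt_ts == 0) {
		*pkt_ts = uptime_nsec();
	}
}
#endif /* illustrative sketch */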
|
|
|
|
static bool
fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled)
{
	return fsw_chain_enqueue != 0 &&
	    fsw->fsw_ifp->if_output_netem == NULL &&
	    (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
	    gso_enabled;
}
|
|
|
|
void
|
|
dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
|
|
uint32_t flags)
|
|
{
|
|
struct pktq dropped_pkts;
|
|
bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
|
|
uint32_t cnt = 0, bytes = 0;
|
|
volatile struct sk_nexusadv *nxadv = NULL;
|
|
volatile uint64_t *fg_ts = NULL;
|
|
volatile uint64_t *rt_ts = NULL;
|
|
uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
|
|
|
|
KPKTQ_INIT(&dropped_pkts);
|
|
ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
|
|
if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
|
|
dp_listener_flow_tx_process(fsw, fe);
|
|
return;
|
|
}
|
|
if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
|
|
SK_RDERR(5, "Tx route bad");
|
|
FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
|
|
KPKTQ_LEN(&fe->fe_tx_pktq));
|
|
KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
|
|
goto done;
|
|
}
|
|
chain = fsw_chain_enqueue_enabled(fsw, gso);
|
|
if (chain) {
|
|
nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
|
|
if (nxadv != NULL) {
|
|
fg_ts = &nxadv->nxadv_fg_sendts;
|
|
rt_ts = &nxadv->nxadv_rt_sendts;
|
|
}
|
|
}
|
|
struct __kern_packet *pkt, *tpkt;
|
|
KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
|
|
int err = 0;
|
|
|
|
err = flow_pkt_track(fe, pkt, false);
|
|
if (__improbable(err != 0)) {
|
|
SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
|
|
FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
|
|
KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
|
|
pkt->pkt_transport_protocol = fe->fe_transport_protocol;
|
|
|
|
/* set AQM related values for outgoing packet */
|
|
if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
|
|
pkt->pkt_pflags |= PKT_F_FLOW_ADV;
|
|
pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
|
|
pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
|
|
} else {
|
|
pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
|
|
}
|
|
_UUID_CLEAR(pkt->pkt_flow_id);
|
|
pkt->pkt_flow_token = fe->fe_flowid;
|
|
pkt->pkt_pflags |= PKT_F_FLOW_ID;
|
|
pkt->pkt_qset_idx = qset_idx;
|
|
pkt->pkt_policy_id = fe->fe_policy_id;
|
|
pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
|
|
|
|
/*
|
|
* The same code is exercised per packet for the non-chain case
|
|
* (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
|
|
* re-walking the chain later.
|
|
*/
|
|
if (chain) {
|
|
fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
|
|
}
|
|
/* mark packet tos/svc_class */
|
|
fsw_qos_mark(fsw, fe, pkt);
|
|
|
|
tx_finalize_packet(fsw, pkt);
|
|
bytes += pkt->pkt_length;
|
|
cnt++;
|
|
}
|
|
|
|
/* snoop after it's finalized */
|
|
if (__improbable(pktap_total_tap_count != 0)) {
|
|
fsw_snoop(fsw, fe, false);
|
|
}
|
|
if (fe->fe_qset != NULL) {
|
|
classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
|
|
} else {
|
|
classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
|
|
}
|
|
done:
|
|
dp_drop_pktq(fsw, &dropped_pkts);
|
|
}
|
|
|
|
static struct flow_entry *
|
|
tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
|
|
struct flow_entry *prev_fe, struct __kern_packet *pkt)
|
|
{
|
|
ASSERT(!pkt->pkt_flow_ip_is_first_frag);
|
|
|
|
if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
|
|
FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
|
|
SK_ERR("%s(%d) invalid zero fragment id",
|
|
sk_proc_name_address(current_proc()),
|
|
sk_proc_pid(current_proc()));
|
|
return NULL;
|
|
}
|
|
|
|
SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
|
|
"%s(%d) continuation frag, id %u",
|
|
sk_proc_name_address(current_proc()),
|
|
sk_proc_pid(current_proc()),
|
|
pkt->pkt_flow_ip_frag_id);
|
|
if (__improbable(prev_fe == NULL ||
|
|
!prev_fe->fe_tx_is_cont_frag)) {
|
|
		SK_ERR("%s(%d) unexpected continuation frag, id %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()),
		    pkt->pkt_flow_ip_frag_id);
|
|
FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
|
|
return NULL;
|
|
}
|
|
if (__improbable(pkt->pkt_flow_ip_frag_id !=
|
|
prev_fe->fe_tx_frag_id)) {
|
|
FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
|
|
SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
|
|
sk_proc_name_address(current_proc()),
|
|
sk_proc_pid(current_proc()),
|
|
pkt->pkt_flow_ip_frag_id,
|
|
prev_fe->fe_tx_frag_id);
|
|
return NULL;
|
|
}
|
|
|
|
return prev_fe;
|
|
}
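/*
 * Illustrative sketch (not part of the build): a non-initial IP fragment may
 * reuse the flow matched by the previous packet only when that packet was
 * itself a fragment of the same datagram, i.e. the fragment id is non-zero,
 * a fragment context exists, and the ids match. A standalone restatement of
 * those checks (hypothetical helper name can_reuse_frag_flow):
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
can_reuse_frag_flow(uint32_t frag_id, bool prev_was_cont_frag,
    uint32_t prev_frag_id)
{
	if (frag_id == 0) {
		return false;           /* invalid fragment id */
	}
	if (!prev_was_cont_frag) {
		return false;           /* no fragment context to continue */
	}
	return frag_id == prev_frag_id; /* must belong to the same datagram */
}
#endif /* illustrative sketch */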
|
|
|
|
static struct flow_entry *
|
|
tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
|
|
struct flow_entry *prev_fe)
|
|
{
|
|
struct flow_entry *fe;
|
|
|
|
fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
|
|
if (__improbable(fe == NULL)) {
|
|
goto done;
|
|
}
|
|
|
|
if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
|
|
SK_RDERR(5, "Tx flow torn down");
|
|
FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
|
|
flow_entry_release(&fe);
|
|
goto done;
|
|
}
|
|
|
|
_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
|
|
null_func);
|
|
|
|
if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
|
|
uuid_string_t flow_id_str, pkt_id_str;
|
|
sk_uuid_unparse(fe->fe_uuid, flow_id_str);
|
|
sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
|
|
SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
|
|
flow_entry_release(&fe);
|
|
FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
|
|
}
|
|
|
|
done:
|
|
return fe;
|
|
}
|
|
|
|
static inline void
tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    uint32_t flags)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, etc.) */
	fe->fe_tx_process(fsw, fe, flags);

	KPKTQ_FINI(&fe->fe_tx_pktq);
}
|
|
|
|
#if SK_LOG
static void
dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
{
	char *pkt_buf;
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
	    pkt->pkt_length, 128, NULL, 0));
}
#else /* !SK_LOG */
#define dp_tx_log_pkt(...)
#endif /* !SK_LOG */
|
|
|
|
static inline struct ifnet *
fsw_datamov_begin(struct nx_flowswitch *fsw)
{
	struct ifnet *ifp;

	ifp = fsw->fsw_ifp;
	if (!ifnet_datamov_begin(ifp)) {
		DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
		return NULL;
	}
	return ifp;
}

static inline void
fsw_datamov_end(struct nx_flowswitch *fsw)
{
	ifnet_datamov_end(fsw->fsw_ifp);
}
|
|
|
|
static void
|
|
dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
|
|
{
|
|
struct __kern_packet *spkt, *pkt;
|
|
struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
|
|
struct flow_entry *fe, *prev_fe;
|
|
struct pktq dropped_pkts, dpktq;
|
|
struct nexus_adapter *dev_na;
|
|
struct kern_pbufpool *dev_pp;
|
|
struct ifnet *ifp = NULL;
|
|
sa_family_t af;
|
|
uint32_t n_pkts, n_flows = 0;
|
|
boolean_t do_pacing = FALSE;
|
|
|
|
int err;
|
|
KPKTQ_INIT(&dpktq);
|
|
KPKTQ_INIT(&dropped_pkts);
|
|
n_pkts = KPKTQ_LEN(spktq);
|
|
|
|
FSW_RLOCK(fsw);
|
|
if (__improbable(FSW_QUIESCED(fsw))) {
|
|
DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
|
|
SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
|
|
KPKTQ_CONCAT(&dropped_pkts, spktq);
|
|
goto done;
|
|
}
|
|
dev_na = fsw->fsw_dev_ch->ch_na;
|
|
if (__improbable(dev_na == NULL)) {
|
|
SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
|
|
FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
|
|
KPKTQ_CONCAT(&dropped_pkts, spktq);
|
|
goto done;
|
|
}
|
|
ifp = fsw_datamov_begin(fsw);
|
|
if (ifp == NULL) {
|
|
SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
|
|
KPKTQ_CONCAT(&dropped_pkts, spktq);
|
|
goto done;
|
|
}
|
|
|
|
/* batch allocate enough packets */
|
|
dev_pp = na_kr_get_pp(dev_na, NR_TX);
|
|
|
|
err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
|
|
NULL, SKMEM_NOSLEEP);
|
|
#if DEVELOPMENT || DEBUG
|
|
if (__probable(err != ENOMEM)) {
|
|
_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
|
|
}
|
|
#endif /* DEVELOPMENT || DEBUG */
|
|
if (__improbable(err == ENOMEM)) {
|
|
ASSERT(KPKTQ_EMPTY(&dpktq));
|
|
KPKTQ_CONCAT(&dropped_pkts, spktq);
|
|
FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
|
|
SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
|
|
goto done;
|
|
} else if (__improbable(err == EAGAIN)) {
|
|
FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
|
|
(n_pkts - KPKTQ_LEN(&dpktq)));
|
|
FSW_STATS_ADD(FSW_STATS_DROP,
|
|
(n_pkts - KPKTQ_LEN(&dpktq)));
|
|
}
|
|
|
|
n_pkts = KPKTQ_LEN(&dpktq);
|
|
prev_fe = NULL;
|
|
KPKTQ_FOREACH(spkt, spktq) {
|
|
if (n_pkts == 0) {
|
|
break;
|
|
}
|
|
--n_pkts;
|
|
|
|
KPKTQ_DEQUEUE(&dpktq, pkt);
|
|
ASSERT(pkt != NULL);
|
|
err = dp_copy_to_dev(fsw, spkt, pkt);
|
|
if (__improbable(err != 0)) {
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0);
|
|
af = fsw_ip_demux(fsw, pkt);
|
|
if (__improbable(af == AF_UNSPEC)) {
|
|
dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
|
|
FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
err = flow_pkt_classify(pkt, ifp, af, false);
|
|
if (__improbable(err != 0)) {
|
|
dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
|
|
FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
|
|
if (__improbable(pkt->pkt_flow_ip_is_frag &&
|
|
!pkt->pkt_flow_ip_is_first_frag)) {
|
|
fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
|
|
if (__probable(fe != NULL)) {
|
|
flow_entry_retain(fe);
|
|
goto flow_batch;
|
|
} else {
|
|
FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
fe = tx_lookup_flow(fsw, pkt, prev_fe);
|
|
if (__improbable(fe == NULL)) {
|
|
FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
|
|
KPKTQ_ENQUEUE(&dropped_pkts, pkt);
|
|
prev_fe = NULL;
|
|
continue;
|
|
}
|
|
flow_batch:
|
|
tx_flow_batch_packet(&fes, fe, pkt);
|
|
prev_fe = fe;
|
|
}
|
|
|
|
struct flow_entry *tfe = NULL;
|
|
TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
|
|
tx_flow_process(fsw, fe, 0);
|
|
TAILQ_REMOVE(&fes, fe, fe_tx_link);
|
|
fe->fe_tx_is_cont_frag = false;
|
|
fe->fe_tx_frag_id = 0;
|
|
flow_entry_release(&fe);
|
|
n_flows++;
|
|
}
|
|
|
|
done:
|
|
FSW_RUNLOCK(fsw);
|
|
if (n_flows > 0) {
|
|
netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
|
|
}
|
|
if (ifp != NULL) {
|
|
fsw_datamov_end(fsw);
|
|
}
|
|
dp_drop_pktq(fsw, &dropped_pkts);
|
|
KPKTQ_FINI(&dropped_pkts);
|
|
KPKTQ_FINI(&dpktq);
|
|
}
|
|
|
|
static sa_family_t
get_tso_af(struct __kern_packet *pkt)
{
	packet_tso_flags_t tso_flags;

	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
	if (tso_flags == PACKET_TSO_IPV4) {
		return AF_INET;
	} else if (tso_flags == PACKET_TSO_IPV6) {
		return AF_INET6;
	} else {
		panic("invalid tso flags: 0x%x\n", tso_flags);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
|
|
|
|
static inline void
update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr,
    uint16_t payload_sz)
{
	struct tcphdr *tcp = tcphdr;

	DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
	    void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
	pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
	pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
	pkt->pkt_flow_tcp_flags = tcp->th_flags;
	pkt->pkt_flow_tcp_seq = tcp->th_seq;
	pkt->pkt_flow_ulen = payload_sz;
}
|
|
|
|
static int
|
|
do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
|
|
struct __kern_packet *first_pkt, struct pktq *dev_pktq,
|
|
struct pktq *gso_pktq)
|
|
{
|
|
ifnet_t ifp = fsw->fsw_ifp;
|
|
struct __kern_packet *pkt = first_pkt;
|
|
uint8_t proto = pkt->pkt_flow_ip_proto;
|
|
uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
|
|
uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
|
|
uint16_t total_hlen = ip_hlen + tcp_hlen;
|
|
uint16_t mtu = (uint16_t)ifp->if_mtu;
|
|
uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
|
|
uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
|
|
uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
|
|
kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
|
|
uint8_t *orig_pkt_baddr;
|
|
struct tcphdr *tcp;
|
|
struct ip *ip;
|
|
struct ip6_hdr *ip6;
|
|
uint32_t tcp_seq;
|
|
uint16_t ipid;
|
|
uint32_t pseudo_hdr_csum, bufsz;
|
|
|
|
ASSERT(headroom <= UINT8_MAX);
|
|
if (proto != IPPROTO_TCP) {
|
|
SK_ERR("invalid proto: %d", proto);
|
|
DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
|
|
fsw, ifnet_t, ifp, uint8_t, proto);
|
|
return EINVAL;
|
|
}
|
|
if (mss == 0 || mss > (mtu - total_hlen)) {
|
|
SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
|
|
mss, mtu, total_hlen);
|
|
DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
|
|
fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
|
|
uint32_t, total_hlen);
|
|
return EINVAL;
|
|
}
|
|
bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
|
|
if ((headroom + total_hlen + mss) > bufsz) {
|
|
SK_ERR("invalid args: headroom %d, total_hlen %d, "
|
|
"mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
|
|
DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
|
|
fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
|
|
total_hlen, uint16_t, mss, uint32_t, bufsz);
|
|
return EINVAL;
|
|
}
|
|
n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
|
|
|
|
ASSERT(pkt->pkt_headroom == headroom);
|
|
ASSERT(pkt->pkt_length == total_len);
|
|
ASSERT(pkt->pkt_l2_len == 0);
|
|
ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
|
|
ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
|
|
pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
|
|
pkt->pkt_proto_seg_sz = 0;
|
|
pkt->pkt_csum_flags = 0;
|
|
MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
|
|
orig_pkt_baddr += orig_pkt->pkt_headroom;
|
|
|
|
if (af == AF_INET) {
|
|
ip = (struct ip *)pkt->pkt_flow_ip_hdr;
|
|
tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
|
|
ipid = ip->ip_id;
|
|
pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
|
|
pkt->pkt_flow_ipv4_dst.s_addr, 0);
|
|
} else {
|
|
ASSERT(af == AF_INET6);
|
|
tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
|
|
pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
|
|
&pkt->pkt_flow_ipv6_dst, 0);
|
|
}
|
|
	tcp_seq = ntohl(tcp->th_seq);

	for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
	    off += payload_sz) {
		uint8_t *baddr, *baddr0;
		uint32_t partial;

		if (pkt == NULL) {
			n++;
			KPKTQ_DEQUEUE(dev_pktq, pkt);
			ASSERT(pkt != NULL);
		}
		MD_BUFLET_ADDR_ABS(pkt, baddr0);
		baddr = baddr0;
		baddr += headroom;

		/* Copy headers from the original packet */
		if (n != 1) {
			ASSERT(pkt != first_pkt);
			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
			fsw_pkt_copy_metadata(first_pkt, pkt);

			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
			/* flow info still needs to be updated below */
			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
			    sizeof(*pkt->pkt_flow));
			pkt->pkt_trace_id = 0;
			ASSERT(pkt->pkt_headroom == headroom);
		} else {
			METADATA_SET_LEN(pkt, 0, 0);
		}
		baddr += total_hlen;

		/* Copy/checksum the payload from the original packet */
		if (off + payload_sz > total_len) {
			payload_sz = (uint16_t)(total_len - off);
		}
		pkt_copypkt_sum(orig_ph,
		    (uint16_t)(orig_pkt->pkt_headroom + off),
		    SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
		    &partial, TRUE);

		DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
		    uint16_t, mss, uint32_t, partial);
		FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);

		/*
		 * Adjust header information and fill in the missing fields.
		 */
		if (af == AF_INET) {
			ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);

			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip, tcp, payload_sz);

			ip->ip_id = htons((ipid)++);
			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
			ip->ip_sum = 0;
			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		} else {
			ASSERT(af == AF_INET6);
			ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);

			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip6, tcp, payload_sz);

			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		}
		tcp_seq += payload_sz;
		METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
#if (DEVELOPMENT || DEBUG)
		struct __kern_buflet *bft;
		uint32_t blen;
		PKT_GET_FIRST_BUFLET(pkt, 1, bft);
		blen = __buflet_get_data_length(bft);
		if (blen != total_hlen + payload_sz) {
			panic("blen (%d) != total_hlen + payload_sz (%d)\n",
			    blen, total_hlen + payload_sz);
		}
#endif /* DEVELOPMENT || DEBUG */

		pkt->pkt_length = total_hlen + payload_sz;
		KPKTQ_ENQUEUE(gso_pktq, pkt);
		pkt = NULL;

		/*
		 * Note that at this point the packet is not yet finalized.
		 * The finalization happens in dp_flow_tx_process() after
		 * the framing is done.
		 */
	}
	ASSERT(n == n_pkts);
	ASSERT(off == total_len);
	DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
	    uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
	    uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
	return 0;
}

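/*
 * Move the GSO chain in gso_pktq onto the flow entry's TX queue.  If the
 * entry's queue was empty, the entry is also appended to the caller's
 * flow_entry_list and keeps its reference; otherwise the reference taken
 * by the flow lookup is released here.
 */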
static void
tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
    struct pktq *gso_pktq)
{
	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
		KPKTQ_INIT(gso_pktq);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
		KPKTQ_INIT(gso_pktq);
		flow_entry_release(&fe);
	}
}

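/*
 * TX path for packets requiring software GSO: batch-allocate device
 * packets, classify each source packet and look up its flow entry,
 * segment it via do_gso(), then hand the resulting chains to
 * tx_flow_process() and kick the netif transmit.
 */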
static void
dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
    uint32_t gso_pkts_estimate)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *fe, *prev_fe;
	struct pktq dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp = NULL;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;
	int err;

	KPKTQ_INIT(&dpktq);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		dp_drop_pktq(fsw, spktq);
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		dp_drop_pktq(fsw, spktq);
		goto done;
	}
	ifp = fsw_datamov_begin(fsw);
	if (ifp == NULL) {
		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
		dp_drop_pktq(fsw, spktq);
		goto done;
	}

	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	/*
	 * Batch allocate enough packets to perform GSO on all
	 * packets in spktq.
	 */
	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
	    gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * We either get all packets or none. No partial allocations.
	 */
	if (__improbable(err != 0)) {
		if (err == ENOMEM) {
			ASSERT(KPKTQ_EMPTY(&dpktq));
		} else {
			dp_free_pktq(fsw, &dpktq);
		}
		DTRACE_SKYWALK1(gso__no__mem, int, err);
		dp_drop_pktq(fsw, spktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool",
		    gso_pkts_estimate);
		goto done;
	}
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		/*
		 * Copy only headers to the first packet of the GSO chain.
		 * The headers will be used for classification below.
		 */
		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			pp_free_packet_single(pkt);
			DTRACE_SKYWALK2(copy__headers__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt);
			continue;
		}
		af = get_tso_af(pkt);
		ASSERT(af == AF_INET || af == AF_INET6);

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			pp_free_packet_single(pkt);
			DTRACE_SKYWALK4(classify__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt,
			    int, err);
			continue;
		}
		/*
		 * GSO cannot be done on a fragment and it's a bug in user
		 * space to mark a fragment as needing GSO.
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
			pp_free_packet_single(pkt);
			DTRACE_SKYWALK3(is__frag,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt);
			continue;
		}
		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			pp_free_packet_single(pkt);
			DTRACE_SKYWALK3(lookup__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt);
			prev_fe = NULL;
			continue;
		}
		/*
		 * Perform GSO on spkt using the flow information
		 * obtained above.
		 */
		struct pktq gso_pktq;
		KPKTQ_INIT(&gso_pktq);
		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
		if (__probable(err == 0)) {
			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
			prev_fe = fe;
		} else {
			DTRACE_SKYWALK1(gso__error, int, err);
			/* TODO: increment error stat */
			pp_free_packet_single(pkt);
			flow_entry_release(&fe);
			prev_fe = NULL;
		}
		KPKTQ_FINI(&gso_pktq);
	}
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		/* Chain-enqueue can be used for GSO chains */
		tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		flow_entry_release(&fe);
		n_flows++;
	}
done:
	FSW_RUNLOCK(fsw);
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
	}
	if (ifp != NULL) {
		fsw_datamov_end(fsw);
	}

	/*
	 * It's possible for packets to be left in dpktq because
	 * gso_pkts_estimate is only an estimate. The actual number
	 * of packets needed could be less.
	 */
	uint32_t dpktq_len;
	if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
		DTRACE_SKYWALK2(leftover__dev__pkts,
		    struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
		dp_free_pktq(fsw, &dpktq);
	}
	KPKTQ_FINI(&dpktq);
}

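/*
 * Drain the device RX ring in batches of fsw_rx_batch packets, feeding
 * them to the flowswitch receive path (or the netem input stage when one
 * is configured), then report the totals to the netif mitigation stats
 * callback.
 */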
static inline void
fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
		if (n_bytes == 0) {
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
			fsw_receive(fsw, &pktq);
		} else {
			fsw_dev_input_netem_enqueue(fsw, &pktq);
		}
		KPKTQ_FINI(&pktq);
	}

	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);

	/* compute mitigation rate for delivered traffic */
	if (__probable(r->ckr_netif_mit_stats != NULL)) {
		r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
	}
}

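/*
 * Drain a user channel TX ring in batches of fsw_tx_batch packets and
 * push them down the TX datapath, taking the GSO path whenever the
 * dequeue reported a non-zero GSO packet estimate.
 */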
static inline void
fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	static packet_trace_id_t trace_id = 0;
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		uint32_t gso_pkts_estimate = 0;

		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
		    &gso_pkts_estimate);
		if (n_bytes == 0) {
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
		    KPKTQ_FIRST(&pktq)->pkt_trace_id);

		if (gso_pkts_estimate > 0) {
			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
		} else {
			dp_tx_pktq(fsw, &pktq);
		}
		dp_free_pktq(fsw, &pktq);
		KPKTQ_FINI(&pktq);
	}
	kr_update_stats(r, total_pkts, total_bytes);

	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);
}

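/*
 * Common ring flush entry point; dispatches to the device or user ring
 * variant based on the nexus port backing the ring.
 */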
void
fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));

	ASSERT(sk_is_sync_protected());
	ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
	ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);

	if (vpna->vpna_nx_port == FSW_VP_DEV) {
		fsw_dev_ring_flush(fsw, r, p);
	} else {
		fsw_user_ring_flush(fsw, r, p);
	}
}

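/*
 * Datapath constructor: size the flow tables (rounding the flow entry
 * table size and flow route bucket count up to powers of two), set up
 * the locks and linger list, allocate the nexus advisory region, create
 * the flow manager and start the reaper thread.
 */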
int
fsw_dp_ctor(struct nx_flowswitch *fsw)
{
	uint32_t fe_cnt = fsw_fe_table_size;
	uint32_t fob_cnt = fsw_flow_owner_buckets;
	uint32_t frb_cnt = fsw_flow_route_buckets;
	uint32_t frib_cnt = fsw_flow_route_id_buckets;
	struct kern_nexus *nx = fsw->fsw_nx;
	char name[64];
	int error = 0;

	/* just in case */
	if (fe_cnt == 0) {
		fe_cnt = NX_FSW_FE_TABLESZ;
		ASSERT(fe_cnt != 0);
	}
	if (fob_cnt == 0) {
		fob_cnt = NX_FSW_FOB_HASHSZ;
		ASSERT(fob_cnt != 0);
	}
	if (frb_cnt == 0) {
		frb_cnt = NX_FSW_FRB_HASHSZ;
		ASSERT(frb_cnt != 0);
	}
	if (frib_cnt == 0) {
		frib_cnt = NX_FSW_FRIB_HASHSZ;
		ASSERT(frib_cnt != 0);
	}

	/* make sure fe_cnt is a power of two, else round up */
	if ((fe_cnt & (fe_cnt - 1)) != 0) {
		fe_cnt--;
		fe_cnt |= (fe_cnt >> 1);
		fe_cnt |= (fe_cnt >> 2);
		fe_cnt |= (fe_cnt >> 4);
		fe_cnt |= (fe_cnt >> 8);
		fe_cnt |= (fe_cnt >> 16);
		fe_cnt++;
	}

	/* make sure frb_cnt is a power of two, else round up */
	if ((frb_cnt & (frb_cnt - 1)) != 0) {
		frb_cnt--;
		frb_cnt |= (frb_cnt >> 1);
		frb_cnt |= (frb_cnt >> 2);
		frb_cnt |= (frb_cnt >> 4);
		frb_cnt |= (frb_cnt >> 8);
		frb_cnt |= (frb_cnt >> 16);
		frb_cnt++;
	}

	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
	    &nexus_lock_attr);
	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
	TAILQ_INIT(&fsw->fsw_linger_head);

	(void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
	error = nx_advisory_alloc(nx, name,
	    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
	    NEXUS_ADVISORY_TYPE_FLOWSWITCH);
	if (error != 0) {
		fsw_dp_dtor(fsw);
		return error;
	}

	fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
	if (fsw->fsw_flow_mgr == NULL) {
		fsw_dp_dtor(fsw);
		return error;
	}

	/* generic name; will be customized upon ifattach */
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, name, "");

	if (kernel_thread_start(fsw_reap_thread_func, fsw,
	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
		panic_plain("%s: can't create thread", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* this must not fail */
	VERIFY(fsw->fsw_reap_thread != NULL);

	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));

	return error;
}

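/*
 * Datapath destructor: join the RPS threads (DEVELOPMENT/DEBUG only),
 * free the nexus advisory region, wait for the reaper thread to
 * terminate, purge any lingering flow entries, destroy the flow manager
 * and tear down the locks.
 */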
void
fsw_dp_dtor(struct nx_flowswitch *fsw)
{
	uint64_t f = (1 * NSEC_PER_MSEC);       /* 1 ms */
	uint64_t s = (1000 * NSEC_PER_MSEC);    /* 1 sec */
	uint32_t i = 0;

#if (DEVELOPMENT || DEBUG)
	if (fsw->fsw_rps_threads != NULL) {
		for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
			fsw_rps_thread_join(fsw, i);
		}
		kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads);
	}
#endif /* !DEVELOPMENT && !DEBUG */

	nx_advisory_free(fsw->fsw_nx);

	if (fsw->fsw_reap_thread != THREAD_NULL) {
		/* signal thread to begin self-termination */
		lck_mtx_lock(&fsw->fsw_reap_lock);
		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;

		/*
		 * And wait for the thread to terminate; use a wait
		 * channel other than fsw_reap_flags to make it more
		 * explicit.  In the event the reaper thread misses a
		 * wakeup, we'll try again once every second (except
		 * for the first time).
		 */
		while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
			uint64_t t = 0;

			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
			clock_absolutetime_interval_to_deadline(t, &t);
			ASSERT(t != 0);

			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
			}
			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
			    THREAD_UNINT, t);
			lck_mtx_unlock(&fsw->fsw_reap_lock);
			thread_block(THREAD_CONTINUE_NULL);
			lck_mtx_lock(&fsw->fsw_reap_lock);
			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
		}
		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
		fsw->fsw_reap_thread = THREAD_NULL;
	}

	/* free any remaining flow entries in the linger list */
	fsw_linger_purge(fsw);

	if (fsw->fsw_flow_mgr != NULL) {
		flow_mgr_destroy(fsw->fsw_flow_mgr);
		fsw->fsw_flow_mgr = NULL;
	}

	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
}

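/*
 * Place a torn-down flow entry on the flowswitch linger list so that the
 * reaper thread can abort it (e.g. send a TCP RST) and free it once its
 * linger period expires.
 */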
void
fsw_linger_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	net_update_uptime();

	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
	ASSERT(fe->fe_linger_wait != 0);
	fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
	os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	fsw_reap_sched(fsw);
}

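/*
 * Linger list removal helpers: the _internal variant operates on a
 * caller-supplied list and drops that list's reference on the entry,
 * while fsw_linger_remove also maintains fsw_linger_cnt and must be
 * called with fsw_linger_lock held.
 */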
static void
fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
    struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
	os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
	flow_entry_release(&fe);
}

static void
fsw_linger_remove(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;

	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);

	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
	VERIFY(fsw->fsw_linger_cnt != 0);
	fsw->fsw_linger_cnt--;
}

void
fsw_linger_purge(struct nx_flowswitch *fsw)
{
	struct flow_entry *fe, *tfe;

	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
		fsw_linger_remove(fe);
	}
	ASSERT(fsw->fsw_linger_cnt == 0);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);
}

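/*
 * Wake up the reaper thread unless it is already running or terminating.
 */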
void
fsw_reap_sched(struct nx_flowswitch *fsw)
{
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
	}
	lck_mtx_unlock(&fsw->fsw_reap_lock);
}

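/*
 * Reaper thread: fsw_reap_thread_func runs once at thread creation to
 * park the thread, and fsw_reap_thread_cont is the continuation that
 * performs the periodic reap passes until termination is requested.
 */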
__attribute__((noreturn))
static void
fsw_reap_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct nx_flowswitch *fsw = v;

	ASSERT(fsw->fsw_reap_thread == current_thread());
	thread_set_thread_name(current_thread(), fsw->fsw_reap_name);

	net_update_uptime();

	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
	/* NOTREACHED */
	__builtin_unreachable();
}

__attribute__((noreturn))
static void
fsw_reap_thread_cont(void *v, wait_result_t wres)
{
	struct nx_flowswitch *fsw = v;
	boolean_t low;
	uint64_t t = 0;

	SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);

	lck_mtx_lock(&fsw->fsw_reap_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
		goto terminate;
	}

	ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
	fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
	lck_mtx_unlock(&fsw->fsw_reap_lock);

	net_update_uptime();

	/* prevent detach from happening while we're here */
	if (!fsw_detach_barrier_add(fsw)) {
		SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
		t = 0;
	} else {
		uint32_t fe_nonviable, fe_freed, fe_aborted;
		uint32_t fr_freed, fr_resid = 0;
		struct ifnet *ifp = fsw->fsw_ifp;
		uint64_t i = FSW_REAP_IVAL;
		uint64_t now = _net_uptime;
		uint64_t last;

		ASSERT(fsw->fsw_ifp != NULL);

		/*
		 * Pass 1: process any deferred {withdrawn,nonviable} requests.
		 */
		fe_nonviable = fsw_process_deferred(fsw);

		/*
		 * Pass 2: remove any expired lingering flows.
		 */
		fe_freed = fsw_process_linger(fsw, &fe_aborted);

		/*
		 * Pass 3: prune idle flow routes.
		 */
		fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
		    ifp, &fr_resid);

		/*
		 * Pass 4: prune flow table.
		 */
		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);

		SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
		    "fe_aborted %u fr_freed %u/%u",
		    fsw->fsw_flow_mgr->fm_name, fe_nonviable,
		    (fe_nonviable + fsw->fsw_pending_nonviable),
		    fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
		    (fe_freed + fr_resid));

		/* see if VM memory level is critical */
		low = skmem_lowmem_check();

		/*
		 * If things appear to be idle, we can prune away cached
		 * objects that have fallen out of the working sets (this
		 * is different from purging). Every once in a while, we
		 * also purge the caches. Note that this is done across
		 * all flowswitch instances, and so we limit this to no
		 * more than once every FSW_REAP_SK_THRES seconds.
		 */
		last = os_atomic_load(&fsw_reap_last, relaxed);
		if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
		    os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
			fsw_purge_cache(fsw, low);

			/* increase sleep interval if idle */
			if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
			    fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
				i <<= 3;
			}
		} else if (last == 0) {
			os_atomic_store(&fsw_reap_last, now, release);
		}

		/*
		 * Additionally, run thru the list of channels and prune
		 * or purge away cached objects on "idle" channels. This
		 * check is rate limited to no more than once every
		 * FSW_DRAIN_CH_THRES seconds.
		 */
		last = fsw->fsw_drain_channel_chk_last;
		if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
			SK_DF(SK_VERB_FLOW, "%s: pruning channels",
			    fsw->fsw_flow_mgr->fm_name);

			fsw->fsw_drain_channel_chk_last = now;
			fsw_drain_channels(fsw, now, low);
		} else if (__improbable(last == 0)) {
			fsw->fsw_drain_channel_chk_last = now;
		}

		/*
		 * Finally, invoke the interface's reap callback to
		 * tell it to prune or purge away cached objects if
		 * it is idle. This check is rate limited to no more
		 * than once every FSW_REAP_IF_THRES seconds.
		 */
		last = fsw->fsw_drain_netif_chk_last;
		if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
			ASSERT(fsw->fsw_nifna != NULL);

			if (ifp->if_na_ops != NULL &&
			    ifp->if_na_ops->ni_reap != NULL) {
				SK_DF(SK_VERB_FLOW, "%s: pruning netif",
				    fsw->fsw_flow_mgr->fm_name);
				ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
				    FSW_REAP_IF_THRES, low);
			}

			fsw->fsw_drain_netif_chk_last = now;
		} else if (__improbable(last == 0)) {
			fsw->fsw_drain_netif_chk_last = now;
		}

		/* emit periodic interface stats ktrace */
		last = fsw->fsw_reap_last;
		if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
			KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
			    ifp->if_data.ifi_ibytes * 8,
			    ifp->if_data.ifi_opackets,
			    ifp->if_data.ifi_obytes * 8);

			fsw->fsw_reap_last = now;
		} else if (__improbable(last == 0)) {
			fsw->fsw_reap_last = now;
		}

		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		/* allow any pending detach to proceed */
		fsw_detach_barrier_remove(fsw);
	}

	lck_mtx_lock(&fsw->fsw_reap_lock);
	if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
		fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
		    THREAD_UNINT, t);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
		thread_block_parameter(fsw_reap_thread_cont, fsw);
		/* NOTREACHED */
		__builtin_unreachable();
	} else {
terminate:
		LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
		fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
		/*
		 * And signal any thread waiting for us to terminate;
		 * use a wait channel other than fsw_reap_flags to make
		 * it more explicit.
		 */
		if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
			thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
		}
		lck_mtx_unlock(&fsw->fsw_reap_lock);

		SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}

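/*
 * Walk all nexus ports and prune (or, when idle long enough or under
 * memory pressure, purge) the per-adapter caches.
 */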
static void
fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
	FSW_RLOCK(fsw);

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nexus_adapter *na = nx_port_get_na(nx, p);
		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			return;
		}

		boolean_t purge;

		/*
		 * If some activity happened in the last FSW_DRAIN_CH_THRES
		 * seconds on this channel, we reclaim memory if the channel
		 * throughput is less than the reap threshold value.
		 */
		if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
			struct __kern_channel_ring *ring;
			channel_ring_stats *stats;
			uint64_t bps;

			ring = na->na_rx_rings;
			stats = &ring->ckr_stats;
			bps = stats->crs_bytes_per_second;

			if (bps < fsw_channel_reap_thresh) {
				purge = FALSE;
				na_drain(na, purge);
			}
			return;
		}

		/*
		 * If NA has been inactive for some time (twice the drain
		 * threshold), we clear the work timestamp to temporarily skip
		 * this channel until it's active again. Purging cached objects
		 * can be expensive since we'd need to allocate and construct
		 * them again, so we do it only when necessary.
		 */
		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
			na->na_work_ts = 0;
			purge = TRUE;
		} else {
			purge = FALSE;
		}

		na_drain(na, purge);	/* purge/prune caches */
	});
	/* END IGNORE CODESTYLE */

	FSW_RUNLOCK(fsw);
}

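/*
 * Reap the flowswitch object caches; a full purge is done under memory
 * pressure or once every fsw_flow_purge_thresh invocations.
 */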
static void
fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
{
#pragma unused(fsw)
	uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
	uint32_t p = fsw_flow_purge_thresh;
	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));

	SK_DF(SK_VERB_FLOW, "%s: %s caches",
	    fsw->fsw_flow_mgr->fm_name,
	    (purge ? "purge" : "prune"));

	skmem_cache_reap_now(sk_fo_cache, purge);
	skmem_cache_reap_now(sk_fe_cache, purge);
	skmem_cache_reap_now(sk_fab_cache, purge);
	skmem_cache_reap_now(flow_route_cache, purge);
	skmem_cache_reap_now(flow_stats_cache, purge);
	netns_reap_caches(purge);
	skmem_reap_caches(purge);

#if CONFIG_MBUF_MCACHE
	if (if_is_fsw_transport_netagent_enabled() && purge) {
		mbuf_drain(FALSE);
	}
#endif /* CONFIG_MBUF_MCACHE */
}

static void
fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	/* When the interface is in low power mode, the flow is nonviable */
	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
		os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
	}
}

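/*
 * Reaper pass 1: commit deferred withdraw/nonviable requests.  Entries
 * needing NECP work are collected on a temporary list and handled after
 * the flow owner bucket locks are dropped; returns the number of flow
 * entries torn down.
 */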
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger)
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests. We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode
		 */
		if (__probable(!lowpowermode)) {
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* try first as reader; skip if we can't */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/* do this outside the flow bucket lock */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}

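/*
 * Reaper pass 2: walk the linger list off-lock, send TCP RSTs where
 * required, free entries whose linger period has expired and put the
 * remainder back at the head of the list; returns the number freed and
 * reports the number aborted via *abort.
 */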
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = _net_uptime;
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock. Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
			    "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
			    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS);
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;
		}
		++i;
	}
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}

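/*
 * Per-traffic-class interface accounting for received and transmitted
 * packets.
 */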
__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
{
	switch (__packet_get_traffic_class(ph)) {
	case PKT_TC_BE:
		ifp->if_tc.ifi_ibepackets++;
		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_BK:
		ifp->if_tc.ifi_ibkpackets++;
		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_VI:
		ifp->if_tc.ifi_ivipackets++;
		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_VO:
		ifp->if_tc.ifi_ivopackets++;
		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	default:
		break;
	}
}

__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
    uint32_t cnt, uint32_t len)
{
	switch (svc) {
	case PKT_TC_BE:
		ifp->if_tc.ifi_obepackets += cnt;
		ifp->if_tc.ifi_obebytes += len;
		break;
	case PKT_TC_BK:
		ifp->if_tc.ifi_obkpackets += cnt;
		ifp->if_tc.ifi_obkbytes += len;
		break;
	case PKT_TC_VI:
		ifp->if_tc.ifi_ovipackets += cnt;
		ifp->if_tc.ifi_ovibytes += len;
		break;
	case PKT_TC_VO:
		ifp->if_tc.ifi_ovopackets += cnt;
		ifp->if_tc.ifi_ovobytes += len;
		break;
	default:
		break;
	}
}