gems-kernel/source/THIRDPARTY/xnu/bsd/skywalk/channel/channel_ring.c
/*
* Copyright (c) 2015-2021 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <skywalk/os_skywalk_private.h>
#include <kern/sched_prim.h>
#include <sys/sdt.h>
static void kr_update_user_stats(struct __kern_channel_ring *,
uint32_t, uint32_t);
static void kr_externalize_metadata_internal(struct __kern_channel_ring *,
const uint32_t, struct __kern_quantum *, struct proc *);
#define KR_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
static uint32_t kr_transfer_decay = 0;
#define KR_ACCUMULATE_INTERVAL 2 /* 2 seconds */
static uint32_t kr_accumulate_interval = KR_ACCUMULATE_INTERVAL;
#if (DEVELOPMENT || DEBUG)
#define KR_STAT_ENABLE 1
#else /* !(DEVELOPMENT || DEBUG) */
#define KR_STAT_ENABLE 0
#endif /* !(DEVELOPMENT || DEBUG) */
/* Enable/Disable ring stats collection */
uint32_t kr_stat_enable = KR_STAT_ENABLE;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_transfer_decay,
CTLFLAG_RW | CTLFLAG_LOCKED, &kr_transfer_decay,
0, "ilog2 of EWMA decay rate of ring transfers");
SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_accumulate_interval,
CTLFLAG_RW | CTLFLAG_LOCKED, &kr_accumulate_interval,
KR_ACCUMULATE_INTERVAL, "accumulation interval for ring stats");
uint32_t kr_disable_panic_on_sync_err = 0;
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_panic_on_sync_err,
CTLFLAG_RW | CTLFLAG_LOCKED, &kr_disable_panic_on_sync_err,
0, "disable panic on sync error");
#endif /* (DEVELOPMENT || DEBUG) */
SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_enable,
CTLFLAG_RW | CTLFLAG_LOCKED, &kr_stat_enable,
0, "enable/disable stats collection for ring");
#define KR_EWMA(old, new, decay) do { \
u_int64_t _avg; \
if (__probable((_avg = (old)) > 0)) \
_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
else \
_avg = (new); \
(old) = _avg; \
} while (0)
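/*
 * In plain arithmetic (ignoring integer truncation), KR_EWMA computes:
 *
 *	avg' = avg + (new - avg) / 2^decay
 *
 * i.e. an exponentially weighted moving average that gives the new sample
 * a weight of 1/2^decay (1/4 with the default KR_TRANSFER_DECAY of 2);
 * a zero history value is simply seeded with the new sample.
 */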
#define _BUF_DLIM(_buf, _pp) (BUFLET_HAS_LARGE_BUF(_buf) ? \
PP_BUF_SIZE_LARGE(_pp) : PP_BUF_SIZE_DEF(_pp))
void
kr_init_to_mhints(struct __kern_channel_ring *kring, uint32_t nslots)
{
uint32_t tail;
tail = nslots - 1;
kring->ckr_transfer_decay = KR_TRANSFER_DECAY;
kring->ckr_num_slots = nslots;
*(slot_idx_t *)(uintptr_t)&kring->ckr_lim = (nslots - 1);
kring->ckr_rhead = kring->ckr_khead = 0;
/* IMPORTANT: Always keep one slot empty */
kring->ckr_rtail = kring->ckr_ktail =
((kring->ckr_tx == NR_TX) || (kring->ckr_tx == NR_F) ? tail : 0);
}
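/*
 * Illustrative initial state set up above, for an 8-slot ring: a TX or
 * FREE ring starts with khead = rhead = 0 and ktail = rtail = 7, i.e.
 * 7 usable slots with one slot always kept empty; any other ring type
 * (RX, ALLOC, EVENT, ...) starts with ktail = rtail = 0, i.e. nothing
 * readable yet.
 */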
/*
* Try to obtain exclusive right to issue the *sync() or state change
* operations on the ring. The right is obtained and must be later
* relinquished via kr_exit() if and only if kr_enter() returns 0.
*
 * If the right cannot be obtained (e.g. EBUSY), the caller will typically
 * skip the ring, possibly collecting errors along the way.
*
* If the calling context does not allow sleeping, the caller must pass
* FALSE in can_sleep; EBUSY may be returned if the right is held by
* another thread. Otherwise, the caller may block until the right is
* released by the previous holder.
*/
int
kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep)
{
lck_spin_lock(&kr->ckr_slock);
if (kr->ckr_owner == current_thread()) {
ASSERT(kr->ckr_busy != 0);
kr->ckr_busy++;
goto done;
}
if (!can_sleep) {
if (kr->ckr_busy != 0) {
lck_spin_unlock(&kr->ckr_slock);
return EBUSY;
}
} else {
while (kr->ckr_busy != 0) {
kr->ckr_want++;
(void) assert_wait(&kr->ckr_busy, THREAD_UNINT);
lck_spin_unlock(&kr->ckr_slock);
(void) thread_block(THREAD_CONTINUE_NULL);
SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" "
"(0x%llx) busy=%u", kr->ckr_name,
SK_KVA(kr), kr->ckr_busy);
lck_spin_lock(&kr->ckr_slock);
}
}
LCK_SPIN_ASSERT(&kr->ckr_slock, LCK_ASSERT_OWNED);
ASSERT(kr->ckr_busy == 0);
kr->ckr_busy++;
kr->ckr_owner = current_thread();
done:
lck_spin_unlock(&kr->ckr_slock);
SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right acquired",
kr->ckr_name, SK_KVA(kr));
return 0;
}
void
kr_exit(struct __kern_channel_ring *kr)
{
uint32_t want = 0;
lck_spin_lock(&kr->ckr_slock);
ASSERT(kr->ckr_busy != 0);
ASSERT(kr->ckr_owner == current_thread());
if (--kr->ckr_busy == 0) {
kr->ckr_owner = NULL;
/*
* we're done with the kring;
* notify anyone that has lost the race
*/
if ((want = kr->ckr_want) != 0) {
kr->ckr_want = 0;
wakeup((void *)&kr->ckr_busy);
lck_spin_unlock(&kr->ckr_slock);
} else {
lck_spin_unlock(&kr->ckr_slock);
}
} else {
lck_spin_unlock(&kr->ckr_slock);
}
SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right released (%u waiters)",
kr->ckr_name, SK_KVA(kr), want);
}
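/*
 * Illustrative use of the busy/ownership protocol above (a sketch only;
 * the real callers live in the sync and state-change paths):
 *
 *	if (kr_enter(kr, FALSE) == 0) {
 *		... issue *sync() or change ring state ...
 *		kr_exit(kr);
 *	} else {
 *		... EBUSY: skip this ring, possibly noting the error ...
 *	}
 */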
void
kr_start(struct __kern_channel_ring *kr)
{
lck_spin_lock(&kr->ckr_slock);
ASSERT(kr->ckr_busy != 0);
ASSERT(kr->ckr_state == KR_STOPPED || kr->ckr_state == KR_LOCKED);
/* now clear the state */
kr->ckr_state = KR_READY;
lck_spin_unlock(&kr->ckr_slock);
kr_exit(kr);
SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) is started",
kr->ckr_name, SK_KVA(kr));
}
/*
* Put the kring in the 'stopped' state: either KR_STOPPED or KR_LOCKED.
 * Also marks the ring as busy, which requires a matching kr_start()
 * at a later point.
*/
void
kr_stop(struct __kern_channel_ring *kr, uint32_t state)
{
uint32_t s;
ASSERT(state == KR_STOPPED || state == KR_LOCKED);
s = kr_enter(kr, TRUE);
ASSERT(s == 0);
lck_spin_lock(&kr->ckr_slock);
ASSERT(kr->ckr_busy != 0);
/* now set the state */
kr->ckr_state = state;
lck_spin_unlock(&kr->ckr_slock);
SK_DF(SK_VERB_LOCKS,
"kr \"%s\" (0x%llx) krflags 0x%b is now stopped s=%u",
kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS, state);
}
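/*
 * kr_stop() and kr_start() bracket a quiescent section; a minimal sketch
 * of the intended pairing:
 *
 *	kr_stop(kr, KR_STOPPED);  // acquire the right, mark the ring stopped
 *	... reconfigure while the ring is held busy ...
 *	kr_start(kr);             // back to KR_READY, busy right released
 */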
static void
kr_update_user_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
uint32_t byte_count)
{
uint64_t now;
uint32_t transfer_decay = (kr_transfer_decay != 0) ?
kr_transfer_decay : kring->ckr_transfer_decay;
channel_ring_user_stats_t stats = &kring->ckr_usr_stats;
now = net_uptime();
kring->ckr_sync_time = now;
if (kr_stat_enable == 0) {
return;
}
stats->crsu_number_of_syncs++;
stats->crsu_total_bytes_transferred += byte_count;
stats->crsu_total_slots_transferred += slot_count;
if (slot_count > stats->crsu_max_slots_transferred) {
stats->crsu_max_slots_transferred = slot_count;
}
if (stats->crsu_min_slots_transferred == 0 ||
slot_count < stats->crsu_min_slots_transferred) {
stats->crsu_min_slots_transferred = slot_count;
}
if (__probable(kring->ckr_user_accumulate_start != 0)) {
if ((now - kring->ckr_user_accumulate_start) >=
kr_accumulate_interval) {
uint64_t bps;
uint64_t sps;
uint64_t sps_ma;
/* bytes per sync */
bps = kring->ckr_user_accumulated_bytes /
kring->ckr_user_accumulated_syncs;
KR_EWMA(stats->crsu_bytes_per_sync_ma,
bps, transfer_decay);
stats->crsu_bytes_per_sync = bps;
/* slots per sync */
sps = kring->ckr_user_accumulated_slots /
kring->ckr_user_accumulated_syncs;
sps_ma = stats->crsu_slots_per_sync_ma;
KR_EWMA(sps_ma, sps, transfer_decay);
stats->crsu_slots_per_sync_ma = (uint32_t)sps_ma;
stats->crsu_slots_per_sync = (uint32_t)sps;
/* start over */
kring->ckr_user_accumulate_start = now;
kring->ckr_user_accumulated_bytes = 0;
kring->ckr_user_accumulated_slots = 0;
kring->ckr_user_accumulated_syncs = 0;
stats->crsu_min_slots_transferred = 0;
stats->crsu_max_slots_transferred = 0;
}
} else {
kring->ckr_user_accumulate_start = now;
}
kring->ckr_user_accumulated_bytes += byte_count;
kring->ckr_user_accumulated_slots += slot_count;
kring->ckr_user_accumulated_syncs++;
}
/* caller to make sure thread safety */
void
kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
uint32_t byte_count)
{
uint64_t now;
uint64_t diff_secs;
channel_ring_stats_t stats = &kring->ckr_stats;
uint32_t transfer_decay = (kr_transfer_decay != 0) ?
kr_transfer_decay : kring->ckr_transfer_decay;
if (kr_stat_enable == 0) {
return;
}
if (__improbable(slot_count == 0)) {
return;
}
stats->crs_number_of_transfers++;
stats->crs_total_bytes_transferred += byte_count;
stats->crs_total_slots_transferred += slot_count;
if (slot_count > stats->crs_max_slots_transferred) {
stats->crs_max_slots_transferred = slot_count;
}
if (stats->crs_min_slots_transferred == 0 ||
slot_count < stats->crs_min_slots_transferred) {
stats->crs_min_slots_transferred = slot_count;
}
now = net_uptime();
if (__probable(kring->ckr_accumulate_start != 0)) {
diff_secs = now - kring->ckr_accumulate_start;
if (diff_secs >= kr_accumulate_interval) {
uint64_t bps;
uint64_t sps;
uint64_t sps_ma;
/* bytes per second */
bps = kring->ckr_accumulated_bytes / diff_secs;
KR_EWMA(stats->crs_bytes_per_second_ma,
bps, transfer_decay);
stats->crs_bytes_per_second = bps;
/* slots per second */
sps = kring->ckr_accumulated_slots / diff_secs;
sps_ma = stats->crs_slots_per_second_ma;
KR_EWMA(sps_ma, sps, transfer_decay);
stats->crs_slots_per_second_ma = (uint32_t)sps_ma;
stats->crs_slots_per_second = (uint32_t)sps;
/* start over */
kring->ckr_accumulate_start = now;
kring->ckr_accumulated_bytes = 0;
kring->ckr_accumulated_slots = 0;
stats->crs_min_slots_transferred = 0;
stats->crs_max_slots_transferred = 0;
}
} else {
kring->ckr_accumulate_start = now;
}
kring->ckr_accumulated_bytes += byte_count;
kring->ckr_accumulated_slots += slot_count;
}
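/*
 * Note on the two collectors above: kr_update_user_stats() averages per
 * sync call (bytes/slots per sync over a kr_accumulate_interval window),
 * while kr_update_stats() averages per second over the same interval;
 * both smooth their moving averages with KR_EWMA.
 */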
/* True if no space in the TX ring. Only valid after kr_txsync_prologue(). */
boolean_t
kr_txempty(struct __kern_channel_ring *kring)
{
return kring->ckr_rhead == kring->ckr_ktail;
}
#if SK_LOG
/*
* Error logging routine called when txsync/rxsync detects an error.
* Expected to be called before killing the process with skywalk_kill_process()
*
* This routine is only called by the upper half of the kernel.
* It only reads khead (which is changed only by the upper half, too)
* and ktail (which may be changed by the lower half, but only on
* a tx ring and only to increase it, so any error will be recovered
* on the next call). For the above, we don't strictly need to call
* it under lock.
*/
void
kr_log_bad_ring(struct __kern_channel_ring *kring)
{
struct __user_channel_ring *ring = kring->ckr_ring;
const slot_idx_t lim = kring->ckr_lim;
slot_idx_t i;
int errors = 0;
// XXX KASSERT nm_kr_tryget
SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b", kring->ckr_name,
SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
// XXX probably wrong to trust userspace
if (ring->ring_head > lim) {
errors++;
}
if (ring->ring_tail > lim) {
errors++;
}
for (i = 0; i <= lim; i++) {
struct __kern_slot_desc *ksd = KR_KSD(kring, i);
struct __kern_quantum *kqum = ksd->sd_qum;
obj_idx_t idx;
uint32_t len;
if (!KSD_VALID_METADATA(ksd)) {
continue;
}
idx = METADATA_IDX(kqum);
len = kqum->qum_len;
if (len > kring->ckr_max_pkt_len) {
SK_RDERR(5, "bad len at slot %u idx %u len %u",
i, idx, len);
}
}
if (errors != 0) {
SK_ERR("total %d errors", errors);
		SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b crash, "
		    "head %u -> %u -> %u tail %u -> %u", kring->ckr_name,
SK_KVA(kring), kring->ckr_flags, CKRF_BITS, ring->ring_head,
kring->ckr_rhead, kring->ckr_khead,
ring->ring_tail, kring->ckr_ktail);
}
}
#endif /* SK_LOG */
uint32_t
kr_reclaim(struct __kern_channel_ring *kr)
{
int r = 0;
VERIFY(sk_is_sync_protected());
/*
* This is a no-op for TX ring, since the TX reclaim logic is only
* known to the nexus itself. There, the nexus's TX sync code would
* figure out the number of slots that has been "transmitted", and
* advance the slot pointer accordingly. This routine would then be
* called as a way to advise the system of such condition.
*
* For RX ring, this will reclaim user-released slots, and it is
* to be called by the provider's RX sync routine prior to its
* processing new slots (into the RX ring).
*
* It is therefore advised that this routine be called at the start
* of the RX sync callback, as well as at the end of the TX sync
* callback; the latter is useful in case we decide to implement
 * more logic in the future.
*/
if ((kr->ckr_tx == NR_RX) || (kr->ckr_tx == NR_EV)) {
/* # of reclaimed slots */
r = kr->ckr_rhead - kr->ckr_khead;
if (r < 0) {
r += kr->ckr_num_slots;
}
kr->ckr_khead = kr->ckr_rhead;
/* ensure global visibility */
os_atomic_thread_fence(seq_cst);
}
return (slot_idx_t)r;
}
/*
* Nexus-specific kr_txsync_prologue() callback.
*/
int
kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
struct proc *p)
{
struct kern_pbufpool *pp = kring->ckr_pp;
const uint32_t maxfrags = pp->pp_max_frags;
slot_idx_t slot_idx = kring->ckr_rhead;
ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __kern_quantum *kqum = ksd->sd_qum;
int err;
if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
SK_ERR("qum index mismatch");
*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
return -1;
}
/* Internalize */
err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
if (__improbable(err != 0)) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
"(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
sk_proc_name_address(p), sk_proc_pid(p),
kring->ckr_name, SK_KVA(kring), slot_idx, err,
kring->ckr_khead, kring->ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
return -1;
}
*byte_count += kqum->qum_len;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
return 0;
}
/*
* Nexus-specific kr_txsync_prologue() callback - user packet pool variant.
*/
int
kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
struct proc *p)
{
struct kern_pbufpool *pp = kring->ckr_pp;
const uint32_t maxfrags = pp->pp_max_frags;
slot_idx_t slot_idx = kring->ckr_rhead;
struct __kern_quantum *kqum = NULL;
bool free_pkt = false;
int err = 0;
ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
PP_LOCK(pp);
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
/*
* The channel is operating in user packet pool mode;
* check if the packet is in the allocated list.
*/
kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
if (__improbable(err != 0)) {
if (kqum != NULL) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
"kqum %p, bad buflet chain",
sk_proc_name_address(p), sk_proc_pid(p),
kring->ckr_name, SK_KVA(kring), slot_idx,
SK_KVA(kqum));
*err_reason =
SKYWALK_KILL_REASON_BAD_BUFLET_CHAIN;
goto done;
}
SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
" unallocated packet %u kh %u kt %u | "
"rh %u rt %u | h %u t %u",
sk_proc_name_address(p), sk_proc_pid(p),
kring->ckr_name, SK_KVA(kring), slot_idx,
usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_UNALLOCATED_PKT;
goto done;
}
if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
SK_ERR("qum index mismatch");
*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
err = ERANGE;
free_pkt = true;
goto done;
}
/* Internalize */
err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
if (__improbable(err != 0)) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
"(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
sk_proc_name_address(p), sk_proc_pid(p),
kring->ckr_name, SK_KVA(kring), slot_idx, err,
kring->ckr_khead, kring->ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
free_pkt = true;
goto done;
}
/*
* Attach packet to slot, detach mapping from alloc ring slot.
*/
kqum->qum_ksd = NULL;
USD_RESET(usd);
KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);
*byte_count += kqum->qum_len;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
done:
PP_UNLOCK(pp);
if (__improbable(err != 0) && free_pkt) {
ASSERT(kqum != NULL);
kqum->qum_ksd = NULL;
pp_free_packet(pp, (uint64_t)kqum);
}
return err;
}
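/*
 * Note: the user packet pool variant above differs from kr_txprologue()
 * mainly in that the packet named by the user slot descriptor must first
 * be moved off the pool's allocated-by-user list (pp_remove_upp_locked)
 * and attached to the slot here, since in this mode slots carry no kernel
 * metadata until TX sync.
 */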
#define NM_FAIL_ON(t, reason) if (__improbable(t)) { SK_ERR("fail " #t); \
err_reason = reason; goto error; }
/*
* Validate parameters in the TX/FREE ring/kring.
*
 * ckr_rhead and ckr_rtail (= ktail) are cached from the previous round;
 * ckr_khead is the next packet to send to the ring.
*
* We want
* khead <= *ckr_rhead <= head <= tail = *ckr_rtail <= ktail
*
* ckr_khead, ckr_rhead, ckr_rtail and ckr_ktail are reliable
*/
#define _KR_TXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh, _krt) do {\
slot_idx_t _n = (_kring)->ckr_num_slots; \
/* kernel sanity checks */ \
NM_FAIL_ON((_kh) >= _n || kring->ckr_rhead >= _n || (_krt) >= _n || \
(_kt) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY); \
/* user basic sanity checks */ \
NM_FAIL_ON((_rh) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY); \
/* \
	 * user sanity checks. We only use 'head',                      \
	 * A, B, ... are possible positions for head:                   \
	 *                                                              \
	 *    0    A  head   B  tail   C  n-1                           \
	 *    0    D  tail   E  head   F  n-1                           \
* \
* B, F, D are valid. A, C, E are wrong \
*/ \
if ((_krt) >= kring->ckr_rhead) { \
/* want ckr_rhead <= head <= ckr_rtail */ \
NM_FAIL_ON((_rh) < kring->ckr_rhead || (_rh) > (_krt), \
SKYWALK_KILL_REASON_HEAD_OOB); \
} else { /* here ckr_rtail < ckr_rhead */ \
/* we need head outside ckr_rtail .. ckr_rhead */ \
NM_FAIL_ON((_rh) > (_krt) && (_rh) < kring->ckr_rhead, \
SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED); \
} \
NM_FAIL_ON(ring->ring_tail != (_krt), \
SKYWALK_KILL_REASON_TAIL_MISMATCH); \
} while (0)
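/*
 * Worked example of the wrapped-head check above (illustrative): with
 * ckr_num_slots = 8, ckr_rhead = 6 and ckr_rtail = 2 the ring has wrapped,
 * so a new head of 6, 7, 0, 1 or 2 passes, while 3, 4 or 5 (strictly
 * between ckr_rtail and ckr_rhead) is rejected as out of bounds.
 */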
/*
* Validate parameters in the ring/kring on entry for *_txsync().
* Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
* in case of error, in order to force a reinit.
*/
slot_idx_t
kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
struct proc *p)
{
struct __user_channel_ring *ring = kring->ckr_ring;
slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
slot_idx_t head;
uint32_t byte_count = 0;
uint64_t err_reason = 0;
int slot_count;
VERIFY(sk_is_sync_protected());
/* assert that this routine is only called for user facing rings */
ASSERT(!KR_KERNEL_ONLY(kring));
ASSERT(kring->ckr_usds != NULL);
/* read these once and use local copies */
head = ring->ring_head;
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
os_atomic_thread_fence(seq_cst);
ckr_rtail = kring->ckr_rtail;
SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, ckr_rtail,
ring->ring_head, ring->ring_tail);
_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
/* # of new tx slots */
slot_count = head - kring->ckr_rhead;
if (slot_count < 0) {
slot_count += kring->ckr_num_slots;
}
/*
* Invoke nexus-specific TX prologue callback, set in na_kr_create().
*/
if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
kring, head, &byte_count, &err_reason, p) != 0)) {
goto error;
}
/* update the user's view of slots & bytes transferred */
kr_update_user_stats(kring, slot_count, byte_count);
/* update the kernel view of ring */
kring->ckr_rhead = head;
/* save for kr_txsync_finalize(); only khead is needed */
kring->ckr_khead_pre = ckr_khead;
return head;
error:
SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
"rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
ckr_rtail, head, ring->ring_tail);
skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC);
return kring->ckr_num_slots;
}
/*
* Validate parameters in the ring/kring on entry for *_free_sync().
* Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
* in case of error, in order to force a reinit.
*/
slot_idx_t
kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
struct __user_channel_ring *ring = kring->ckr_ring;
slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
slot_idx_t head;
uint64_t err_reason = 0;
VERIFY(sk_is_sync_protected());
/* read these once and use local copies */
head = ring->ring_head;
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
os_atomic_thread_fence(seq_cst);
ckr_rtail = kring->ckr_rtail;
SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail);
_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
/* update the kernel view of ring */
kring->ckr_rhead = head;
return head;
error:
SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
"rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
ckr_rtail, head, ring->ring_tail);
skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC);
return kring->ckr_num_slots;
}
/*
* Nexus-specific kr_rxsync_prologue() callback.
*/
int
kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
struct proc *p)
{
#pragma unused(ch, p)
slot_idx_t slot_idx = kring->ckr_rhead;
uint32_t nfree = 0;
ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
/*
* Iterating through the slots just read by user-space;
* ckr_rhead -> ring_head
*/
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __kern_quantum *kqum = ksd->sd_qum;
ASSERT(KSD_VALID_METADATA(ksd));
/* # of new bytes transferred */
*byte_count += kqum->qum_len;
/* detach and free the packet */
(void) KR_SLOT_DETACH_METADATA(kring, ksd);
ASSERT(nfree < kring->ckr_num_slots);
kring->ckr_scratch[nfree++] = (uint64_t)kqum;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
if (nfree > 0) {
pp_free_packet_batch(kring->ckr_pp,
&kring->ckr_scratch[0], nfree);
}
/*
	 * Update the userspace channel statistics of readable bytes:
	 * subtract the byte counts of slots just given back to the kernel.
*/
if (kring->ckr_ready_bytes < *byte_count) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
"(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
kring->ckr_rtail, kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
return -1;
}
kring->ckr_ready_bytes -= *byte_count;
return 0;
}
/*
* Nexus-specific kr_rxsync_prologue() callback - no detach variant.
*/
int
kr_rxprologue_nodetach(struct kern_channel *ch,
struct __kern_channel_ring *kring, const slot_idx_t head,
uint32_t *byte_count, uint64_t *err_reason, struct proc *p)
{
#pragma unused(ch, p)
slot_idx_t slot_idx = kring->ckr_rhead;
ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
/*
* Iterating through the slots just read by user-space;
* ckr_rhead -> ring_head
*/
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __kern_quantum *kqum = ksd->sd_qum;
ASSERT(KSD_VALID_METADATA(ksd));
/* # of new bytes transferred */
*byte_count += kqum->qum_len;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
/*
	 * Update the userspace channel statistics of readable bytes:
	 * subtract the byte counts of slots just given back to the kernel.
*/
if (kring->ckr_ready_bytes < *byte_count) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
"(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
kring->ckr_rtail, kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
#if (DEVELOPMENT || DEBUG)
if (kr_disable_panic_on_sync_err == 0) {
panic("kr(0x%llx), inconsistent, head %u, ready %llu, "
"cnt %u", SK_KVA(kring), head,
kring->ckr_ready_bytes, *byte_count);
/* NOTREACHED */
__builtin_unreachable();
}
#else /* (DEVELOPMENT || DEBUG) */
return -1;
#endif /* !(DEVELOPMENT || DEBUG) */
}
kring->ckr_ready_bytes -= *byte_count;
return 0;
}
/*
* Nexus-specific kr_rxsync_prologue() callback - user packet pool variant.
*/
int
kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
struct proc *p)
{
#pragma unused(ch, p)
slot_idx_t slot_idx = kring->ckr_rhead;
ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
/*
* Iterating through the slots just read by user-space;
* ckr_rhead -> ring_head
*/
while (slot_idx != head) {
struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
/*
		 * This is a user-facing ring that has opted into user packet
		 * pool mode, so ensure that the user has detached the packet
		 * from the slot.
*/
ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx)));
if (SD_VALID_METADATA(usd)) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
"detached md %u kh %u kt %u | rh %u rt %u |"
" h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name,
SK_KVA(kring), slot_idx, usd->sd_md_idx,
kring->ckr_khead, kring->ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
return -1;
}
*byte_count += usd->sd_len;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
/*
	 * Update the userspace channel statistics of readable bytes:
	 * subtract the byte counts of slots just given back to the kernel.
*/
if (kring->ckr_ready_bytes < *byte_count) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
"(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
kring->ckr_rtail, kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
return -1;
}
kring->ckr_ready_bytes -= *byte_count;
return 0;
}
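/*
 * In all three RX prologue variants above, ckr_ready_bytes tracks the
 * bytes still readable by userspace: the prologue subtracts what the user
 * just consumed (ckr_rhead .. ring_head), and the RX finalize callbacks
 * below add back the bytes of newly externalized slots.
 */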
/*
* Validate parameters in the RX/ALLOC/EVENT ring/kring.
* For a valid configuration,
* khead <= head <= tail <= ktail
*
* We only consider head.
* khead and ktail are reliable.
*/
#define _KR_RXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh) do { \
slot_idx_t _n = (_kring)->ckr_num_slots; \
/* kernel sanity checks */ \
NM_FAIL_ON((_kh) >= _n || (_kt) >= _n, \
SKYWALK_KILL_REASON_BASIC_SANITY); \
/* user sanity checks */ \
if ((_kt) >= (_kh)) { \
/* want khead <= head <= ktail */ \
NM_FAIL_ON((_rh) < (_kh) || (_rh) > (_kt), \
SKYWALK_KILL_REASON_HEAD_OOB); \
} else { \
/* we need head outside ktail..khead */ \
NM_FAIL_ON((_rh) < (_kh) && (_rh) > (_kt), \
SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED); \
} \
NM_FAIL_ON((_ring)->ring_tail != (_kring)->ckr_rtail, \
SKYWALK_KILL_REASON_TAIL_MISMATCH); \
} while (0)
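/*
 * The wrap handling here mirrors _KR_TXRING_VALIDATE, with ckr_khead and
 * ckr_ktail as the reliable bounds; e.g. (illustrative) with khead = 5 and
 * ktail = 1 in an 8-slot ring, a head of 5, 6, 7, 0 or 1 passes while
 * 2, 3 or 4 is rejected.
 */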
/*
* Validate parameters in the ring/kring on entry for *_rxsync().
* Returns ring->ring_head if ok, kring->ckr_num_slots on error,
* in order to force a reinit.
*/
slot_idx_t
kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
struct proc *p)
{
#pragma unused(ch)
struct __user_channel_ring *ring = kring->ckr_ring;
slot_idx_t ckr_khead, ckr_ktail;
slot_idx_t head;
uint32_t byte_count = 0;
uint64_t err_reason = 0;
int slot_count;
VERIFY(sk_is_sync_protected());
/* assert that this routine is only called for user facing rings */
ASSERT(!KR_KERNEL_ONLY(kring));
ASSERT(kring->ckr_usds != NULL);
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
ring->ring_head, ring->ring_tail);
/*
* Before storing the new values, we should check they do not
	 * move backwards.  However, head is not an issue because the
	 * previous value is khead.
*/
head = ring->ring_head; /* read only once */
_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
/* # of reclaimed slots */
slot_count = head - kring->ckr_rhead;
if (slot_count < 0) {
slot_count += kring->ckr_num_slots;
}
/*
* Invoke nexus-specific RX prologue callback, which may detach
* and free any consumed packets. Configured in na_kr_create().
*/
if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
kring, head, &byte_count, &err_reason, p) != 0)) {
goto error;
}
/* update the user's view of slots & bytes transferred */
kr_update_user_stats(kring, slot_count, byte_count);
/* update the kernel view of ring */
kring->ckr_rhead = head;
return head;
error:
SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
CKRF_BITS, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
ring->ring_head, ring->ring_tail);
skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC);
return kring->ckr_num_slots;
}
/*
* Validate parameters on the ring/kring on entry for *_alloc_sync().
* Returns ring->ring_head if ok, kring->ckr_num_slots on error,
* in order to force a reinit.
*/
slot_idx_t
kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
struct __user_channel_ring *ring = kring->ckr_ring;
slot_idx_t ckr_khead, ckr_ktail;
slot_idx_t head;
uint64_t err_reason = 0;
VERIFY(sk_is_sync_protected());
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
head = ring->ring_head;
SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
head, ring->ring_tail);
/*
* Before storing the new values, we should check they do not
* move backwards. However, head is not an issue because the
	 * previous value is khead.
*/
_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
/* update the kernel view of ring */
kring->ckr_rhead = head;
return head;
error:
SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
CKRF_BITS, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
ring->ring_head, ring->ring_tail);
skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC);
return kring->ckr_num_slots;
}
/*
* Nexus-specific kr_txsync_finalize() callback.
*/
void
kr_txfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t head, struct proc *p)
{
#pragma unused(ch)
struct kern_pbufpool *pp = kring->ckr_pp;
slot_idx_t slot_idx;
uint32_t ph_cnt, i = 0;
int32_t ph_needed;
int err;
ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
/* use khead value from pre-sync time */
slot_idx = kring->ckr_khead_pre;
ph_needed = head - slot_idx;
if (ph_needed < 0) {
ph_needed += kring->ckr_num_slots;
}
if (ph_needed == 0) {
return;
}
ph_cnt = (uint32_t)ph_needed;
err = kern_pbufpool_alloc_batch(pp, 1, kring->ckr_scratch, &ph_cnt);
VERIFY(err == 0 && ph_cnt == (uint32_t)ph_needed);
/* recycle the transferred packets */
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
kern_packet_t ph;
if (KSD_VALID_METADATA(ksd)) {
goto next_slot;
}
ph = kring->ckr_scratch[i];
ASSERT(ph != 0);
kring->ckr_scratch[i] = 0;
++i;
/*
* Since this packet is freshly allocated and we need
* to have the flag set for the attach to succeed,
* just set it here rather than calling
* __packet_finalize().
*/
SK_PTR_ADDR_KQUM(ph)->qum_qflags |= QUM_F_FINALIZED;
KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
kr_externalize_metadata_internal(kring, pp->pp_max_frags,
SK_PTR_ADDR_KQUM(ph), p);
next_slot:
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
if (i != ph_cnt) {
kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
ph_cnt - i);
}
}
/*
* Nexus-specific kr_txsync_finalize() callback - user packet pool variant.
*/
void
kr_txfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t head, struct proc *p)
{
#pragma unused(ch, p)
slot_idx_t slot_idx;
uint32_t nfree = 0;
ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
/* use khead value from pre-sync time */
slot_idx = kring->ckr_khead_pre;
/* recycle the transferred packets */
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
if (KSD_VALID_METADATA(ksd)) {
/* detach and free the packet */
struct __kern_quantum *kqum = ksd->sd_qum;
(void) KR_SLOT_DETACH_METADATA(kring, ksd);
ASSERT(nfree < kring->ckr_num_slots);
kring->ckr_scratch[nfree++] = (uint64_t)kqum;
}
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
if (__probable(nfree > 0)) {
pp_free_packet_batch(kring->ckr_pp,
&kring->ckr_scratch[0], nfree);
}
}
/*
* Update kring and ring at the end of txsync.
*/
void
kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
struct proc *p)
{
slot_idx_t ckr_khead, ckr_ktail;
uint32_t slot_size;
int32_t slot_diff;
VERIFY(sk_is_sync_protected());
/* assert that this routine is only called for user facing rings */
ASSERT(!KR_KERNEL_ONLY(kring));
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
/*
* update userspace-facing channel statistics (# writable bytes/slots)
*
* Since the ring might be dynamically allocated, we can't rely on the
* tail pointer to calculate free TX space (the tail might be sitting
* at the edge of allocated ring space but be able to be pushed over
* into unallocated ring space).
*
* Instead, calculate free TX space by looking at what slots are
* available to the kernel for TX, and subtracting that from the total
* number of possible slots. This is effectively what userspace can
* write to.
*/
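	/*
	 * e.g. (illustrative) an 8-slot ring (ckr_lim = 7) with rhead = 5
	 * and khead = 2 has 3 slots still owned by the kernel, leaving
	 * 7 - 3 = 4 slots (and 4 * slot_size bytes) writable by userspace.
	 */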
slot_size = PP_BUF_SIZE_DEF(kring->ckr_pp);
slot_diff = kring->ckr_rhead - ckr_khead;
if (slot_diff < 0) {
slot_diff += kring->ckr_num_slots;
}
slot_diff = kring->ckr_lim - slot_diff;
kring->ckr_ready_slots = slot_diff;
kring->ckr_ready_bytes = slot_diff * slot_size;
/*
* Invoke nexus-specific TX finalize callback, which may recycle any
	 * transferred packets and/or externalize new ones.  Some nexus types
	 * don't set this callback.  Configured in na_kr_create().
*/
if (kring->ckr_finalize != NULL) {
kring->ckr_finalize(ch, kring, ckr_khead, p);
}
/* update ring tail/khead to what the kernel knows */
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
kring->ckr_rtail = ckr_ktail;
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
}
/*
* Nexus-specific kr_rxsync_finalize() callback.
*/
void
kr_rxfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t tail, struct proc *p)
{
#pragma unused(ch)
const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
slot_idx_t slot_idx = kring->ckr_rtail;
uint32_t byte_count = 0;
while (slot_idx != tail) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __kern_quantum *kqum = ksd->sd_qum;
/*
		 * The nexus provider should never leave an empty slot on the RX ring.
*/
VERIFY(kqum != NULL);
kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
ASSERT(!(KR_USD(kring, slot_idx)->sd_flags & ~SD_FLAGS_USER));
byte_count += kqum->qum_len;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
kring->ckr_ready_bytes += byte_count;
/* just recalculate slot count using pointer arithmetic */
int32_t slot_diff = tail - kring->ckr_rhead;
if (slot_diff < 0) {
slot_diff += kring->ckr_num_slots;
}
kring->ckr_ready_slots = slot_diff;
#if CONFIG_NEXUS_NETIF
/*
* If this is a channel opened directly to the netif nexus, provide
* it feedbacks on the number of packets and bytes consumed. This
* will drive the receive mitigation strategy.
*/
if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
slot_diff != 0 && byte_count != 0) {
kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
}
#endif /* CONFIG_NEXUS_NETIF */
}
/*
* Nexus-specific kr_rxsync_finalize() callback - user packet pool variant.
*/
void
kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
const slot_idx_t tail, struct proc *p)
{
const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
slot_idx_t slot_idx = kring->ckr_rtail;
struct kern_pbufpool *pp = kring->ckr_pp;
uint32_t byte_count = 0;
PP_LOCK(pp);
while (slot_idx != tail) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
struct __kern_quantum *kqum = ksd->sd_qum;
/*
		 * The nexus provider should never leave an empty slot on the RX ring.
*/
VERIFY(kqum != NULL);
/*
* The channel is operating in packet allocator
* mode, so add packet to the allocated list.
*/
pp_insert_upp_locked(pp, kqum, ch->ch_pid);
KSD_DETACH_METADATA(ksd);
		/* record the length so kr_rxsync_prologue() can account for ckr_ready_bytes */
USD_SET_LENGTH(usd, (uint16_t)kqum->qum_len);
kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
byte_count += kqum->qum_len;
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
PP_UNLOCK(pp);
kring->ckr_ready_bytes += byte_count;
/* just recalculate slot count using pointer arithmetic */
int32_t slot_diff = tail - kring->ckr_rhead;
if (slot_diff < 0) {
slot_diff += kring->ckr_num_slots;
}
kring->ckr_ready_slots = slot_diff;
#if CONFIG_NEXUS_NETIF
/*
* If this is a channel opened directly to the netif nexus, provide
* it feedbacks on the number of packets and bytes consumed. This
* will drive the receive mitigation strategy.
*/
if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
slot_diff != 0 && byte_count != 0) {
kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
}
#endif /* CONFIG_NEXUS_NETIF */
}
/*
* Update kring and ring at the end of rxsync
*/
void
kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
struct proc *p)
{
#pragma unused(ch, p)
slot_idx_t ckr_khead, ckr_ktail;
VERIFY(sk_is_sync_protected());
/* assert that this routine is only called for user facing rings */
ASSERT(!KR_KERNEL_ONLY(kring));
ASSERT(kring->ckr_usds != NULL);
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
/*
* Invoke nexus-specific RX finalize callback; set in na_kr_create().
*/
if (kring->ckr_finalize != NULL) {
kring->ckr_finalize(ch, kring, ckr_ktail, p);
}
/* update ring tail/khead to what the kernel knows */
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
kring->ckr_rtail = ckr_ktail;
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
}
void
kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
{
#pragma unused(p)
slot_idx_t ckr_khead, ckr_ktail;
VERIFY(sk_is_sync_protected());
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
/* update ring tail/khead to what the kernel knows */
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
kring->ckr_rtail = ckr_ktail;
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
*(uint32_t *)(uintptr_t)&kring->ckr_ring->ring_alloc_ws =
kring->ckr_alloc_ws;
SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u | ws %u",
sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail, kring->ckr_alloc_ws);
}
void
kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
{
#pragma unused(p)
slot_idx_t ckr_khead, ckr_ktail;
VERIFY(sk_is_sync_protected());
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
/* update ring tail/khead to what the kernel knows */
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
kring->ckr_rtail = ckr_ktail;
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
}
slot_idx_t
kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
struct __user_channel_ring *ring = kring->ckr_ring;
slot_idx_t ckr_khead, ckr_ktail;
slot_idx_t head, slot_idx;
uint64_t err_reason = 0;
ASSERT(kring->ckr_tx == NR_EV);
VERIFY(sk_is_sync_protected());
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
head = ring->ring_head;
SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
head, ring->ring_tail);
/*
* Before storing the new values, we should check they do not
* move backwards. However, head is not an issue because the
	 * previous value is khead.
*/
_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
/*
* Iterating through the slots just read by user-space;
* ckr_rhead -> ring_head
*/
slot_idx = kring->ckr_rhead;
while (slot_idx != head) {
struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
/*
		 * Ensure that the user has detached the packet from the slot.
*/
VERIFY(!KSD_VALID_METADATA(ksd));
if (__improbable(SD_VALID_METADATA(usd))) {
SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
"detached md %u kh %u kt %u | rh %u rt %u |"
" h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name,
SK_KVA(kring), slot_idx, usd->sd_md_idx,
ckr_khead, ckr_ktail, kring->ckr_rhead,
kring->ckr_rtail, ring->ring_head,
ring->ring_tail);
err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
goto error;
}
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
/* update the kernel view of ring */
kring->ckr_rhead = head;
return head;
error:
SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
CKRF_BITS, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
ring->ring_head, ring->ring_tail);
skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC);
return kring->ckr_num_slots;
}
void
kr_event_sync_finalize(struct kern_channel *ch,
struct __kern_channel_ring *kring, struct proc *p)
{
#pragma unused(ch)
struct kern_pbufpool *pp = kring->ckr_pp;
const uint32_t maxfrags = pp->pp_max_frags;
slot_idx_t ckr_khead, ckr_ktail, ckr_rhead;
struct __kern_slot_desc *ksd;
struct __user_slot_desc *usd;
struct __kern_quantum *kqum;
VERIFY(sk_is_sync_protected());
/* assert that this routine is only called for user facing rings */
ASSERT(!KR_KERNEL_ONLY(kring));
ASSERT(kring->ckr_usds != NULL);
ASSERT(kring->ckr_tx == NR_EV);
/* read these once and use local copies */
ckr_khead = kring->ckr_khead;
ckr_ktail = kring->ckr_ktail;
ckr_rhead = kring->ckr_rhead;
slot_idx_t slot_idx = kring->ckr_rtail;
PP_LOCK(pp);
while (slot_idx != ckr_ktail) {
ksd = KR_KSD(kring, slot_idx);
usd = KR_USD(kring, slot_idx);
kqum = ksd->sd_qum;
/*
* Add packet to the allocated list of user packet pool.
*/
pp_insert_upp_locked(pp, kqum, ch->ch_pid);
KSD_DETACH_METADATA(ksd);
kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
}
PP_UNLOCK(pp);
/* just recalculate slot count using pointer arithmetic */
int32_t slot_diff = ckr_ktail - ckr_rhead;
if (slot_diff < 0) {
slot_diff += kring->ckr_num_slots;
}
kring->ckr_ready_slots = slot_diff;
/* update ring tail/khead to what the kernel knows */
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
kring->ckr_rtail = ckr_ktail;
*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
"rh %u rt %u | h %u t %u", sk_proc_name_address(p),
sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
kring->ckr_rhead, kring->ckr_rtail,
kring->ckr_ring->ring_head,
kring->ckr_ring->ring_tail);
}
#undef NM_FAIL_ON
void
kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
slot_idx_t index)
{
const slot_idx_t lim = kring->ckr_lim;
slot_idx_t next_index = SLOT_NEXT(index, lim);
kring->ckr_khead = next_index;
/* reclaim */
kring->ckr_ktail = index;
}
/*
* *************************************************************************
* Checks on packet header offsets in kr_internalize_metadata
* *************************************************************************
*
* +----------+------------------------------+----------------------------+
* | | NEXUS_META_SUBTYPE_RAW | NEXUS_META_SUBTYPE_PAYLOAD |
* |----------+------------------------------+----------------------------+
* | buflet | (bdoff + len) <= dlim | (bdoff + len) <= dlim |
* |----------+------------------------------+----------------------------+
* | headroom | hr == bdoff && hr < bdlim | hr == 0 && bdoff == 0 |
* |----------+------------------------------+----------------------------+
 * | l2_len   | hr + l2_len < bdlim          | l2_len == 0                |
* |----------+------------------------------+----------------------------+
*/
int
kr_internalize_metadata(struct kern_channel *ch,
struct __kern_channel_ring *kring, const uint32_t maxfrags,
struct __kern_quantum *kqum, struct proc *p)
{
#pragma unused(kring, maxfrags, p)
struct __user_buflet *ubuf, *pubuf; /* user buflet */
struct __kern_buflet *kbuf, *pkbuf; /* kernel buflet */
struct __user_quantum *uqum; /* user source */
struct __user_packet *upkt;
struct __kern_packet *kpkt;
const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
uint32_t len = 0, bdoff, bdlim;
uint16_t bcnt = 0, bmax, i;
boolean_t dropped;
int err = 0;
/*
* Verify that the quantum/packet belongs to the same pp as
* the one used by the adapter, i.e. the packet must have
* been allocated from the same pp and attached to the kring.
*/
ASSERT(kqum->qum_pp == kring->ckr_pp);
_CASSERT(sizeof(uqum->qum_com) == sizeof(kqum->qum_com));
_CASSERT(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com));
uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
upkt = SK_PTR_ADDR_UPKT(uqum);
kpkt = SK_PTR_ADDR_KPKT(kqum);
DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring,
struct __kern_packet *, kpkt, struct __user_packet *, upkt);
SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx uqum 0x%llx -> kqum 0x%llx",
sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
SK_KVA(uqum), SK_KVA(kqum));
/* check if it's dropped before we internalize it */
dropped = ((uqum->qum_qflags & QUM_F_DROPPED) != 0);
/*
* Internalize common quantum metadata.
*
* For packet metadata, we trust the kernel copy for the buflet
* count and limit; any mismatch on the user copy will cause
* us to drop this packet.
*/
_QUM_INTERNALIZE(uqum, kqum);
/* if marked as dropped, don't bother going further */
if (__improbable(dropped)) {
SK_ERR("%s(%d) kring 0x%llx dropped",
sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring));
err = ERANGE;
goto done;
}
switch (md_type) {
case NEXUS_META_TYPE_PACKET:
/*
* Internalize common packet metadata.
*/
_PKT_INTERNALIZE(upkt, kpkt);
switch (md_subtype) {
case NEXUS_META_SUBTYPE_PAYLOAD:
/* sanitize link layer fields for payload mode */
kpkt->pkt_link_flags = 0;
break;
default:
break;
}
if (__probable(ch != NULL)) {
_UUID_COPY(kpkt->pkt_flowsrc_id,
ch->ch_info->cinfo_ch_id);
}
bcnt = upkt->pkt_bufs_cnt;
bmax = kpkt->pkt_bufs_max;
ASSERT(bmax == maxfrags);
if (__improbable((bcnt == 0) || (bcnt > bmax) ||
(upkt->pkt_bufs_max != bmax))) {
SK_ERR("%s(%d) kring 0x%llx bad bufcnt %d, %d, %d",
sk_proc_name_address(p), sk_proc_pid(p),
SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max);
err = ERANGE;
goto done;
}
break;
case NEXUS_META_TYPE_QUANTUM:
ASSERT(maxfrags == 1);
bcnt = bmax = 1;
break;
default:
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
ASSERT(bcnt != 0);
ubuf = pubuf = NULL;
kbuf = pkbuf = NULL;
/*
* Validate and internalize buflets.
*/
for (i = 0; i < bcnt; i++) {
_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
_CASSERT(offsetof(struct __kern_quantum, qum_com) == 0);
PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
ASSERT(kbuf != NULL);
if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
ubuf = __DECONST(struct __user_buflet *,
((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
} else {
ASSERT(i == 0);
ubuf = __DECONST(struct __user_buflet *,
&uqum->qum_buf[0]);
}
ASSERT(ubuf != NULL);
ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
ASSERT(kbuf->buf_dlim == _BUF_DLIM(kbuf, kqum->qum_pp));
ASSERT(kbuf->buf_addr != 0);
/*
* For now, user-facing pool does not support shared
* buffer, since otherwise the ubuf and kbuf buffer
* indices would not match. Assert this is the case.
*/
ASSERT(kbuf->buf_addr == (mach_vm_address_t)kbuf->buf_objaddr);
kbuf->buf_dlen = ubuf->buf_dlen;
kbuf->buf_doff = ubuf->buf_doff;
/*
		 * kernel and user metadata must use the same object index;
		 * also check the sanity of the buflet data offset and length.
*/
if (__improbable(!BUF_IN_RANGE(kbuf) ||
ubuf->buf_idx != kbuf->buf_idx)) {
kbuf->buf_dlen = kbuf->buf_doff = 0;
SK_ERR("%s(%d) kring 0x%llx bad bufidx 0x%x, 0x%x",
sk_proc_name_address(p), sk_proc_pid(p),
SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx);
err = ERANGE;
goto done;
}
/* save data offset from the first buflet */
if (pkbuf == NULL) {
bdoff = kbuf->buf_doff;
}
/* all good to go */
len += kbuf->buf_dlen;
pubuf = ubuf;
pkbuf = kbuf;
}
_CASSERT(offsetof(struct __kern_packet, pkt_length) ==
offsetof(struct __kern_packet, pkt_qum.qum_len));
if (__improbable(kpkt->pkt_length != len)) {
SK_ERR("%s(%d) kring 0x%llx bad pktlen %d, %d",
sk_proc_name_address(p), sk_proc_pid(p),
SK_KVA(kring), kpkt->pkt_length, len);
err = ERANGE;
goto done;
}
if ((err == 0) && (md_type == NEXUS_META_TYPE_PACKET)) {
bdlim = PP_BUF_SIZE_DEF(kqum->qum_pp);
switch (md_subtype) {
case NEXUS_META_SUBTYPE_RAW:
/*
* For a raw packet from user space we need to
* validate that headroom is sane and is in the
* first buflet.
*/
if (__improbable(kpkt->pkt_headroom != bdoff)) {
SK_ERR("%s(%d) kring 0x%llx bad headroom %d, %d",
sk_proc_name_address(p), sk_proc_pid(p),
SK_KVA(kring), kpkt->pkt_headroom, bdoff);
err = ERANGE;
goto done;
}
if (__improbable(kpkt->pkt_headroom +
kpkt->pkt_l2_len >= bdlim)) {
SK_ERR("%s(%d) kring 0x%llx bad headroom l2len %d, %d",
sk_proc_name_address(p), sk_proc_pid(p),
SK_KVA(kring), kpkt->pkt_l2_len, bdlim);
err = ERANGE;
goto done;
}
break;
case NEXUS_META_SUBTYPE_PAYLOAD:
/*
* For a payload packet from user space we need
* to validate that payload starts from 0 and L2
* length is 0.
*/
if (__improbable((kpkt->pkt_headroom != 0) ||
(kpkt->pkt_l2_len != 0))) {
SK_ERR("%s(%d) kring 0x%llx bad headroom "
"payload subtype %d headroom %d l2len %d",
sk_proc_name_address(p), sk_proc_pid(p),
SK_KVA(kring), SK_PTR_SUBTYPE(kpkt),
kpkt->pkt_headroom, kpkt->pkt_l2_len);
err = ERANGE;
goto done;
}
break;
default:
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
/* validate checksum offload properties */
if (__probable(PACKET_HAS_PARTIAL_CHECKSUM(kpkt))) {
uint16_t start = kpkt->pkt_csum_tx_start_off;
uint16_t stuff = kpkt->pkt_csum_tx_stuff_off;
if (__improbable(start > stuff ||
start > kpkt->pkt_length ||
(stuff + sizeof(uint16_t)) > kpkt->pkt_length)) {
SK_ERR("%s(%d) flags 0x%x start %u stuff %u "
"len %u", sk_proc_name_address(p),
sk_proc_pid(p), kpkt->pkt_csum_flags,
start, stuff, kpkt->pkt_length);
err = ERANGE;
goto done;
}
} else {
kpkt->pkt_csum_tx_start_off = 0;
kpkt->pkt_csum_tx_stuff_off = 0;
}
*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = bcnt;
}
done:
if (__probable(err == 0)) {
kqum->qum_len = len;
kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_FINALIZED);
} else {
kqum->qum_len = 0;
kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_DROPPED);
}
return err;
}
__attribute__((always_inline))
static inline void
kr_externalize_metadata_internal(struct __kern_channel_ring *kring,
const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
#pragma unused(kring, maxfrags, p)
struct __kern_buflet *kbuf, *pkbuf; /* kernel buflet */
struct __user_buflet *ubuf, *pubuf; /* user buflet */
struct __user_quantum *uqum; /* user destination */
struct __user_packet *upkt;
struct __kern_packet *kpkt;
const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
uint32_t len = 0;
uint16_t bcnt = 0, bmax, i;
/*
* Verify that the quantum/packet belongs to the same pp as
* the one used by the adapter, i.e. the packet must have
* been allocated from the same pp and attached to the kring.
*/
ASSERT(kqum->qum_pp == kring->ckr_pp);
ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED));
_CASSERT(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com));
_CASSERT(sizeof(kqum->qum_com) == sizeof(uqum->qum_com));
uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
upkt = SK_PTR_ADDR_UPKT(uqum);
kpkt = SK_PTR_ADDR_KPKT(kqum);
DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring,
struct __kern_packet *, kpkt, struct __user_packet *, upkt);
SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx kqum 0x%llx -> uqum 0x%llx",
sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
SK_KVA(kqum), SK_KVA(uqum));
/*
* Externalize common quantum metadata.
*/
_QUM_EXTERNALIZE(kqum, uqum);
switch (md_type) {
case NEXUS_META_TYPE_PACKET: {
bcnt = kpkt->pkt_bufs_cnt;
bmax = kpkt->pkt_bufs_max;
ASSERT(bmax == maxfrags);
ASSERT(bcnt <= bmax);
/*
* Externalize common packet metadata.
*/
_PKT_EXTERNALIZE(kpkt, upkt);
/* sanitize buflet count and limit (deconst) */
_CASSERT(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t));
_CASSERT(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t));
*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax;
*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt;
switch (md_subtype) {
case NEXUS_META_SUBTYPE_PAYLOAD:
/* sanitize link layer fields for payload mode */
upkt->pkt_headroom = 0;
upkt->pkt_link_flags = 0;
break;
default:
break;
}
break;
}
case NEXUS_META_TYPE_QUANTUM:
ASSERT(maxfrags == 1);
bcnt = bmax = 1;
break;
default:
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
ASSERT(bcnt != 0);
/*
	 * Special handling to externalize an empty packet buflet.
*/
kbuf = &kpkt->pkt_qum.qum_buf[0];
if (kbuf->buf_addr == 0) {
ubuf = __DECONST(struct __user_buflet *,
&kpkt->pkt_qum.qum_user->qum_buf[0]);
UBUF_INIT(kbuf, ubuf);
}
kbuf = pkbuf = NULL;
ubuf = pubuf = NULL;
/*
* Externalize buflets.
*/
for (i = 0; i < bcnt; i++) {
_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
ASSERT(kbuf != NULL);
if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
ubuf = __DECONST(struct __user_buflet *,
((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
} else {
ASSERT(i == 0);
ubuf = __DECONST(struct __user_buflet *,
&kpkt->pkt_qum.qum_user->qum_buf[0]);
}
ASSERT(ubuf != NULL);
ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
ASSERT(BUF_IN_RANGE(kbuf));
KBUF_EXTERNALIZE(kbuf, ubuf, kqum->qum_pp);
/* all good to go */
len += kbuf->buf_dlen;
pkbuf = kbuf;
pubuf = ubuf;
}
uqum->qum_len = len;
uqum->qum_qflags |= QUM_F_FINALIZED;
/*
* XXX: adi@apple.com -- do this during reclaim instead?
*/
kqum->qum_qflags &= ~QUM_F_INTERNALIZED;
}
void
kr_externalize_metadata(struct __kern_channel_ring *kring,
const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
}