/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include
#define _FN_KPRINTF
#include <pexpert/pexpert.h>            /* for PE_parse_boot_argn */
#include <libkern/OSDebug.h>            /* for OSBacktrace */
#include <kern/sched_prim.h>            /* for assert_wait */
#include

/*
 * Memory allocator with per-CPU caching (magazines), derived from the kmem
 * magazine concept and implementation as described in the following paper:
 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
 *
 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
 * reserved. Use is subject to license terms.
 *
 * This derivative differs from the original kmem slab allocator, in that:
 *
 * a) There is always a discrete bufctl per object, even for small sizes.
 *    This increases the overhead, but is necessary as Skywalk objects
 *    coming from the slab may be shared (RO or RW) with userland; therefore
 *    embedding the KVA pointer linkage in freed objects is a non-starter.
 *
 * b) Writing patterns to the slab at slab creation or destruction time
 *    (when debugging is enabled) is not implemented, as the object may
 *    be shared (RW) with userland and thus we cannot panic upon pattern
 *    mismatch episodes.  This can be relaxed so that we conditionally
 *    verify the pattern for kernel-only memory.
 *
 * This derivative also differs from Darwin's mcache allocator (which itself
 * is a derivative of the original kmem slab allocator), in that:
 *
 * 1) The slab layer is internal to skmem_cache, unlike mcache's external
 *    slab layer required to support mbufs.  skmem_cache also supports
 *    constructing and deconstructing objects, while mcache does not.
 *    This brings skmem_cache's model closer to that of the original
 *    kmem slab allocator.
 *
 * 2) mcache allows for batch allocation and free by way of chaining the
 *    objects together using a linked list.  This requires using a part
 *    of the object to act as the linkage, which is against Skywalk's
 *    requirements of not exposing any KVA pointer to userland.  Although
 *    this is supported by skmem_cache, chaining is only possible if the
 *    region is not mapped to userland.  That implies that kernel-only
 *    objects can be chained provided the cache is created with batching
 *    mode enabled, and that the object is large enough to contain the
 *    skmem_obj structure.
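 *
 *    As an illustrative sketch only (not code from this file; "skr" here
 *    stands for some previously created kernel-only, non-mappable region,
 *    and the sizes are made up), such a batching cache could be exercised
 *    roughly as follows:
 *
 *      struct skmem_obj *list = NULL;
 *      struct skmem_cache *skm = skmem_cache_create("example", 256, 0,
 *          NULL, NULL, NULL, NULL, skr, SKMEM_CR_BATCH);
 *      uint32_t got = skmem_cache_batch_alloc(skm, &list, 32, SKMEM_SLEEP);
 *      ... consume up to "got" objects chained through mo_next ...
 *      skmem_cache_batch_free(skm, list);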
 *
 * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
 * implements features that are required by Skywalk.  In addition to being
 * aware of userland access on the buffers, it also supports mirrored backend
 * memory regions.  This allows a cache to manage two independent memory
 * regions, such that allocating/freeing an object from/to one results in
 * allocating/freeing a shadow object in another, thus guaranteeing that both
 * objects share the same lifetime.
 */

static uint32_t ncpu;                           /* total # of initialized CPUs */

static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp,
    &skmem_lock_attr);
static struct thread *skmem_lock_owner = THREAD_NULL;

static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");

#define SKMEM_CACHE_LOCK() do {                 \
        lck_mtx_lock(&skmem_cache_lock);        \
        skmem_lock_owner = current_thread();    \
} while (0)
#define SKMEM_CACHE_UNLOCK() do {               \
        skmem_lock_owner = THREAD_NULL;         \
        lck_mtx_unlock(&skmem_cache_lock);      \
} while (0)
#define SKMEM_CACHE_LOCK_ASSERT_HELD()          \
        LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()       \
        LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)

#define SKM_SLAB_LOCK(_skm)                     \
        lck_mtx_lock(&(_skm)->skm_sl_lock)
#define SKM_SLAB_LOCK_ASSERT_HELD(_skm)         \
        LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm)      \
        LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_SLAB_UNLOCK(_skm)                   \
        lck_mtx_unlock(&(_skm)->skm_sl_lock)

#define SKM_DEPOT_LOCK(_skm)                    \
        lck_mtx_lock(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_SPIN(_skm)               \
        lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_CONVERT_LOCK(_skm)            \
        lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_TRY(_skm)                \
        lck_mtx_try_lock(&(_skm)->skm_dp_lock)
#define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)        \
        LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)     \
        LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_DEPOT_UNLOCK(_skm)                  \
        lck_mtx_unlock(&(_skm)->skm_dp_lock)

#define SKM_RESIZE_LOCK(_skm)                   \
        lck_mtx_lock(&(_skm)->skm_rs_lock)
#define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)       \
        LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)    \
        LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_RESIZE_UNLOCK(_skm)                 \
        lck_mtx_unlock(&(_skm)->skm_rs_lock)

#define SKM_CPU_LOCK(_cp)                       \
        lck_mtx_lock(&(_cp)->cp_lock)
#define SKM_CPU_LOCK_SPIN(_cp)                  \
        lck_mtx_lock_spin(&(_cp)->cp_lock)
#define SKM_CPU_CONVERT_LOCK(_cp)               \
        lck_mtx_convert_spin(&(_cp)->cp_lock)
#define SKM_CPU_LOCK_ASSERT_HELD(_cp)           \
        LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)        \
        LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_CPU_UNLOCK(_cp)                     \
        lck_mtx_unlock(&(_cp)->cp_lock)

#define SKM_ZONE_MAX            256

static struct zone *skm_zone;                   /* zone for skmem_cache */

static struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
static struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
static unsigned int bc_size;                    /* size of bufctl */

/*
 * Magazine types (one per row.)
 *
 * The first column defines the number of objects that the magazine can hold.
* Using that number, we derive the effective number: the aggregate count of * object pointers, plus 2 pointers (skmem_mag linkage + magazine type). * This would result in an object size that is aligned on the CPU cache * size boundary; the exception to this is the KASAN mode where the size * would be larger due to the redzone regions. * * The second column defines the alignment of the magazine. Because each * magazine is used at the CPU-layer cache, we need to ensure there is no * false sharing across the CPUs, and align the magazines to the maximum * cache alignment size, for simplicity. The value of 0 may be used to * indicate natural pointer size alignment. * * The third column defines the starting magazine type for a given cache, * determined at the cache's creation time based on its chunk size. * * The fourth column defines the magazine type limit for a given cache. * Magazine resizing will only occur if the chunk size is less than this. */ static struct skmem_magtype skmem_magtype[] = { #if defined(__LP64__) { .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8, .mt_cache = NULL, .mt_cname = "" }, { .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0, .mt_cache = NULL, .mt_cname = "" }, #else /* !__LP64__ */ { .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0, .mt_cache = NULL, .mt_cname = "" }, #endif /* !__LP64__ */ }; /* * Hash table bounds. Start with the initial value, and rescale up to * the specified limit. Ideally we don't need a limit, but in practice * this helps guard against runaways. These values should be revisited * in future and be adjusted as needed. */ #define SKMEM_CACHE_HASH_INITIAL 64 /* initial hash table size */ #define SKMEM_CACHE_HASH_LIMIT 8192 /* hash table size limit */ #define SKMEM_CACHE_HASH_INDEX(_a, _s, _m) (((_a) >> (_s)) & (_m)) #define SKMEM_CACHE_HASH(_skm, _buf) \ (&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \ (_skm)->skm_hash_shift, (_skm)->skm_hash_mask)]) /* * The last magazine type. 
*/ static struct skmem_magtype *skmem_cache_magsize_last; static TAILQ_HEAD(, skmem_cache) skmem_cache_head; static boolean_t skmem_cache_ready; static int skmem_slab_alloc_locked(struct skmem_cache *, struct skmem_obj_info *, struct skmem_obj_info *, uint32_t); static void skmem_slab_free_locked(struct skmem_cache *, void *); static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *, struct skmem_obj_info *, struct skmem_obj_info *, uint32_t); static void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *); static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t); static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *); static int skmem_magazine_ctor(struct skmem_obj_info *, struct skmem_obj_info *, void *, uint32_t); static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *, int); static uint32_t skmem_depot_batch_alloc(struct skmem_cache *, struct skmem_maglist *, uint32_t *, struct skmem_mag **, uint32_t); static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *, uint32_t *, struct skmem_mag *); static void skmem_depot_ws_update(struct skmem_cache *); static void skmem_depot_ws_zero(struct skmem_cache *); static void skmem_depot_ws_reap(struct skmem_cache *); static void skmem_cache_magazine_purge(struct skmem_cache *); static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t); static void skmem_cache_magazine_resize(struct skmem_cache *); static void skmem_cache_hash_rescale(struct skmem_cache *); static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int); static void skmem_cpu_batch_reload(struct skmem_cpu_cache *, struct skmem_mag *, int); static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t), uint32_t); static void skmem_cache_reclaim(struct skmem_cache *, uint32_t); static void skmem_cache_reap_start(void); static void skmem_cache_reap_done(void); static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t); static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t); static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t); static void skmem_cache_resize_exit(struct skmem_cache *); static void skmem_audit_bufctl(struct skmem_bufctl *); static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *); static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS; SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache", "Skywalk cache statistics"); static volatile uint32_t skmem_cache_reaping; static thread_call_t skmem_cache_reap_tc; static thread_call_t skmem_cache_update_tc; extern kern_return_t thread_terminate(thread_t); extern unsigned int ml_wait_max_cpus(void); #define SKMEM_DEBUG_NOMAGAZINES 0x1 /* disable magazines layer */ #define SKMEM_DEBUG_AUDIT 0x2 /* audit transactions */ #define SKMEM_DEBUG_MASK (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT) #if DEBUG static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT; #else /* !DEBUG */ static uint32_t skmem_debug = 0; #endif /* !DEBUG */ static uint32_t skmem_clear_min = 0; /* clear on free threshold */ #define SKMEM_CACHE_UPDATE_INTERVAL 11 /* 11 seconds */ static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL; #define SKMEM_DEPOT_CONTENTION 3 /* max failed trylock per interval */ static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION; /* * Too big a value will cause overflow and thus trip the assertion; the * idea here is to 
set an upper limit for the time that a particular * thread is allowed to perform retries before we give up and panic. */ #define SKMEM_SLAB_MAX_BACKOFF (20 * USEC_PER_SEC) /* seconds */ /* * Threshold (in msec) after which we reset the exponential backoff value * back to its (random) initial value. Note that we allow the actual delay * to be at most twice this value. */ #define SKMEM_SLAB_BACKOFF_THRES 1024 /* up to ~2 sec (2048 msec) */ /* * To reduce the likelihood of global synchronization between threads, * we use some random value to start the exponential backoff. */ #define SKMEM_SLAB_BACKOFF_RANDOM 4 /* range is [1,4] msec */ #if (DEVELOPMENT || DEBUG) SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval, SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval"); SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention, CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention, SKMEM_DEPOT_CONTENTION, "Depot contention"); static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL; /* * Called by skmem_test_start() to set the update interval. */ void skmem_cache_test_start(uint32_t i) { skmem_cache_update_interval_saved = skmem_cache_update_interval; skmem_cache_update_interval = i; } /* * Called by skmem_test_stop() to restore the update interval. */ void skmem_cache_test_stop(void) { skmem_cache_update_interval = skmem_cache_update_interval_saved; } #endif /* (DEVELOPMENT || DEBUG) */ #define SKMEM_TAG_BUFCTL_HASH "com.apple.skywalk.bufctl.hash" static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH); #define SKMEM_TAG_CACHE_MIB "com.apple.skywalk.cache.mib" static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB); static int __skmem_cache_pre_inited = 0; static int __skmem_cache_inited = 0; /* * Called before skmem_region_init(). */ void skmem_cache_pre_init(void) { vm_size_t skm_size; ASSERT(!__skmem_cache_pre_inited); ncpu = ml_wait_max_cpus(); /* allocate extra in case we need to manually align the pointer */ if (skm_zone == NULL) { skm_size = SKMEM_CACHE_SIZE(ncpu); #if KASAN /* * When KASAN is enabled, the zone allocator adjusts the * element size to include the redzone regions, in which * case we assume that the elements won't start on the * alignment boundary and thus need to do some fix-ups. * These include increasing the effective object size * which adds at least 136 bytes to the original size, * as computed by skmem_region_params_config() above. */ skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX); #endif /* KASAN */ skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX); skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size, ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE); } TAILQ_INIT(&skmem_cache_head); __skmem_cache_pre_inited = 1; } /* * Called after skmem_region_init(). 
 */
void
skmem_cache_init(void)
{
        uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
        struct skmem_magtype *mtp;
        uint32_t i;

        _CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);
        _CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
        _CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
        _CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
        _CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
        _CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
        _CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
        _CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);

        ASSERT(__skmem_cache_pre_inited);
        ASSERT(!__skmem_cache_inited);

        PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
        skmem_debug &= SKMEM_DEBUG_MASK;

#if (DEVELOPMENT || DEBUG)
        PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
            sizeof(skmem_clear_min));
#endif /* (DEVELOPMENT || DEBUG) */
        if (skmem_clear_min == 0) {
                /* zeroing 2 CPU cache lines practically comes for free */
                skmem_clear_min = 2 * cpu_cache_line_size;
        } else {
                /* round it up to CPU cache line size */
                skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
                    cpu_cache_line_size);
        }

        /* create a cache for buffer control structures */
        if (skmem_debug & SKMEM_DEBUG_AUDIT) {
                bc_size = sizeof(struct skmem_bufctl_audit);
                skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
                    bc_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
                    NULL, 0);
        } else {
                bc_size = sizeof(struct skmem_bufctl);
                skmem_bufctl_cache = skmem_cache_create("bufctl",
                    bc_size, sizeof(uint64_t), NULL, NULL, NULL, NULL,
                    NULL, 0);
        }

        /* create a cache for slab structures */
        skmem_slab_cache = skmem_cache_create("slab",
            sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
            NULL, NULL, 0);

        /*
         * Go through the magazine type table and create a cache for each.
         */
        for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
                mtp = &skmem_magtype[i];

                if (mtp->mt_align != 0 &&
                    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
                    mtp->mt_align < (int)cpu_cache_line_size)) {
                        panic("%s: bad alignment %d", __func__,
                            mtp->mt_align);
                        /* NOTREACHED */
                        __builtin_unreachable();
                }

                (void) snprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
                    "mg.%d", mtp->mt_magsize);

                /* create a cache for this magazine type */
                mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
                    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
                    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);

                /* remember the last magazine type */
                skmem_cache_magsize_last = mtp;
        }

        VERIFY(skmem_cache_magsize_last != NULL);
        VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
        VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);

        /*
         * Allocate thread calls for cache reap and update operations.
         */
        skmem_cache_reap_tc =
            thread_call_allocate_with_options(skmem_cache_reap_func,
            NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
        skmem_cache_update_tc =
            thread_call_allocate_with_options(skmem_cache_update_func,
            NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
        if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
                panic("%s: thread_call_allocate failed", __func__);
                /* NOTREACHED */
                __builtin_unreachable();
        }

        /*
         * We're ready; go through existing skmem_cache entries
         * (if any) and enable the magazines layer for each.
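         *
         * Conceptually (a rough sketch; the real skmem_cache_applyall() is
         * defined further below and may differ in detail), this amounts to
         * walking the global cache list under the cache lock:
         *
         *      SKMEM_CACHE_LOCK();
         *      TAILQ_FOREACH(skm, &skmem_cache_head, skm_link)
         *              skmem_cache_magazine_enable(skm, 0);
         *      SKMEM_CACHE_UNLOCK();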
*/ skmem_cache_applyall(skmem_cache_magazine_enable, 0); skmem_cache_ready = TRUE; /* and start the periodic cache update machinery */ skmem_dispatch(skmem_cache_update_tc, NULL, (skmem_cache_update_interval * NSEC_PER_SEC)); __skmem_cache_inited = 1; } void skmem_cache_fini(void) { struct skmem_magtype *mtp; uint32_t i; if (__skmem_cache_inited) { ASSERT(TAILQ_EMPTY(&skmem_cache_head)); for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) { mtp = &skmem_magtype[i]; skmem_cache_destroy(mtp->mt_cache); mtp->mt_cache = NULL; } skmem_cache_destroy(skmem_slab_cache); skmem_slab_cache = NULL; skmem_cache_destroy(skmem_bufctl_cache); skmem_bufctl_cache = NULL; if (skmem_cache_reap_tc != NULL) { (void) thread_call_cancel_wait(skmem_cache_reap_tc); (void) thread_call_free(skmem_cache_reap_tc); skmem_cache_reap_tc = NULL; } if (skmem_cache_update_tc != NULL) { (void) thread_call_cancel_wait(skmem_cache_update_tc); (void) thread_call_free(skmem_cache_update_tc); skmem_cache_update_tc = NULL; } __skmem_cache_inited = 0; } if (__skmem_cache_pre_inited) { if (skm_zone != NULL) { zdestroy(skm_zone); skm_zone = NULL; } __skmem_cache_pre_inited = 0; } } /* * Create a cache. */ struct skmem_cache * skmem_cache_create(const char *name, size_t bufsize, size_t bufalign, skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim, void *private, struct skmem_region *region, uint32_t cflags) { boolean_t pseudo = (region == NULL); struct skmem_magtype *mtp; struct skmem_cache *skm; void *buf; size_t segsize; size_t chunksize; size_t objsize; size_t objalign; uint32_t i, cpuid; /* enforce 64-bit minimum alignment for buffers */ if (bufalign == 0) { bufalign = SKMEM_CACHE_ALIGN; } bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN); /* enforce alignment to be a power of 2 */ VERIFY(powerof2(bufalign)); if (region == NULL) { struct skmem_region_params srp; /* batching is currently not supported on pseudo regions */ VERIFY(!(cflags & SKMEM_CR_BATCH)); srp = *skmem_get_default(SKMEM_REGION_INTRINSIC); ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO); /* objalign is always equal to bufalign */ srp.srp_align = objalign = bufalign; srp.srp_r_obj_cnt = 1; srp.srp_r_obj_size = (uint32_t)bufsize; skmem_region_params_config(&srp); /* allocate region for intrinsics */ region = skmem_region_create(name, &srp, NULL, NULL, NULL); VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign)); VERIFY(objalign == region->skr_align); #if KASAN /* * When KASAN is enabled, the zone allocator adjusts the * element size to include the redzone regions, in which * case we assume that the elements won't start on the * alignment boundary and thus need to do some fix-ups. * These include increasing the effective object size * which adds at least 16 bytes to the original size, * as computed by skmem_region_params_config() above. */ VERIFY(region->skr_c_obj_size >= (bufsize + sizeof(uint64_t) + bufalign)); #endif /* KASAN */ /* enable magazine resizing by default */ cflags |= SKMEM_CR_DYNAMIC; /* * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg, * even though it's a no-op since the work is done * at the zone layer instead. */ cflags |= SKMEM_CR_CLEARONFREE; } else { objalign = region->skr_align; } ASSERT(region != NULL); ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED)); segsize = region->skr_seg_size; ASSERT(bufalign <= segsize); buf = zalloc_flags(skm_zone, Z_WAITOK | Z_ZERO); #if KASAN /* * In case we didn't get a cache-aligned memory, round it up * accordingly. 
This is needed in order to get the rest of * structure members aligned properly. It also means that * the memory span gets shifted due to the round up, but it * is okay since we've allocated extra space for this. */ skm = (struct skmem_cache *) P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX); void **pbuf = (void **)((intptr_t)skm - sizeof(void *)); *pbuf = buf; #else /* !KASAN */ /* * We expect that the zone allocator would allocate elements * rounded up to the requested alignment based on the object * size computed in skmem_cache_pre_init() earlier, and * 'skm' is therefore the element address itself. */ skm = buf; #endif /* !KASAN */ VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX)); if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) || (cflags & SKMEM_CR_NOMAGAZINES)) { /* * Either the caller insists that this cache should not * utilize magazines layer, or that the system override * to disable magazines layer on all caches has been set. */ skm->skm_mode |= SKM_MODE_NOMAGAZINES; } else { /* * Region must be configured with enough objects * to take into account objects at the CPU layer. */ ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES)); } if (cflags & SKMEM_CR_DYNAMIC) { /* * Enable per-CPU cache magazine resizing. */ skm->skm_mode |= SKM_MODE_DYNAMIC; } /* region stays around after defunct? */ if (region->skr_mode & SKR_MODE_NOREDIRECT) { skm->skm_mode |= SKM_MODE_NOREDIRECT; } if (cflags & SKMEM_CR_BATCH) { /* * Batch alloc/free involves storing the next object * pointer at the beginning of each object; this is * okay for kernel-only regions, but not those that * are mappable to user space (we can't leak kernel * addresses). */ _CASSERT(offsetof(struct skmem_obj, mo_next) == 0); VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK)); /* batching is currently not supported on pseudo regions */ VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO)); /* validate object size */ VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj)); skm->skm_mode |= SKM_MODE_BATCH; } uuid_generate_random(skm->skm_uuid); (void) snprintf(skm->skm_name, sizeof(skm->skm_name), "%s.%s", SKMEM_CACHE_PREFIX, name); skm->skm_bufsize = bufsize; skm->skm_bufalign = bufalign; skm->skm_objalign = objalign; skm->skm_ctor = ctor; skm->skm_dtor = dtor; skm->skm_reclaim = reclaim; skm->skm_private = private; skm->skm_slabsize = segsize; skm->skm_region = region; /* callee holds reference */ skmem_region_slab_config(region, skm, true); objsize = region->skr_c_obj_size; skm->skm_objsize = objsize; if (pseudo) { /* * Release reference from skmem_region_create() * since skm->skm_region holds one now. */ ASSERT(region->skr_mode & SKR_MODE_PSEUDO); skmem_region_release(region); skm->skm_mode |= SKM_MODE_PSEUDO; skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked; skm->skm_slab_free = skmem_slab_free_pseudo_locked; } else { skm->skm_slab_alloc = skmem_slab_alloc_locked; skm->skm_slab_free = skmem_slab_free_locked; /* auditing was requested? (normal regions only) */ if (skmem_debug & SKMEM_DEBUG_AUDIT) { ASSERT(bc_size == sizeof(struct skmem_bufctl_audit)); skm->skm_mode |= SKM_MODE_AUDIT; } } /* * Clear upon free (to slab layer) as long as the region is * not marked as read-only for kernel, and if the chunk size * is within the threshold or if the caller had requested it. 
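         *
         * For example (illustrative numbers only): with 128-byte CPU cache
         * lines, skmem_clear_min defaults to 256 in skmem_cache_init(), so
         * a cache whose objects are 256 bytes or smaller is cleared on free
         * automatically, while a larger one is cleared only if the caller
         * passed SKMEM_CR_CLEARONFREE.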
*/ if (!(region->skr_mode & SKR_MODE_KREADONLY)) { if (skm->skm_objsize <= skmem_clear_min || (cflags & SKMEM_CR_CLEARONFREE)) { skm->skm_mode |= SKM_MODE_CLEARONFREE; } } chunksize = bufsize; if (bufalign >= SKMEM_CACHE_ALIGN) { chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN); } chunksize = P2ROUNDUP(chunksize, bufalign); if (chunksize > objsize) { panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu", __func__, bufsize, chunksize, objsize); /* NOTREACHED */ __builtin_unreachable(); } ASSERT(chunksize != 0); skm->skm_chunksize = chunksize; lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr); TAILQ_INIT(&skm->skm_sl_partial_list); TAILQ_INIT(&skm->skm_sl_empty_list); /* allocated-address hash table */ skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL; skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT; skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt, skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash); skm->skm_hash_mask = (skm->skm_hash_initial - 1); skm->skm_hash_shift = flsll(chunksize) - 1; for (i = 0; i < (skm->skm_hash_mask + 1); i++) { SLIST_INIT(&skm->skm_hash_table[i].bcb_head); } lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr); /* find a suitable magazine type for this chunk size */ for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) { continue; } skm->skm_magtype = mtp; if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) { skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize; } /* * Initialize the CPU layer. Each per-CPU structure is aligned * on the CPU cache line boundary to prevent false sharing. */ lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr); for (cpuid = 0; cpuid < ncpu; cpuid++) { struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid]; VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX)); lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp, &skmem_lock_attr); ccp->cp_rounds = -1; ccp->cp_prounds = -1; } SKMEM_CACHE_LOCK(); TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link); SKMEM_CACHE_UNLOCK(); SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b", skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS); SK_DF(SK_VERB_MEM_CACHE, " bufsz %u bufalign %u chunksz %u objsz %u slabsz %u", (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign, (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize, (uint32_t)skm->skm_slabsize); if (skmem_cache_ready) { skmem_cache_magazine_enable(skm, 0); } if (cflags & SKMEM_CR_RECLAIM) { skm->skm_mode |= SKM_MODE_RECLAIM; } return skm; } /* * Destroy a cache. */ void skmem_cache_destroy(struct skmem_cache *skm) { uint32_t cpuid; SKMEM_CACHE_LOCK(); TAILQ_REMOVE(&skmem_cache_head, skm, skm_link); SKMEM_CACHE_UNLOCK(); ASSERT(skm->skm_rs_busy == 0); ASSERT(skm->skm_rs_want == 0); /* purge all cached objects for this cache */ skmem_cache_magazine_purge(skm); /* * Panic if we detect there are unfreed objects; the caller * destroying this cache is responsible for ensuring that all * allocated objects have been freed prior to getting here. 
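         *
         * In other words, a well-behaved client tears things down in this
         * order (sketch only; the names are made up):
         *
         *      skmem_cache_free(skm, obj);     each outstanding object first
         *      skmem_cache_destroy(skm);       only then the cache itself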
*/ SKM_SLAB_LOCK(skm); if (skm->skm_sl_bufinuse != 0) { panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__, skm->skm_name, (void *)skm, skm->skm_sl_bufinuse); /* NOTREACHED */ __builtin_unreachable(); } ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list)); ASSERT(skm->skm_sl_partial == 0); ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list)); ASSERT(skm->skm_sl_empty == 0); skm->skm_reclaim = NULL; skm->skm_ctor = NULL; skm->skm_dtor = NULL; SKM_SLAB_UNLOCK(skm); if (skm->skm_hash_table != NULL) { #if (DEBUG || DEVELOPMENT) for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) { ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head)); } #endif /* DEBUG || DEVELOPMENT */ sk_free_type_array(struct skmem_bufctl_bkt, skm->skm_hash_mask + 1, skm->skm_hash_table); skm->skm_hash_table = NULL; } for (cpuid = 0; cpuid < ncpu; cpuid++) { lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock, &skmem_cpu_lock_grp); } lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp); lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp); lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp); SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx", skm->skm_name, SK_KVA(skm)); /* callee releases reference */ skmem_region_slab_config(skm->skm_region, skm, false); skm->skm_region = NULL; #if KASAN /* get the original address since we're about to free it */ void **pbuf = (void **)((intptr_t)skm - sizeof(void *)); skm = *pbuf; #endif /* KASAN */ zfree(skm_zone, skm); } /* * Create a slab. */ static struct skmem_slab * skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag) { struct skmem_region *skr = skm->skm_region; uint32_t objsize, chunks; size_t slabsize = skm->skm_slabsize; struct skmem_slab *sl; struct sksegment *sg, *sgm; char *buf, *bufm, *slab, *slabm; /* * Allocate a segment (a slab at our layer) from the region. */ slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag); if (slab == NULL) { goto rg_alloc_failure; } if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) { goto slab_alloc_failure; } ASSERT(sg != NULL); ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index); bzero(sl, sizeof(*sl)); sl->sl_cache = skm; sl->sl_base = buf = slab; sl->sl_basem = bufm = slabm; ASSERT(skr->skr_c_obj_size <= UINT32_MAX); objsize = (uint32_t)skr->skr_c_obj_size; ASSERT(skm->skm_objsize == objsize); ASSERT((slabsize / objsize) <= UINT32_MAX); sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize); sl->sl_seg = sg; sl->sl_segm = sgm; /* * Create one or more buffer control structures for the slab, * each one tracking a chunk of raw object from the segment, * and insert these into the slab's list of buffer controls. 
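         *
         * For example (made-up sizes): a 32 KB segment backing a cache whose
         * skm_objsize is 2 KB yields sl_chunks = 32768 / 2048 = 16 buffer
         * controls, with bc_idx running 0..15 and bc_addr advancing by the
         * object size (2048 bytes) for each successive chunk.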
*/ ASSERT(chunks > 0); while (chunks != 0) { struct skmem_bufctl *bc; bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP); if (bc == NULL) { goto bufctl_alloc_failure; } bzero(bc, bc_size); bc->bc_addr = buf; bc->bc_addrm = bufm; bc->bc_slab = sl; bc->bc_idx = (sl->sl_chunks - chunks); if (skr->skr_mode & SKR_MODE_SHAREOK) { bc->bc_flags |= SKMEM_BUFCTL_SHAREOK; } SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link); bc->bc_lim = objsize; buf += objsize; if (bufm != NULL) { bufm += objsize; } --chunks; } SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx", SK_KVA(skm), SK_KVA(sl)); SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index, SK_KVA(slab), SK_KVA(slab + objsize)); return sl; bufctl_alloc_failure: skmem_slab_destroy(skm, sl); slab_alloc_failure: skmem_region_free(skr, slab, slabm); rg_alloc_failure: os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed); return NULL; } /* * Destroy a slab. */ static void skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl) { struct skmem_bufctl *bc, *tbc; void *slab = sl->sl_base; void *slabm = sl->sl_basem; ASSERT(sl->sl_refcnt == 0); SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx", SK_KVA(skm), SK_KVA(sl)); SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index, SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize)); /* * Go through the slab's list of buffer controls and free * them, and then free the slab itself back to its cache. */ SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) { SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link); skmem_cache_free(skmem_bufctl_cache, bc); } skmem_cache_free(skmem_slab_cache, sl); /* and finally free the segment back to the backing region */ skmem_region_free(skm->skm_region, slab, slabm); } /* * Allocate a raw object from the (locked) slab layer. Normal region variant. */ static int skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag) { struct skmem_bufctl_bkt *bcb; struct skmem_bufctl *bc; struct skmem_slab *sl; uint32_t retries = 0; uint64_t boff_total = 0; /* in usec */ uint64_t boff = 0; /* in msec */ boolean_t new_slab; void *buf; #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) vm_offset_t tagged_address; /* address tagging */ struct skmem_region *region; /* region source for this slab */ #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ /* this flag is not for the caller to set */ VERIFY(!(skmflag & SKMEM_FAILOK)); /* * A slab is either in a partially-allocated list (at least it has * a free object available), or is in the empty list (everything * has been allocated.) If we can't find a partially-allocated * slab, then we need to allocate a slab (segment) from the region. */ again: SKM_SLAB_LOCK_ASSERT_HELD(skm); sl = TAILQ_FIRST(&skm->skm_sl_partial_list); if (sl == NULL) { uint32_t flags = skmflag; boolean_t retry; ASSERT(skm->skm_sl_partial == 0); SKM_SLAB_UNLOCK(skm); if (!(flags & SKMEM_NOSLEEP)) { /* * Pick up a random value to start the exponential * backoff, if this is the first round, or if the * current value is over the threshold. Otherwise, * double the backoff value. 
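                         *
                         * As a concrete illustration (the value is random at
                         * run time): an initial boff of 3 msec progresses as
                         * 3, 6, 12, ..., 1536 msec; once the current value
                         * exceeds SKMEM_SLAB_BACKOFF_THRES (1024) it is
                         * re-randomized to the [1,4] msec range, and a
                         * sleeping-mode caller panics only after boff_total
                         * has accumulated more than SKMEM_SLAB_MAX_BACKOFF
                         * (20 seconds' worth) of waiting.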
*/ if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) { read_frandom(&boff, sizeof(boff)); boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1; ASSERT(boff > 0); } else if (os_mul_overflow(boff, 2, &boff)) { panic_plain("\"%s\": boff counter " "overflows\n", skm->skm_name); /* NOTREACHED */ __builtin_unreachable(); } /* add this value (in msec) to the total (in usec) */ if (os_add_overflow(boff_total, (boff * NSEC_PER_USEC), &boff_total)) { panic_plain("\"%s\": boff_total counter " "overflows\n", skm->skm_name); /* NOTREACHED */ __builtin_unreachable(); } } /* * In the event of a race between multiple threads trying * to create the last remaining (or the only) slab, let the * loser(s) attempt to retry after waiting a bit. The winner * would have inserted the newly-created slab into the list. */ if (!(flags & SKMEM_NOSLEEP) && boff_total <= SKMEM_SLAB_MAX_BACKOFF) { retry = TRUE; ++retries; flags |= SKMEM_FAILOK; } else { if (!(flags & SKMEM_NOSLEEP)) { panic_plain("\"%s\": failed to allocate " "slab (sleeping mode) after %llu " "msec, %u retries\n\n%s", skm->skm_name, (boff_total / NSEC_PER_USEC), retries, skmem_dump(skm->skm_region)); /* NOTREACHED */ __builtin_unreachable(); } retry = FALSE; } /* * Create a new slab. */ if ((sl = skmem_slab_create(skm, flags)) == NULL) { if (retry) { SK_ERR("\"%s\": failed to allocate " "slab (%ssleeping mode): waiting for %llu " "msec, total %llu msec, %u retries", skm->skm_name, (flags & SKMEM_NOSLEEP) ? "non-" : "", boff, (boff_total / NSEC_PER_USEC), retries); VERIFY(boff > 0 && ((uint32_t)boff <= (SKMEM_SLAB_BACKOFF_THRES * 2))); delay((uint32_t)boff * NSEC_PER_USEC); SKM_SLAB_LOCK(skm); goto again; } else { SK_RDERR(4, "\"%s\": failed to allocate slab " "(%ssleeping mode)", skm->skm_name, (flags & SKMEM_NOSLEEP) ? "non-" : ""); SKM_SLAB_LOCK(skm); } return ENOMEM; } SKM_SLAB_LOCK(skm); skm->skm_sl_create++; if ((skm->skm_sl_bufinuse += sl->sl_chunks) > skm->skm_sl_bufmax) { skm->skm_sl_bufmax = skm->skm_sl_bufinuse; } } skm->skm_sl_alloc++; new_slab = (sl->sl_refcnt == 0); ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl)); sl->sl_refcnt++; ASSERT(sl->sl_refcnt <= sl->sl_chunks); /* * We either have a new slab, or a partially-allocated one. * Remove a buffer control from the slab, and insert it to * the allocated-address hash chain. */ bc = SLIST_FIRST(&sl->sl_head); ASSERT(bc != NULL); SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link); /* sanity check */ VERIFY(bc->bc_usecnt == 0); /* * Also store the master object's region info for the caller. */ bzero(oi, sizeof(*oi)); #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) region = sl->sl_cache->skm_region; if (region->skr_mode & SKR_MODE_MEMTAG) { /* * If this region is configured to be tagged, we generate a * unique tag for the object address, and return this tagged * address to the caller. vm_memtag_assign_tag generates a * unique tag for the given address and size, and * vm_memtag_set_tag commits the tag to the backing memory * metadata. This tagged address is returned back to the client, * and when the client frees the address, we "re-tag" the * address to prevent against use-after-free attacks (more on * this in skmem_cache_batch_free). 
*/ tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr, skm->skm_objsize); vm_memtag_set_tag(tagged_address, skm->skm_objsize); buf = (void *)tagged_address; } else { buf = bc->bc_addr; } #else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ buf = bc->bc_addr; #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ SKMEM_OBJ_ADDR(oi) = buf; SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */ ASSERT(skm->skm_objsize <= UINT32_MAX); SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize; SKMEM_OBJ_IDX_REG(oi) = ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx); SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx; /* * And for slave object. */ if (oim != NULL) { bzero(oim, sizeof(*oim)); if (bc->bc_addrm != NULL) { SKMEM_OBJ_ADDR(oim) = bc->bc_addrm; SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi); SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi); SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi); } } if (skm->skm_mode & SKM_MODE_BATCH) { ((struct skmem_obj *)buf)->mo_next = NULL; } /* insert to allocated-address hash chain */ bcb = SKMEM_CACHE_HASH(skm, buf); SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link); if (SLIST_EMPTY(&sl->sl_head)) { /* * If that was the last buffer control from this slab, * insert the slab into the empty list. If it was in * the partially-allocated list, then remove the slab * from there as well. */ ASSERT(sl->sl_refcnt == sl->sl_chunks); if (new_slab) { ASSERT(sl->sl_chunks == 1); } else { ASSERT(sl->sl_chunks > 1); ASSERT(skm->skm_sl_partial > 0); skm->skm_sl_partial--; TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link); } skm->skm_sl_empty++; ASSERT(skm->skm_sl_empty != 0); TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link); } else { /* * The slab is not empty; if it was newly allocated * above, then it's not in the partially-allocated * list and so we insert it there. */ ASSERT(SKMEM_SLAB_IS_PARTIAL(sl)); if (new_slab) { skm->skm_sl_partial++; ASSERT(skm->skm_sl_partial != 0); TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list, sl, sl_link); } } /* if auditing is enabled, record this transaction */ if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) { skmem_audit_bufctl(bc); } return 0; } /* * Allocate a raw object from the (locked) slab layer. Pseudo region variant. */ static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm, struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag) { zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK; struct skmem_region *skr = skm->skm_region; void *obj, *buf; /* this flag is not for the caller to set */ VERIFY(!(skmflag & SKMEM_FAILOK)); SKM_SLAB_LOCK_ASSERT_HELD(skm); ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL); /* mirrored region is not applicable */ ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED)); /* batching is not yet supported */ ASSERT(!(skm->skm_mode & SKM_MODE_BATCH)); if ((obj = zalloc_flags(skr->skr_zreg, zflags | Z_ZERO)) == NULL) { os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed); return ENOMEM; } #if KASAN /* * Perform some fix-ups since the zone element isn't guaranteed * to be on the aligned boundary. The effective object size * has been adjusted accordingly by skmem_region_create() earlier * at cache creation time. * * 'buf' is get the aligned address for this object. */ buf = (void *)P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t), skm->skm_bufalign); /* * Wind back a pointer size from the aligned address and * save the original address so we can free it later. 
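         *
         * Layout sketch (illustrative; "obj" is the raw zone element):
         *
         *      obj                       buf = P2ROUNDUP(obj + 8, bufalign)
         *      |                         |
         *      v                         v
         *      +---------------+---------+----------------------------+
         *      |    padding    |  *pbuf  |  aligned object (bufsize)  |
         *      +---------------+---------+----------------------------+
         *                      ^
         *                      pbuf = buf - sizeof(void *), stores obj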
*/ void **pbuf = (void **)((intptr_t)buf - sizeof(void *)); *pbuf = obj; VERIFY(((intptr_t)buf + skm->skm_bufsize) <= ((intptr_t)obj + skm->skm_objsize)); #else /* !KASAN */ /* * We expect that the zone allocator would allocate elements * rounded up to the requested alignment based on the effective * object size computed in skmem_region_create() earlier, and * 'buf' is therefore the element address itself. */ buf = obj; #endif /* !KASAN */ /* make sure the object is aligned */ VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign)); /* * Return the object's info to the caller. */ bzero(oi, sizeof(*oi)); SKMEM_OBJ_ADDR(oi) = buf; ASSERT(skm->skm_objsize <= UINT32_MAX); SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize; if (oim != NULL) { bzero(oim, sizeof(*oim)); } skm->skm_sl_alloc++; skm->skm_sl_bufinuse++; if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) { skm->skm_sl_bufmax = skm->skm_sl_bufinuse; } return 0; } /* * Allocate a raw object from the slab layer. */ static int skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag) { int err; SKM_SLAB_LOCK(skm); err = skm->skm_slab_alloc(skm, oi, oim, skmflag); SKM_SLAB_UNLOCK(skm); return err; } /* * Allocate raw object(s) from the slab layer. */ static uint32_t skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list, uint32_t num, uint32_t skmflag) { uint32_t need = num; ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH)); *list = NULL; SKM_SLAB_LOCK(skm); for (;;) { struct skmem_obj_info oi, oim; /* * Get a single raw object from the slab layer. */ if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) { break; } *list = SKMEM_OBJ_ADDR(&oi); ASSERT((*list)->mo_next == NULL); /* store these inside the object itself */ (*list)->mo_info = oi; (*list)->mo_minfo = oim; list = &(*list)->mo_next; ASSERT(need != 0); if (--need == 0) { break; } } SKM_SLAB_UNLOCK(skm); return num - need; } /* * Free a raw object to the (locked) slab layer. Normal region variant. */ static void skmem_slab_free_locked(struct skmem_cache *skm, void *buf) { struct skmem_bufctl *bc, *tbc; struct skmem_bufctl_bkt *bcb; struct skmem_slab *sl = NULL; #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) struct skmem_region *region; vm_offset_t tagged_addr; /* * If buf is tagged, then addr would have the canonicalized address. * If buf is untagged, then addr is same as buf. */ void *addr = (void *)vm_memtag_canonicalize_address((vm_offset_t)buf); #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ SKM_SLAB_LOCK_ASSERT_HELD(skm); ASSERT(buf != NULL); /* caller is expected to clear mo_next */ ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) || ((struct skmem_obj *)buf)->mo_next == NULL); /* * Search the hash chain to find a matching buffer control for the * given object address. If found, remove the buffer control from * the hash chain and insert it into the freelist. Otherwise, we * panic since the caller has given us a bogus address. */ skm->skm_sl_free++; bcb = SKMEM_CACHE_HASH(skm, buf); #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) /* * If this region is configured to tag memory addresses, then buf is a * tagged address. When we search for the buffer control from the hash * table, we need to use the untagged address, because buffer control * maintains untagged address (bc_addr). vm_memtag_canonicalize_address * returns the untagged address. 
*/ SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) { if (bc->bc_addr == addr) { SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link); sl = bc->bc_slab; break; } } #else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) { if (bc->bc_addr == buf) { SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link); sl = bc->bc_slab; break; } } #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ if (bc == NULL) { panic("%s: attempt to free invalid or already-freed obj %p " "on skm %p", __func__, buf, skm); /* NOTREACHED */ __builtin_unreachable(); } ASSERT(sl != NULL && sl->sl_cache == skm); #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) /* * We use untagged address here, because SKMEM_SLAB_MEMBER compares the * address against sl_base, which is untagged. */ VERIFY(SKMEM_SLAB_MEMBER(sl, addr)); #else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ VERIFY(SKMEM_SLAB_MEMBER(sl, buf)); #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ /* make sure this object is not currently in use by another object */ VERIFY(bc->bc_usecnt == 0); /* if auditing is enabled, record this transaction */ if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) { skmem_audit_bufctl(bc); } /* if clear on free is requested, zero out the object */ if (skm->skm_mode & SKM_MODE_CLEARONFREE) { bzero(buf, skm->skm_objsize); } #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) /* * If this region is configured to tag memory addresses, we re-tag this * address as the object is freed. We do the re-tagging in the magazine * layer too, but in case we need to free raw objects to the slab layer * (either becasue SKM_MODE_NOMAGAZINES is set, or the magazine layer * was not able to allocate empty magazines), we re-tag the addresses * here in the slab layer. Freeing to the slab layer is symmetrical to * allocating from the slab layer - when we allocate from slab layer, we * tag the address, and then construct the object; when we free to the * slab layer, we destruct the object, and retag the address. * We do the re-tagging here, because this is right after the last usage * of the buf variable (which is tagged). */ region = skm->skm_region; if (region->skr_mode & SKR_MODE_MEMTAG) { tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf, skm->skm_objsize); vm_memtag_set_tag(tagged_addr, skm->skm_objsize); } #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ /* insert the buffer control to the slab's freelist */ SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link); ASSERT(sl->sl_refcnt >= 1); if (--sl->sl_refcnt == 0) { /* * If this was the last outstanding object for the slab, * remove the slab from the partially-allocated or empty * list, and destroy the slab (segment) back to the region. */ if (sl->sl_chunks == 1) { ASSERT(skm->skm_sl_empty > 0); skm->skm_sl_empty--; TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link); } else { ASSERT(skm->skm_sl_partial > 0); skm->skm_sl_partial--; TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link); } ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0); skm->skm_sl_bufinuse -= sl->sl_chunks; skm->skm_sl_destroy++; SKM_SLAB_UNLOCK(skm); skmem_slab_destroy(skm, sl); SKM_SLAB_LOCK(skm); return; } ASSERT(bc == SLIST_FIRST(&sl->sl_head)); if (SLIST_NEXT(bc, bc_link) == NULL) { /* * If this is the first (potentially amongst many) object * that's returned to the slab, remove the slab from the * empty list and insert to end of the partially-allocated * list. 
This should help avoid thrashing the partial slab * since we avoid disturbing what's already at the front. */ ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1)); ASSERT(sl->sl_chunks > 1); ASSERT(skm->skm_sl_empty > 0); skm->skm_sl_empty--; TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link); skm->skm_sl_partial++; ASSERT(skm->skm_sl_partial != 0); TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link); } } /* * Free a raw object to the (locked) slab layer. Pseudo region variant. */ static void skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf) { struct skmem_region *skr = skm->skm_region; void *obj = buf; ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL); SKM_SLAB_LOCK_ASSERT_HELD(skm); VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign)); #if KASAN /* * Since we stuffed the original zone element address before * the buffer address in KASAN mode, get it back since we're * about to free it. */ void **pbuf = (void **)((intptr_t)obj - sizeof(void *)); VERIFY(((intptr_t)obj + skm->skm_bufsize) <= ((intptr_t)*pbuf + skm->skm_objsize)); obj = *pbuf; #endif /* KASAN */ /* free it to zone */ zfree(skr->skr_zreg, obj); skm->skm_sl_free++; ASSERT(skm->skm_sl_bufinuse > 0); skm->skm_sl_bufinuse--; } /* * Free a raw object to the slab layer. */ static void skmem_slab_free(struct skmem_cache *skm, void *buf) { if (skm->skm_mode & SKM_MODE_BATCH) { ((struct skmem_obj *)buf)->mo_next = NULL; } SKM_SLAB_LOCK(skm); skm->skm_slab_free(skm, buf); SKM_SLAB_UNLOCK(skm); } /* * Free raw object(s) to the slab layer. */ static void skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list) { struct skmem_obj *listn; ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH)); SKM_SLAB_LOCK(skm); for (;;) { listn = list->mo_next; list->mo_next = NULL; /* * Free a single object to the slab layer. */ skm->skm_slab_free(skm, (void *)list); /* if no more objects to free, we're done */ if ((list = listn) == NULL) { break; } } SKM_SLAB_UNLOCK(skm); } /* * Return the object's region info. */ void skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf, struct skmem_obj_info *oi, struct skmem_obj_info *oim) { struct skmem_bufctl_bkt *bcb; struct skmem_bufctl *bc; struct skmem_slab *sl; /* * Search the hash chain to find a matching buffer control for the * given object address. If not found, panic since the caller has * given us a bogus address. */ SKM_SLAB_LOCK(skm); bcb = SKMEM_CACHE_HASH(skm, buf); SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) { if (bc->bc_addr == buf) { break; } } if (__improbable(bc == NULL)) { panic("%s: %s failed to get object info for %p", __func__, skm->skm_name, buf); /* NOTREACHED */ __builtin_unreachable(); } /* * Return the master object's info to the caller. */ sl = bc->bc_slab; SKMEM_OBJ_ADDR(oi) = bc->bc_addr; SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */ ASSERT(skm->skm_objsize <= UINT32_MAX); SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize; SKMEM_OBJ_IDX_REG(oi) = (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx; SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx; /* * And for slave object. */ if (oim != NULL) { bzero(oim, sizeof(*oim)); if (bc->bc_addrm != NULL) { SKMEM_OBJ_ADDR(oim) = bc->bc_addrm; SKMEM_OBJ_SIZE(oim) = oi->oi_size; SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg; SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg; } } SKM_SLAB_UNLOCK(skm); } /* * Magazine constructor. 
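 *
 * As a rough size illustration (derived from the magazine type table
 * comment above): a magazine holding N rounds is effectively an array of
 * N object pointers plus two housekeeping pointers (the skmem_mag linkage
 * and the magazine type), so on LP64 SKMEM_MAG_SIZE(14) comes to roughly
 * (14 + 2) * 8 = 128 bytes, i.e. a whole number of CPU cache lines.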
*/ static int skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim, void *arg, uint32_t skmflag) { #pragma unused(oim, skmflag) struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi); ASSERT(oim == NULL); ASSERT(arg != NULL); /* * Store it in the magazine object since we'll * need to refer to it during magazine destroy; * we can't safely refer to skm_magtype as the * depot lock may not be acquired then. */ mg->mg_magtype = arg; return 0; } /* * Destroy a magazine (free each object to the slab layer). */ static void skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg, int nrounds) { int round; for (round = 0; round < nrounds; round++) { void *buf = mg->mg_round[round]; struct skmem_obj *next; if (skm->skm_mode & SKM_MODE_BATCH) { next = ((struct skmem_obj *)buf)->mo_next; ((struct skmem_obj *)buf)->mo_next = NULL; } /* deconstruct the object */ if (skm->skm_dtor != NULL) { skm->skm_dtor(buf, skm->skm_private); } /* * In non-batching mode, each object in the magazine has * no linkage to its neighbor, so free individual object * to the slab layer now. */ if (!(skm->skm_mode & SKM_MODE_BATCH)) { skmem_slab_free(skm, buf); } else { ((struct skmem_obj *)buf)->mo_next = next; } } /* * In batching mode, each object is linked to its neighbor at free * time, and so take the bottom-most object and free it to the slab * layer. Because of the way the list is reversed during free, this * will bring along the rest of objects above it. */ if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) { skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]); } /* free the magazine itself back to cache */ skmem_cache_free(mg->mg_magtype->mt_cache, mg); } /* * Get one or more magazines from the depot. */ static uint32_t skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml, uint32_t *count, struct skmem_mag **list, uint32_t num) { SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list); struct skmem_mag *mg; uint32_t need = num, c = 0; ASSERT(list != NULL && need > 0); if (!SKM_DEPOT_LOCK_TRY(skm)) { /* * Track the amount of lock contention here; if the contention * level is high (more than skmem_cache_depot_contention per a * given skmem_cache_update_interval interval), then we treat * it as a sign that the per-CPU layer is not using the right * magazine type, and that we'd need to resize it. */ SKM_DEPOT_LOCK(skm); if (skm->skm_mode & SKM_MODE_DYNAMIC) { skm->skm_depot_contention++; } } while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) { SLIST_REMOVE_HEAD(&ml->ml_list, mg_link); SLIST_INSERT_HEAD(&mg_list, mg, mg_link); ASSERT(ml->ml_total != 0); if (--ml->ml_total < ml->ml_min) { ml->ml_min = ml->ml_total; } c++; ml->ml_alloc++; if (--need == 0) { break; } } *count -= c; SKM_DEPOT_UNLOCK(skm); *list = SLIST_FIRST(&mg_list); return num - need; } /* * Return one or more magazines to the depot. */ static void skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml, uint32_t *count, struct skmem_mag *mg) { struct skmem_mag *nmg; uint32_t c = 0; SKM_DEPOT_LOCK(skm); while (mg != NULL) { nmg = SLIST_NEXT(mg, mg_link); SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link); ml->ml_total++; c++; mg = nmg; } *count += c; SKM_DEPOT_UNLOCK(skm); } /* * Update the depot's working state statistics. 
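 *
 * In brief: ml_min tracks the smallest magazine count observed on a depot
 * list since the previous update, and ml_reaplimit remembers the prior
 * interval's minimum.  For example, if the full list never dropped below
 * 3 magazines during the previous interval and never below 5 during the
 * current one, then MIN(3, 5) = 3 magazines are deemed outside the
 * working set and become eligible for reaping.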
*/ static void skmem_depot_ws_update(struct skmem_cache *skm) { SKM_DEPOT_LOCK_SPIN(skm); skm->skm_full.ml_reaplimit = skm->skm_full.ml_min; skm->skm_full.ml_min = skm->skm_full.ml_total; skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min; skm->skm_empty.ml_min = skm->skm_empty.ml_total; SKM_DEPOT_UNLOCK(skm); } /* * Empty the depot's working state statistics (everything's reapable.) */ static void skmem_depot_ws_zero(struct skmem_cache *skm) { SKM_DEPOT_LOCK_SPIN(skm); if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total || skm->skm_full.ml_min != skm->skm_full.ml_total || skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total || skm->skm_empty.ml_min != skm->skm_empty.ml_total) { skm->skm_full.ml_reaplimit = skm->skm_full.ml_total; skm->skm_full.ml_min = skm->skm_full.ml_total; skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total; skm->skm_empty.ml_min = skm->skm_empty.ml_total; skm->skm_depot_ws_zero++; } SKM_DEPOT_UNLOCK(skm); } /* * Reap magazines that's outside of the working set. */ static void skmem_depot_ws_reap(struct skmem_cache *skm) { struct skmem_mag *mg, *nmg; uint32_t f, e, reap; reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min); if (reap != 0) { (void) skmem_depot_batch_alloc(skm, &skm->skm_full, &skm->skm_depot_full, &mg, reap); while (mg != NULL) { nmg = SLIST_NEXT(mg, mg_link); SLIST_NEXT(mg, mg_link) = NULL; skmem_magazine_destroy(skm, mg, mg->mg_magtype->mt_magsize); mg = nmg; } } reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min); if (reap != 0) { (void) skmem_depot_batch_alloc(skm, &skm->skm_empty, &skm->skm_depot_empty, &mg, reap); while (mg != NULL) { nmg = SLIST_NEXT(mg, mg_link); SLIST_NEXT(mg, mg_link) = NULL; skmem_magazine_destroy(skm, mg, 0); mg = nmg; } } if (f != 0 || e != 0) { os_atomic_inc(&skm->skm_cpu_mag_reap, relaxed); } } /* * Performs periodic maintenance on a cache. This is serialized * through the update thread call, and so we guarantee there's at * most one update episode in the system at any given time. */ static void skmem_cache_update(struct skmem_cache *skm, uint32_t arg) { #pragma unused(arg) boolean_t resize_mag = FALSE; boolean_t rescale_hash = FALSE; SKMEM_CACHE_LOCK_ASSERT_HELD(); /* insist that we are executing in the update thread call context */ ASSERT(sk_is_cache_update_protected()); /* * If the cache has become much larger or smaller than the * allocated-address hash table, rescale the hash table. */ SKM_SLAB_LOCK(skm); if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) && (skm->skm_hash_mask + 1) < skm->skm_hash_limit) || (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) && skm->skm_hash_mask > skm->skm_hash_initial)) { rescale_hash = TRUE; } SKM_SLAB_UNLOCK(skm); /* * Update the working set. */ skmem_depot_ws_update(skm); /* * If the contention count is greater than the threshold during * the update interval, and if we are not already at the maximum * magazine size, increase it. */ SKM_DEPOT_LOCK_SPIN(skm); if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf && (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) > skmem_cache_depot_contention) { ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC); resize_mag = TRUE; } skm->skm_depot_contention_prev = skm->skm_depot_contention; SKM_DEPOT_UNLOCK(skm); if (rescale_hash) { skmem_cache_hash_rescale(skm); } if (resize_mag) { skmem_cache_magazine_resize(skm); } } /* * Reload the CPU's magazines with mg and its follower (if any). 
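 *
 * Rough sketch of the effect: the per-CPU layer keeps two magazines,
 * cp_loaded and its standby cp_ploaded.  After pulling two full magazines
 * from the depot, the caller does:
 *
 *      skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
 *
 * which installs mg as cp_loaded (cp_rounds = cp_magsize) and mg's
 * follower, if any, as cp_ploaded (cp_prounds = cp_magsize).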
 */
static void
skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
    int rounds)
{
    ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
        (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
    ASSERT(cp->cp_magsize > 0);

    cp->cp_loaded = mg;
    cp->cp_rounds = rounds;
    if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
        cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
        cp->cp_prounds = rounds;
        SLIST_NEXT(mg, mg_link) = NULL;
    } else {
        ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
        cp->cp_ploaded = NULL;
        cp->cp_prounds = -1;
    }
}

/*
 * Reload the CPU's magazine with mg and save the previous one.
 */
static void
skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
{
    ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
        (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
    ASSERT(cp->cp_magsize > 0);

    cp->cp_ploaded = cp->cp_loaded;
    cp->cp_prounds = cp->cp_rounds;
    cp->cp_loaded = mg;
    cp->cp_rounds = rounds;
}

/*
 * Allocate a constructed object from the cache.
 */
void *
skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
{
    struct skmem_obj *buf;

    (void) skmem_cache_batch_alloc(skm, &buf, 1, skmflag);
    return buf;
}

/*
 * Allocate constructed object(s) from the cache.
 */
uint32_t
skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    uint32_t num, uint32_t skmflag)
{
    struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
    struct skmem_obj **top = &(*list);
    struct skmem_mag *mg;
    uint32_t need = num;

    ASSERT(list != NULL);
    *list = NULL;

    if (need == 0) {
        return 0;
    }
    ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));

    SKM_CPU_LOCK(cp);
    for (;;) {
        /*
         * If we have an object in the current CPU's loaded
         * magazine, return it and we're done.
         */
        if (cp->cp_rounds > 0) {
            int objs = MIN((unsigned int)cp->cp_rounds, need);
            /*
             * In the SKM_MODE_BATCH case, objects in the magazine
             * are already linked together with the most recently
             * freed object at the head of the list; grab as many
             * objects as we can.  Otherwise we'll just grab 1
             * object at most.
             */
            *list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
            cp->cp_rounds -= objs;
            cp->cp_alloc += objs;

            if (skm->skm_mode & SKM_MODE_BATCH) {
                struct skmem_obj *tail =
                    cp->cp_loaded->mg_round[cp->cp_rounds];
                list = &tail->mo_next;
                *list = NULL;
            }

            /* if we got them all, return to caller */
            if ((need -= objs) == 0) {
                SKM_CPU_UNLOCK(cp);
                goto done;
            }
        }

        /*
         * The CPU's loaded magazine is empty.  If the previously
         * loaded magazine was full, exchange and try again.
         */
        if (cp->cp_prounds > 0) {
            skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
            continue;
        }

        /*
         * If the magazine layer is disabled, allocate from slab.
         * This can happen either because SKM_MODE_NOMAGAZINES is
         * set, or because we are resizing the magazine now.
         */
        if (cp->cp_magsize == 0) {
            break;
        }

        /*
         * Both of the CPU's magazines are empty; try to get
         * full magazine(s) from the depot layer.  Upon success,
         * reload and try again.  To prevent potential thrashing,
         * replace both empty magazines only if the requested
         * count exceeds a magazine's worth of objects.
         */
        (void) skmem_depot_batch_alloc(skm, &skm->skm_full,
            &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
        if (mg != NULL) {
            SLIST_HEAD(, skmem_mag) mg_list =
                SLIST_HEAD_INITIALIZER(mg_list);

            if (cp->cp_ploaded != NULL) {
                SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
                    mg_link);
            }
            if (SLIST_NEXT(mg, mg_link) == NULL) {
                /*
                 * Depot allocation returns only 1 magazine;
                 * retain current empty magazine.
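                 * The loaded (empty) magazine is pushed down to the
                 * previously-loaded slot by skmem_cpu_reload() below,
                 * where it can absorb subsequent frees without another
                 * trip to the depot.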
*/ skmem_cpu_reload(cp, mg, cp->cp_magsize); } else { /* * We got 2 full magazines from depot; * release the current empty magazine * back to the depot layer. */ if (cp->cp_loaded != NULL) { SLIST_INSERT_HEAD(&mg_list, cp->cp_loaded, mg_link); } skmem_cpu_batch_reload(cp, mg, cp->cp_magsize); } skmem_depot_batch_free(skm, &skm->skm_empty, &skm->skm_depot_empty, SLIST_FIRST(&mg_list)); continue; } /* * The depot layer doesn't have any full magazines; * allocate directly from the slab layer. */ break; } SKM_CPU_UNLOCK(cp); if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) { struct skmem_obj *rtop, *rlist, *rlistp = NULL; uint32_t rlistc, c = 0; /* * Get a list of raw objects from the slab layer. */ rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag); ASSERT(rlistc == 0 || rlist != NULL); rtop = rlist; /* * Construct each object in the raw list. Upon failure, * free any remaining objects in the list back to the slab * layer, and keep the ones that were successfully constructed. * Here, "oi" and "oim" in each skmem_obj refer to the objects * coming from the master and slave regions (on mirrored * regions), respectively. They are stored inside the object * temporarily so that we can pass them to the constructor. */ while (skm->skm_ctor != NULL && rlist != NULL) { struct skmem_obj_info *oi = &rlist->mo_info; struct skmem_obj_info *oim = &rlist->mo_minfo; struct skmem_obj *rlistn = rlist->mo_next; /* * Note that the constructor guarantees at least * the size of a pointer at the top of the object * and no more than that. That means we must not * refer to "oi" and "oim" any longer after the * object goes thru the constructor. */ if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ? oim : NULL), skm->skm_private, skmflag) != 0) { VERIFY(rlist->mo_next == rlistn); os_atomic_add(&skm->skm_sl_alloc_fail, rlistc - c, relaxed); if (rlistp != NULL) { rlistp->mo_next = NULL; } if (rlist == rtop) { rtop = NULL; ASSERT(c == 0); } skmem_slab_batch_free(skm, rlist); rlist = NULL; rlistc = c; break; } VERIFY(rlist->mo_next == rlistn); ++c; /* # of constructed objs */ rlistp = rlist; if ((rlist = rlist->mo_next) == NULL) { ASSERT(rlistc == c); break; } } /* * At this point "top" points to the head of the chain we're * going to return to caller; "list" points to the tail of that * chain. The second chain begins at "rtop", and we append * that after "list" to form a single chain. "rlistc" is the * number of objects in "rtop" originated from the slab layer * that have been successfully constructed (if applicable). */ ASSERT(c == 0 || rtop != NULL); need -= rlistc; *list = rtop; } else { struct skmem_obj_info oi, oim; void *buf; ASSERT(*top == NULL && num == 1 && need == 1); /* * Get a single raw object from the slab layer. */ if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) { goto done; } buf = SKMEM_OBJ_ADDR(&oi); ASSERT(buf != NULL); /* * Construct the raw object. Here, "oi" and "oim" refer to * the objects coming from the master and slave regions (on * mirrored regions), respectively. */ if (skm->skm_ctor != NULL && skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ? 
&oim : NULL), skm->skm_private, skmflag) != 0) { os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed); skmem_slab_free(skm, buf); goto done; } need = 0; *list = buf; ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) || (*list)->mo_next == NULL); } done: /* if auditing is enabled, record this transaction */ if (__improbable(*top != NULL && (skm->skm_mode & SKM_MODE_AUDIT) != 0)) { skmem_audit_buf(skm, *top); } return num - need; } /* * Free a constructed object to the cache. */ void skmem_cache_free(struct skmem_cache *skm, void *buf) { if (skm->skm_mode & SKM_MODE_BATCH) { ((struct skmem_obj *)buf)->mo_next = NULL; } skmem_cache_batch_free(skm, (struct skmem_obj *)buf); } void skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list) { struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm); struct skmem_magtype *mtp; struct skmem_mag *mg; struct skmem_obj *listn; #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) vm_offset_t tagged_address; /* address tagging */ struct skmem_region *region; /* region source for this cache */ #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ /* if auditing is enabled, record this transaction */ if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) { skmem_audit_buf(skm, list); } SKM_CPU_LOCK(cp); for (;;) { /* * If there's an available space in the current CPU's * loaded magazine, place it there and we're done. */ if ((unsigned int)cp->cp_rounds < (unsigned int)cp->cp_magsize) { /* * In the SKM_MODE_BATCH case, reverse the list * while we place each object into the magazine; * this effectively causes the most recently * freed object to be reused during allocation. */ if (skm->skm_mode & SKM_MODE_BATCH) { listn = list->mo_next; list->mo_next = (cp->cp_rounds == 0) ? NULL : cp->cp_loaded->mg_round[cp->cp_rounds - 1]; } else { listn = NULL; } #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) /* * If this region is configured to be tagged, we re-tag * the address that's being freed, to protect against * use-after-free bugs. This "re-tagged" address will * reside in the CPU's loaded magazine, and when cache * alloc is called, it is returned to client as is. At * this point, we know that this object will be freed to * the CPU's loaded magazine and not down to the slab * layer, so we won't be double tagging the same address * in the magazine layer and slab layer. */ region = skm->skm_region; if (region->skr_mode & SKR_MODE_MEMTAG) { tagged_address = vm_memtag_assign_tag( (vm_offset_t)list, skm->skm_objsize); vm_memtag_set_tag(tagged_address, skm->skm_objsize); cp->cp_loaded->mg_round[cp->cp_rounds++] = (void *)tagged_address; } else { cp->cp_loaded->mg_round[cp->cp_rounds++] = list; } #else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ cp->cp_loaded->mg_round[cp->cp_rounds++] = list; #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */ cp->cp_free++; if ((list = listn) != NULL) { continue; } SKM_CPU_UNLOCK(cp); return; } /* * The loaded magazine is full. If the previously * loaded magazine was empty, exchange and try again. */ if (cp->cp_prounds == 0) { skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds); continue; } /* * If the magazine layer is disabled, free to slab. * This can happen either because SKM_MODE_NOMAGAZINES * is set, or because we are resizing the magazine now. */ if (cp->cp_magsize == 0) { break; } /* * Both magazines for the CPU are full; try to get * empty magazine(s) from the depot. If we get one, * exchange a full magazine with it and place the * object in there. 
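		 * This mirrors the allocation path above, with the roles
		 * of the full and empty depot lists swapped.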
* * TODO: Because the caller currently doesn't indicate * the number of objects in the list, we choose the more * conservative approach of allocating only 1 empty * magazine (to prevent potential thrashing). Once we * have the object count, we can replace 1 with similar * logic as used in skmem_cache_batch_alloc(). */ (void) skmem_depot_batch_alloc(skm, &skm->skm_empty, &skm->skm_depot_empty, &mg, 1); if (mg != NULL) { SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list); if (cp->cp_ploaded != NULL) { SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded, mg_link); } if (SLIST_NEXT(mg, mg_link) == NULL) { /* * Depot allocation returns only 1 magazine; * retain current full magazine. */ skmem_cpu_reload(cp, mg, 0); } else { /* * We got 2 empty magazines from depot; * release the current full magazine back * to the depot layer. */ if (cp->cp_loaded != NULL) { SLIST_INSERT_HEAD(&mg_list, cp->cp_loaded, mg_link); } skmem_cpu_batch_reload(cp, mg, 0); } skmem_depot_batch_free(skm, &skm->skm_full, &skm->skm_depot_full, SLIST_FIRST(&mg_list)); continue; } /* * We can't get any empty magazine from the depot, and * so we need to allocate one. If the allocation fails, * just fall through, deconstruct and free the object * to the slab layer. */ mtp = skm->skm_magtype; SKM_CPU_UNLOCK(cp); mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP); SKM_CPU_LOCK(cp); if (mg != NULL) { /* * We allocated an empty magazine, but since we * dropped the CPU lock above the magazine size * may have changed. If that's the case free * the magazine and try again. */ if (cp->cp_magsize != mtp->mt_magsize) { SKM_CPU_UNLOCK(cp); skmem_cache_free(mtp->mt_cache, mg); SKM_CPU_LOCK(cp); continue; } /* * We have a magazine with the right size; * add it to the depot and try again. */ ASSERT(SLIST_NEXT(mg, mg_link) == NULL); skmem_depot_batch_free(skm, &skm->skm_empty, &skm->skm_depot_empty, mg); continue; } /* * We can't get an empty magazine, so free to slab. */ break; } SKM_CPU_UNLOCK(cp); /* * We weren't able to free the constructed object(s) to the * magazine layer, so deconstruct them and free to the slab. */ if (__probable((skm->skm_mode & SKM_MODE_BATCH) && list->mo_next != NULL)) { /* whatever is left from original list */ struct skmem_obj *top = list; while (list != NULL && skm->skm_dtor != NULL) { listn = list->mo_next; list->mo_next = NULL; /* deconstruct the object */ if (skm->skm_dtor != NULL) { skm->skm_dtor((void *)list, skm->skm_private); } list->mo_next = listn; list = listn; } skmem_slab_batch_free(skm, top); } else { /* deconstruct the object */ if (skm->skm_dtor != NULL) { skm->skm_dtor((void *)list, skm->skm_private); } skmem_slab_free(skm, (void *)list); } } /* * Return the maximum number of objects cached at the magazine layer * based on the chunk size. This takes into account the starting * magazine type as well as the final magazine type used in resizing. */ uint32_t skmem_cache_magazine_max(uint32_t chunksize) { struct skmem_magtype *mtp; uint32_t magsize_max; VERIFY(ncpu != 0); VERIFY(chunksize > 0); /* find a suitable magazine type for this chunk size */ for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) { continue; } /* and find the last magazine type */ for (;;) { magsize_max = mtp->mt_magsize; if (mtp == skmem_cache_magsize_last || chunksize >= mtp->mt_maxbuf) { break; } ++mtp; VERIFY(mtp <= skmem_cache_magsize_last); } return ncpu * magsize_max * 2; /* two magazines per CPU */ } /* * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug. 
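 * This reflects only the global skmem_debug setting; a cache created with
 * SKM_MODE_NOMAGAZINES keeps its per-CPU magazine layer disabled regardless
 * of what this returns.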
*/ boolean_t skmem_allow_magazines(void) { return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES); } /* * Purge all magazines from a cache and disable its per-CPU magazines layer. */ static void skmem_cache_magazine_purge(struct skmem_cache *skm) { struct skmem_cpu_cache *cp; struct skmem_mag *mg, *pmg; int rounds, prounds; uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0; SKM_SLAB_LOCK_ASSERT_NOTHELD(skm); SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm)); for (cpuid = 0; cpuid < ncpu; cpuid++) { cp = &skm->skm_cpu_cache[cpuid]; SKM_CPU_LOCK_SPIN(cp); mg = cp->cp_loaded; pmg = cp->cp_ploaded; rounds = cp->cp_rounds; prounds = cp->cp_prounds; cp->cp_loaded = NULL; cp->cp_ploaded = NULL; cp->cp_rounds = -1; cp->cp_prounds = -1; cp->cp_magsize = 0; SKM_CPU_UNLOCK(cp); if (mg != NULL) { skmem_magazine_destroy(skm, mg, rounds); ++mg_cnt; } if (pmg != NULL) { skmem_magazine_destroy(skm, pmg, prounds); ++pmg_cnt; } } if (mg_cnt != 0 || pmg_cnt != 0) { os_atomic_inc(&skm->skm_cpu_mag_purge, relaxed); } skmem_depot_ws_zero(skm); skmem_depot_ws_reap(skm); } /* * Enable magazines on a cache. Must only be called on a cache with * its per-CPU magazines layer disabled (e.g. due to purge). */ static void skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg) { #pragma unused(arg) struct skmem_cpu_cache *cp; uint32_t cpuid; if (skm->skm_mode & SKM_MODE_NOMAGAZINES) { return; } for (cpuid = 0; cpuid < ncpu; cpuid++) { cp = &skm->skm_cpu_cache[cpuid]; SKM_CPU_LOCK_SPIN(cp); /* the magazines layer must be disabled at this point */ ASSERT(cp->cp_loaded == NULL); ASSERT(cp->cp_ploaded == NULL); ASSERT(cp->cp_rounds == -1); ASSERT(cp->cp_prounds == -1); ASSERT(cp->cp_magsize == 0); cp->cp_magsize = skm->skm_magtype->mt_magsize; SKM_CPU_UNLOCK(cp); } SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d", SK_KVA(skm), (uint32_t)skm->skm_chunksize, SKMEM_CPU_CACHE(skm)->cp_magsize); } /* * Enter the cache resize perimeter. Upon success, claim exclusivity * on the perimeter and return 0, else EBUSY. Caller may indicate * whether or not they're willing to wait. */ static int skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep) { SKM_RESIZE_LOCK(skm); if (skm->skm_rs_owner == current_thread()) { ASSERT(skm->skm_rs_busy != 0); skm->skm_rs_busy++; goto done; } if (!can_sleep) { if (skm->skm_rs_busy != 0) { SKM_RESIZE_UNLOCK(skm); return EBUSY; } } else { while (skm->skm_rs_busy != 0) { skm->skm_rs_want++; (void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT); SKM_RESIZE_UNLOCK(skm); (void) thread_block(THREAD_CONTINUE_NULL); SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" " "(0x%llx) busy=%u", skm->skm_name, SK_KVA(skm), skm->skm_rs_busy); SKM_RESIZE_LOCK(skm); } } SKM_RESIZE_LOCK_ASSERT_HELD(skm); ASSERT(skm->skm_rs_busy == 0); skm->skm_rs_busy++; skm->skm_rs_owner = current_thread(); done: SKM_RESIZE_UNLOCK(skm); return 0; } /* * Exit the cache resize perimeter and unblock any waiters. */ static void skmem_cache_resize_exit(struct skmem_cache *skm) { uint32_t want; SKM_RESIZE_LOCK(skm); ASSERT(skm->skm_rs_busy != 0); ASSERT(skm->skm_rs_owner == current_thread()); if (--skm->skm_rs_busy == 0) { skm->skm_rs_owner = NULL; /* * We're done; notify anyone that has lost the race. */ if ((want = skm->skm_rs_want) != 0) { skm->skm_rs_want = 0; wakeup((void *)&skm->skm_rs_busy); SKM_RESIZE_UNLOCK(skm); } else { SKM_RESIZE_UNLOCK(skm); } } else { SKM_RESIZE_UNLOCK(skm); } } /* * Recompute a cache's magazine size. 
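 * Resizing steps the cache up to the next larger magazine type and runs
 * under the cache resize perimeter, so it cannot race with a synchronous
 * purge or re-enable request.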
This is an expensive operation * and should not be done frequently; larger magazines provide for a * higher transfer rate with the depot while smaller magazines reduce * the memory consumption. */ static void skmem_cache_magazine_resize(struct skmem_cache *skm) { struct skmem_magtype *mtp = skm->skm_magtype; /* insist that we are executing in the update thread call context */ ASSERT(sk_is_cache_update_protected()); ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES)); /* depot contention only applies to dynamic mode */ ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC); /* * Although we're executing in the context of the update thread * call, we need to protect the per-CPU states during resizing * against other synchronous cache purge/reenable requests that * could take place in parallel. */ if (skm->skm_chunksize < mtp->mt_maxbuf) { (void) skmem_cache_resize_enter(skm, TRUE); skmem_cache_magazine_purge(skm); /* * Upgrade to the next magazine type with larger size. */ SKM_DEPOT_LOCK_SPIN(skm); skm->skm_cpu_mag_resize++; skm->skm_magtype = ++mtp; skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize; skm->skm_depot_contention_prev = skm->skm_depot_contention + INT_MAX; SKM_DEPOT_UNLOCK(skm); skmem_cache_magazine_enable(skm, 0); skmem_cache_resize_exit(skm); } } /* * Rescale the cache's allocated-address hash table. */ static void skmem_cache_hash_rescale(struct skmem_cache *skm) { struct skmem_bufctl_bkt *old_table, *new_table; size_t old_size, new_size; uint32_t i, moved = 0; /* insist that we are executing in the update thread call context */ ASSERT(sk_is_cache_update_protected()); /* * To get small average lookup time (lookup depth near 1.0), the hash * table size should be roughly the same (not necessarily equivalent) * as the cache size. */ new_size = MAX(skm->skm_hash_initial, (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2))); new_size = MIN(skm->skm_hash_limit, new_size); old_size = (skm->skm_hash_mask + 1); if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) { return; } new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size, Z_NOWAIT, skmem_tag_bufctl_hash); if (__improbable(new_table == NULL)) { return; } for (i = 0; i < new_size; i++) { SLIST_INIT(&new_table[i].bcb_head); } SKM_SLAB_LOCK(skm); old_size = (skm->skm_hash_mask + 1); old_table = skm->skm_hash_table; skm->skm_hash_mask = (new_size - 1); skm->skm_hash_table = new_table; skm->skm_sl_rescale++; for (i = 0; i < old_size; i++) { struct skmem_bufctl_bkt *bcb = &old_table[i]; struct skmem_bufctl_bkt *new_bcb; struct skmem_bufctl *bc; while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) { SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link); new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr); /* * Ideally we want to insert tail here, but simple * list doesn't give us that. The fact that we are * essentially reversing the order is not a big deal * here vis-a-vis the new table size. */ SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link); ++moved; } ASSERT(SLIST_EMPTY(&bcb->bcb_head)); } SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm), (uint32_t)old_size, (uint32_t)new_size, moved); SKM_SLAB_UNLOCK(skm); sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table); } /* * Apply a function to operate on all caches. 
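 * The global cache list lock is held across the entire walk, so the applied
 * function must not attempt to reacquire it.  skmem_cache_reap() checks
 * skmem_lock_owner for this reason, so that a reclaim callback running
 * under the walk cannot recursively kick off another reap.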
 */
static void
skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t),
    uint32_t arg)
{
    struct skmem_cache *skm;

    net_update_uptime();

    SKMEM_CACHE_LOCK();
    TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
        func(skm, arg);
    }
    SKMEM_CACHE_UNLOCK();
}

/*
 * Reclaim unused memory from a cache.
 */
static void
skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
{
    /*
     * Inform the owner to free memory if possible; the reclaim
     * policy is left to the owner.  This is just an advisory.
     */
    if (skm->skm_reclaim != NULL) {
        skm->skm_reclaim(skm->skm_private);
    }

    if (lowmem) {
        /*
         * If another thread is in the process of purging or
         * resizing, bail out and let the currently-ongoing
         * purging take its natural course.
         */
        if (skmem_cache_resize_enter(skm, FALSE) == 0) {
            skmem_cache_magazine_purge(skm);
            skmem_cache_magazine_enable(skm, 0);
            skmem_cache_resize_exit(skm);
        }
    } else {
        skmem_depot_ws_reap(skm);
    }
}

/*
 * Thread call callback for reap.
 */
static void
skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy)
    void (*func)(void) = arg;

    ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
    func();
}

/*
 * Start reaping all caches; this is serialized via thread call.
 */
static void
skmem_cache_reap_start(void)
{
    SK_DF(SK_VERB_MEM_CACHE, "now running");
    skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
    skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
        (skmem_cache_update_interval * NSEC_PER_SEC));
}

/*
 * Stop reaping; this would allow another reap request to occur.
 */
static void
skmem_cache_reap_done(void)
{
    volatile uint32_t *flag = &skmem_cache_reaping;

    *flag = 0;
    os_atomic_thread_fence(seq_cst);
}

/*
 * Immediately reap all unused memory of a cache.  If purging,
 * also purge the cached objects at the CPU layer.
 */
void
skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
{
    /* if the SKM_MODE_RECLAIM flag is set for this cache, we purge */
    if (purge || (skm->skm_mode & SKM_MODE_RECLAIM)) {
        /*
         * If another thread is in the process of purging or
         * resizing, bail out and let the currently-ongoing
         * purging take its natural course.
         */
        if (skmem_cache_resize_enter(skm, FALSE) == 0) {
            skmem_cache_magazine_purge(skm);
            skmem_cache_magazine_enable(skm, 0);
            skmem_cache_resize_exit(skm);
        }
    } else {
        skmem_depot_ws_zero(skm);
        skmem_depot_ws_reap(skm);

        /* clean up cp_ploaded magazines from each CPU */
        SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
        struct skmem_cpu_cache *cp;
        struct skmem_mag *pmg;
        int prounds;
        uint32_t cpuid;

        for (cpuid = 0; cpuid < ncpu; cpuid++) {
            cp = &skm->skm_cpu_cache[cpuid];

            SKM_CPU_LOCK_SPIN(cp);
            pmg = cp->cp_ploaded;
            prounds = cp->cp_prounds;

            cp->cp_ploaded = NULL;
            cp->cp_prounds = -1;
            SKM_CPU_UNLOCK(cp);

            if (pmg != NULL) {
                skmem_magazine_destroy(skm, pmg, prounds);
            }
        }
    }
}

/*
 * Request a global reap operation to be dispatched.
 */
void
skmem_cache_reap(void)
{
    /* only one reaping episode is allowed at a time */
    if (skmem_lock_owner == current_thread() ||
        !os_atomic_cmpxchg(&skmem_cache_reaping, 0, 1, acq_rel)) {
        return;
    }

    skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
}

/*
 * Reap internal caches.
 */
void
skmem_reap_caches(boolean_t purge)
{
    skmem_cache_reap_now(skmem_slab_cache, purge);
    skmem_cache_reap_now(skmem_bufctl_cache, purge);

    /* packet buffer pool objects */
    pp_reap_caches(purge);

    /* also handle the region cache(s) */
    skmem_region_reap_caches(purge);
}

/*
 * Thread call callback for update.
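 * Runs under the cache-update protection, applies skmem_cache_update()
 * to every cache, and then re-arms itself so that the update repeats
 * once per skmem_cache_update_interval seconds.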
*/ static void skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg) { #pragma unused(dummy, arg) sk_protect_t protect; protect = sk_cache_update_protect(); skmem_cache_applyall(skmem_cache_update, 0); sk_cache_update_unprotect(protect); skmem_dispatch(skmem_cache_update_tc, NULL, (skmem_cache_update_interval * NSEC_PER_SEC)); } /* * Given a buffer control, record the current transaction. */ __attribute__((noinline, cold, not_tail_called)) static inline void skmem_audit_bufctl(struct skmem_bufctl *bc) { struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc; struct timeval tv; microuptime(&tv); bca->bc_thread = current_thread(); bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000)); bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH); } /* * Given an object, find its buffer control and record the transaction. */ __attribute__((noinline, cold, not_tail_called)) static inline void skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list) { struct skmem_bufctl_bkt *bcb; struct skmem_bufctl *bc; ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO)); SKM_SLAB_LOCK(skm); while (list != NULL) { void *buf = list; bcb = SKMEM_CACHE_HASH(skm, buf); SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) { if (bc->bc_addr == buf) { break; } } if (__improbable(bc == NULL)) { panic("%s: %s failed to get bufctl for %p", __func__, skm->skm_name, buf); /* NOTREACHED */ __builtin_unreachable(); } skmem_audit_bufctl(bc); if (!(skm->skm_mode & SKM_MODE_BATCH)) { break; } list = list->mo_next; } SKM_SLAB_UNLOCK(skm); } static size_t skmem_cache_mib_get_stats(struct skmem_cache *skm, void *out, size_t len) { size_t actual_space = sizeof(struct sk_stats_cache); struct sk_stats_cache *sca = out; int contention; if (out == NULL || len < actual_space) { goto done; } bzero(sca, sizeof(*sca)); (void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s", skm->skm_name); uuid_copy(sca->sca_uuid, skm->skm_uuid); uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid); sca->sca_mode = skm->skm_mode; sca->sca_bufsize = (uint64_t)skm->skm_bufsize; sca->sca_objsize = (uint64_t)skm->skm_objsize; sca->sca_chunksize = (uint64_t)skm->skm_chunksize; sca->sca_slabsize = (uint64_t)skm->skm_slabsize; sca->sca_bufalign = (uint64_t)skm->skm_bufalign; sca->sca_objalign = (uint64_t)skm->skm_objalign; sca->sca_cpu_mag_size = skm->skm_cpu_mag_size; sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize; sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge; sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap; sca->sca_depot_full = skm->skm_depot_full; sca->sca_depot_empty = skm->skm_depot_empty; sca->sca_depot_ws_zero = skm->skm_depot_ws_zero; /* in case of a race this might be a negative value, turn it into 0 */ if ((contention = (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev)) < 0) { contention = 0; } sca->sca_depot_contention_factor = contention; sca->sca_cpu_rounds = 0; sca->sca_cpu_prounds = 0; for (int cpuid = 0; cpuid < ncpu; cpuid++) { struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid]; SKM_CPU_LOCK(ccp); if (ccp->cp_rounds > -1) { sca->sca_cpu_rounds += ccp->cp_rounds; } if (ccp->cp_prounds > -1) { sca->sca_cpu_prounds += ccp->cp_prounds; } SKM_CPU_UNLOCK(ccp); } sca->sca_sl_create = skm->skm_sl_create; sca->sca_sl_destroy = skm->skm_sl_destroy; sca->sca_sl_alloc = skm->skm_sl_alloc; sca->sca_sl_free = skm->skm_sl_free; sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail; sca->sca_sl_partial = skm->skm_sl_partial; sca->sca_sl_empty = skm->skm_sl_empty; sca->sca_sl_bufinuse = 
skm->skm_sl_bufinuse; sca->sca_sl_rescale = skm->skm_sl_rescale; sca->sca_sl_hash_size = (skm->skm_hash_mask + 1); done: return actual_space; } static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2, oidp) struct skmem_cache *skm; size_t actual_space; size_t buffer_space; size_t allocated_space; caddr_t buffer = NULL; caddr_t scan; int error = 0; if (!kauth_cred_issuser(kauth_cred_get())) { return EPERM; } net_update_uptime(); buffer_space = req->oldlen; if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) { if (buffer_space > SK_SYSCTL_ALLOC_MAX) { buffer_space = SK_SYSCTL_ALLOC_MAX; } allocated_space = buffer_space; buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib); if (__improbable(buffer == NULL)) { return ENOBUFS; } } else if (req->oldptr == USER_ADDR_NULL) { buffer_space = 0; } actual_space = 0; scan = buffer; SKMEM_CACHE_LOCK(); TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) { size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space); if (scan != NULL) { if (buffer_space < size) { /* supplied buffer too small, stop copying */ error = ENOMEM; break; } scan += size; buffer_space -= size; } actual_space += size; } SKMEM_CACHE_UNLOCK(); if (actual_space != 0) { int out_error = SYSCTL_OUT(req, buffer, actual_space); if (out_error != 0) { error = out_error; } } if (buffer != NULL) { sk_free_data(buffer, allocated_space); } return error; }
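
/*
 * Example usage of the constructed-object interface above (illustrative
 * sketch only, not compiled).  The cache handle "skm" is assumed to have
 * been created elsewhere via the skmem_cache creation interface, which is
 * not shown in this file; batch operations additionally require that the
 * cache was created with batching enabled (SKM_MODE_BATCH).
 *
 *	void *obj;
 *	struct skmem_obj *list;
 *	uint32_t got;
 *
 *	// Single constructed object; may fail under memory shortage.
 *	obj = skmem_cache_alloc(skm, SKMEM_NOSLEEP);
 *	if (obj != NULL) {
 *		skmem_cache_free(skm, obj);
 *	}
 *
 *	// Up to 8 constructed objects, chained through mo_next.
 *	got = skmem_cache_batch_alloc(skm, &list, 8, SKMEM_NOSLEEP);
 *	if (got != 0) {
 *		skmem_cache_batch_free(skm, list);
 *	}
 */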