/* * Copyright (c) 2016-2022 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* BEGIN CSTYLED */ /* * A region represents a collection of one or more similarly-sized memory * segments, each of which is a contiguous range of integers. A segment * is either allocated or free, and is treated as disjoint from all other * segments. That is, the contiguity applies only at the segment level, * and a region with multiple segments is not contiguous at the region level. * A segment always belongs to the segment freelist, or the allocated-address * hash chain, as described below. * * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region * stays intact even after a defunct. Otherwise, the segments belonging * to the region will be freed at defunct time, and the span covered by * the region will be redirected to zero-filled anonymous memory. * * Memory for a region is always created as pageable and purgeable. It is * the client's responsibility to prepare (wire) it, and optionally insert * it to the IOMMU, at segment construction time. When the segment is * freed, the client is responsible for removing it from IOMMU (if needed), * and complete (unwire) it. * * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory * is immediately wired upon allocation (segment removed from freelist). * It gets unwired when memory is discarded (segment inserted to freelist). * * The chronological life cycle of a segment is as such: * * SKSEG_STATE_DETACHED * SKSEG_STATE_{MAPPED,MAPPED_WIRED} * [segment allocated, useable by client] * ... * [client frees segment] * SKSEG_STATE_{MAPPED,MAPPED_WIRED} * [reclaim] * SKSEG_STATE_DETACHED * * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK); * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to * prevent modifications by the user task. Only user-mappable regions will * be considered for inclusion during skmem_arena_mmap(). * * Every skmem allocator has a region as its slab supplier. Each slab is * exactly a segment. The allocator uses skmem_region_{alloc,free}() to * create and destroy slabs. * * A region may be mirrored by another region; the latter acts as the master * controller for both regions. Mirrored (slave) regions cannot be used * directly by the skmem allocator. 
Region mirroring technique is used for * managing shadow objects {umd,kmd} and {usd,ksd}, where an object in one * region has the same size and lifetime as its shadow counterpart. * * CREATION/DESTRUCTION: * * At creation time, all segments are allocated and are immediately inserted * into the freelist. Allocating a purgeable segment has very little cost, * as it is not backed by physical memory until it is accessed. Immediate * insertion into the freelist causes the mapping to be further torn down. * * At destruction time, the freelist is emptied, and each segment is then * destroyed. The system will assert if it detects there are outstanding * segments not yet returned to the region (not freed by the client.) * * ALLOCATION: * * Allocating involves searching the freelist for a segment; if found, the * segment is removed from the freelist and is inserted into the allocated- * address hash chain. The address of the memory object represented by * the segment is used as the hash key. The use of the allocated-address * hash chain is needed since we return the address of the memory object, * and not the segment itself, to the client. * * DEALLOCATION: * * Freeing a memory object causes the chain to be searched for a matching * segment. The system will assert if a segment cannot be found, since * that indicates that the memory object address is invalid. Once found, * the segment is removed from the allocated-address hash chain, and is * inserted to the freelist. * * Segment allocation and deallocation can be expensive. Because of this, * we expect that most clients will utilize the skmem_cache slab allocator * as the frontend instead. */ /* END CSTYLED */
#include <skywalk/os_skywalk_private.h>

#define _FN_KPRINTF			/* don't redefine kprintf() */
#include <pexpert/pexpert.h>		/* for PE_parse_boot_argn */

static void skmem_region_destroy(struct skmem_region *skr); static void skmem_region_depopulate(struct skmem_region *); static int sksegment_cmp(const struct sksegment *, const struct sksegment *); static struct sksegment *sksegment_create(struct skmem_region *, uint32_t); static void sksegment_destroy(struct skmem_region *, struct sksegment *); static void sksegment_freelist_insert(struct skmem_region *, struct sksegment *, boolean_t); static struct sksegment *sksegment_freelist_remove(struct skmem_region *, struct sksegment *, uint32_t, boolean_t); static struct sksegment *sksegment_freelist_grow(struct skmem_region *); static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *, uint32_t); static void *skmem_region_alloc_common(struct skmem_region *, struct sksegment *); static void *skmem_region_mirror_alloc(struct skmem_region *, struct sksegment *, struct sksegment **); static void skmem_region_applyall(void (*)(struct skmem_region *)); static void skmem_region_update(struct skmem_region *); static void skmem_region_update_func(thread_call_param_t, thread_call_param_t); static inline void skmem_region_retain_locked(struct skmem_region *); static inline boolean_t skmem_region_release_locked(struct skmem_region *); static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS; RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp); RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp); SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region", "Skywalk region statistics"); static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0); static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region"); static
LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp, &skmem_region_lock_attr); /* protected by skmem_region_lock */ static TAILQ_HEAD(, skmem_region) skmem_region_head; static thread_call_t skmem_region_update_tc; #define SKMEM_REGION_UPDATE_INTERVAL 13 /* 13 seconds */ static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL; #define SKMEM_WDT_MAXTIME 30 /* # of secs before watchdog */ #define SKMEM_WDT_PURGE 3 /* retry purge threshold */ #if (DEVELOPMENT || DEBUG) /* Mean Time Between Failures (ms) */ static volatile uint64_t skmem_region_mtbf; static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int, struct sysctl_req *); SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)"); SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval, SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)"); #endif /* (DEVELOPMENT || DEBUG) */ #define SKMEM_REGION_LOCK() \ lck_mtx_lock(&skmem_region_lock) #define SKMEM_REGION_LOCK_ASSERT_HELD() \ LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED) #define SKMEM_REGION_LOCK_ASSERT_NOTHELD() \ LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED) #define SKMEM_REGION_UNLOCK() \ lck_mtx_unlock(&skmem_region_lock) /* * Hash table bounds. Start with the initial value, and rescale up to * the specified limit. Ideally we don't need a limit, but in practice * this helps guard against runaways. These values should be revisited * in future and be adjusted as needed. */ #define SKMEM_REGION_HASH_INITIAL 32 /* initial hash table size */ #define SKMEM_REGION_HASH_LIMIT 4096 /* hash table size limit */ #define SKMEM_REGION_HASH_INDEX(_a, _s, _m) \ (((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m)) #define SKMEM_REGION_HASH(_skr, _addr) \ (&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \ (_skr)->skr_hash_shift, (_skr)->skr_hash_mask)]) static SKMEM_TYPE_DEFINE(skr_zone, struct skmem_region); static unsigned int sg_size; /* size of zone element */ static struct skmem_cache *skmem_sg_cache; /* cache for sksegment */ static uint32_t skmem_seg_size = SKMEM_SEG_SIZE; static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE; static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE; static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE; uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE; #define SKMEM_TAG_SEGMENT_BMAP "com.apple.skywalk.segment.bmap" static SKMEM_TAG_DEFINE(skmem_tag_segment_bmap, SKMEM_TAG_SEGMENT_BMAP); #define SKMEM_TAG_SEGMENT_HASH "com.apple.skywalk.segment.hash" static SKMEM_TAG_DEFINE(skmem_tag_segment_hash, SKMEM_TAG_SEGMENT_HASH); #define SKMEM_TAG_REGION_MIB "com.apple.skywalk.region.mib" static SKMEM_TAG_DEFINE(skmem_tag_region_mib, SKMEM_TAG_REGION_MIB); #define BMAPSZ 64 /* 64-bit mask with range */ #define BMASK64(_beg, _end) \ ((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1)) static int __skmem_region_inited = 0; void skmem_region_init(void) { boolean_t randomize_seg_size; _CASSERT(sizeof(bitmap_t) == sizeof(uint64_t)); _CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3)); _CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0); _CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL); ASSERT(!__skmem_region_inited); /* enforce the ordering here */ _CASSERT(SKMEM_REGION_GUARD_HEAD == 0); _CASSERT(SKMEM_REGION_SCHEMA == 1); _CASSERT(SKMEM_REGION_RING == 2); 
_CASSERT(SKMEM_REGION_BUF_DEF == 3); _CASSERT(SKMEM_REGION_BUF_LARGE == 4); _CASSERT(SKMEM_REGION_RXBUF_DEF == 5); _CASSERT(SKMEM_REGION_RXBUF_LARGE == 6); _CASSERT(SKMEM_REGION_TXBUF_DEF == 7); _CASSERT(SKMEM_REGION_TXBUF_LARGE == 8); _CASSERT(SKMEM_REGION_UMD == 9); _CASSERT(SKMEM_REGION_TXAUSD == 10); _CASSERT(SKMEM_REGION_RXFUSD == 11); _CASSERT(SKMEM_REGION_UBFT == 12); _CASSERT(SKMEM_REGION_USTATS == 13); _CASSERT(SKMEM_REGION_FLOWADV == 14); _CASSERT(SKMEM_REGION_NEXUSADV == 15); _CASSERT(SKMEM_REGION_SYSCTLS == 16); _CASSERT(SKMEM_REGION_GUARD_TAIL == 17); _CASSERT(SKMEM_REGION_KMD == 18); _CASSERT(SKMEM_REGION_RXKMD == 19); _CASSERT(SKMEM_REGION_TXKMD == 20); _CASSERT(SKMEM_REGION_KBFT == 21); _CASSERT(SKMEM_REGION_RXKBFT == 22); _CASSERT(SKMEM_REGION_TXKBFT == 23); _CASSERT(SKMEM_REGION_TXAKSD == 24); _CASSERT(SKMEM_REGION_RXFKSD == 25); _CASSERT(SKMEM_REGION_KSTATS == 26); _CASSERT(SKMEM_REGION_INTRINSIC == 27); _CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD); _CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA); _CASSERT(SREG_RING == SKMEM_REGION_RING); _CASSERT(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF); _CASSERT(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE); _CASSERT(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF); _CASSERT(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE); _CASSERT(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF); _CASSERT(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE); _CASSERT(SREG_UMD == SKMEM_REGION_UMD); _CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD); _CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD); _CASSERT(SREG_UBFT == SKMEM_REGION_UBFT); _CASSERT(SREG_USTATS == SKMEM_REGION_USTATS); _CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV); _CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV); _CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS); _CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL); _CASSERT(SREG_KMD == SKMEM_REGION_KMD); _CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD); _CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD); _CASSERT(SREG_KBFT == SKMEM_REGION_KBFT); _CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT); _CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT); _CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD); _CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD); _CASSERT(SREG_KSTATS == SKMEM_REGION_KSTATS); _CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT); _CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK); _CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY); _CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY); _CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT); _CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC); _CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES); _CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE); _CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN); _CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT); _CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD); _CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG); _CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK); _CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA); _CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO); _CASSERT(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE); _CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB); _CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED); (void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size, sizeof(skmem_seg_size)); if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) { skmem_seg_size = SKMEM_MIN_SEG_SIZE; } skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size, SKMEM_MIN_SEG_SIZE); VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0); (void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size, sizeof(skmem_md_seg_size)); 
if (skmem_md_seg_size < skmem_seg_size) { skmem_md_seg_size = skmem_seg_size; } skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size, SKMEM_MIN_SEG_SIZE); VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0); /* * If set via boot-args, honor it and don't randomize. */ randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size", &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size)); if (skmem_drv_buf_seg_size < skmem_seg_size) { skmem_drv_buf_seg_size = skmem_seg_size; } skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size = (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE); VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0); /* * Randomize the driver buffer segment size; here we choose * a SKMEM_MIN_SEG_SIZE multiplier to bump up the value to. * Set this as the effective driver buffer segment size. */ if (randomize_seg_size) { uint32_t sm; read_frandom(&sm, sizeof(sm)); skmem_drv_buf_seg_eff_size += (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER)); VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0); } VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size); (void) PE_parse_boot_argn("skmem_usr_buf_seg_size", &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size)); if (skmem_usr_buf_seg_size < skmem_seg_size) { skmem_usr_buf_seg_size = skmem_seg_size; } skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size, SKMEM_MIN_SEG_SIZE); VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0); SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], " "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size, skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size, skmem_usr_buf_seg_size); TAILQ_INIT(&skmem_region_head); skmem_region_update_tc = thread_call_allocate_with_options(skmem_region_update_func, NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); if (skmem_region_update_tc == NULL) { panic("%s: thread_call_allocate failed", __func__); /* NOTREACHED */ __builtin_unreachable(); } sg_size = sizeof(struct sksegment); skmem_sg_cache = skmem_cache_create("sg", sg_size, sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0); /* and start the periodic region update machinery */ skmem_dispatch(skmem_region_update_tc, NULL, (skmem_region_update_interval * NSEC_PER_SEC)); __skmem_region_inited = 1; } void skmem_region_fini(void) { if (__skmem_region_inited) { ASSERT(TAILQ_EMPTY(&skmem_region_head)); if (skmem_region_update_tc != NULL) { (void) thread_call_cancel_wait(skmem_region_update_tc); (void) thread_call_free(skmem_region_update_tc); skmem_region_update_tc = NULL; } if (skmem_sg_cache != NULL) { skmem_cache_destroy(skmem_sg_cache); skmem_sg_cache = NULL; } __skmem_region_inited = 0; } } /* * Reap internal caches. */ void skmem_region_reap_caches(boolean_t purge) { skmem_cache_reap_now(skmem_sg_cache, purge); } /* * Configure and compute the parameters of a region. */ void skmem_region_params_config(struct skmem_region_params *srp) { uint32_t cache_line_size = skmem_cpu_cache_line_size(); size_t seglim, segsize, segcnt; size_t objsize, objcnt; ASSERT(srp->srp_id < SKMEM_REGIONS); /* * If magazines layer is disabled system-wide, override * the region parameter here. This will effectively reduce * the number of requested objects computed below. Note that * the region may have already been configured to exclude * magazines in the default skmem_regions[] array. 
*/ if (!skmem_allow_magazines()) { srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES; } objsize = srp->srp_r_obj_size; ASSERT(objsize != 0); objcnt = srp->srp_r_obj_cnt; ASSERT(objcnt != 0); if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) { size_t align = srp->srp_align; VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0); VERIFY(powerof2(align)); objsize = MAX(objsize, sizeof(uint64_t)); #if KASAN /* * When KASAN is enabled, the zone allocator adjusts the * element size to include the redzone regions, in which * case we assume that the elements won't start on the * alignment boundary and thus need to do some fix-ups. * These include increasing the effective object size * which adds at least 16 bytes to the original size. */ objsize += sizeof(uint64_t) + align; #endif /* KASAN */ objsize = P2ROUNDUP(objsize, align); segsize = objsize; srp->srp_r_seg_size = (uint32_t)segsize; segcnt = objcnt; goto done; } else { /* objects are always aligned at CPU cache line size */ srp->srp_align = cache_line_size; } /* * Start with default segment size for the region, and compute the * effective segment size (to nearest SKMEM_MIN_SEG_SIZE). If the * object size is greater, then we adjust the segment size to next * multiple of the effective size larger than the object size. */ if (srp->srp_r_seg_size == 0) { switch (srp->srp_id) { case SKMEM_REGION_UMD: case SKMEM_REGION_KMD: case SKMEM_REGION_RXKMD: case SKMEM_REGION_TXKMD: srp->srp_r_seg_size = skmem_md_seg_size; break; case SKMEM_REGION_BUF_DEF: case SKMEM_REGION_RXBUF_DEF: case SKMEM_REGION_TXBUF_DEF: /* * Use the effective driver buffer segment size, * since it reflects any randomization done at * skmem_region_init() time. */ srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size; break; default: srp->srp_r_seg_size = skmem_seg_size; break; } } else { srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size, SKMEM_MIN_SEG_SIZE); } seglim = srp->srp_r_seg_size; VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0); SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu", srp->srp_name, seglim, objsize, objcnt); /* * Make sure object size is multiple of CPU cache line * size, and that we can evenly divide the segment size. */ if (!((objsize < cache_line_size) && (objsize < seglim) && ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) { objsize = P2ROUNDUP(objsize, cache_line_size); while (objsize < seglim && (seglim % objsize) != 0) { SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu", srp->srp_name, objsize, objsize + cache_line_size); objsize += cache_line_size; } } /* segment must be larger than object */ while (objsize > seglim) { SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name, seglim, seglim + SKMEM_MIN_SEG_SIZE); seglim += SKMEM_MIN_SEG_SIZE; } /* * Take into account worst-case per-CPU cached * objects if this region is configured for it. */ if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) { uint32_t magazine_max_objs = skmem_cache_magazine_max((uint32_t)objsize); SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name, objcnt, objcnt + magazine_max_objs); objcnt += magazine_max_objs; } SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu " "objcnt %zu", srp->srp_name, seglim, objsize, objcnt); segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE); if (seglim > segsize) { /* * If the segment limit is larger than what we need, * avoid memory wastage by shrinking it. 
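 *
 * As an illustration (values assumed, not taken from the code): if
 * SKMEM_MIN_SEG_SIZE were 16KB, the computed segsize is 16KB, and the
 * default seglim is 32KB, the loop below steps seglim down from 32KB
 * to 16KB and stops; segsize is then set to the shrunken seglim.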
*/ while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) { VERIFY(seglim >= SKMEM_MIN_SEG_SIZE); SK_DF(SK_VERB_MEM, "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu", srp->srp_name, segsize, objsize, objcnt, seglim, P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE, SKMEM_MIN_SEG_SIZE)); seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE, SKMEM_MIN_SEG_SIZE); } /* adjust segment size */ segsize = seglim; } else if (seglim < segsize) { size_t oseglim = seglim; /* * If the segment limit is less than the segment size, * see if increasing it slightly (up to 1.5x the segment * size) would allow us to avoid allocating too many * extra objects (due to excessive segment count). */ while (seglim < segsize && (segsize % seglim) != 0) { SK_DF(SK_VERB_MEM, "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu", srp->srp_name, segsize, objsize, objcnt, seglim, (seglim + SKMEM_MIN_SEG_SIZE)); seglim += SKMEM_MIN_SEG_SIZE; if (seglim >= (oseglim + (oseglim >> 1))) { break; } } /* can't use P2ROUNDUP since seglim may not be power of 2 */ segsize = SK_ROUNDUP(segsize, seglim); } ASSERT(segsize != 0 && (segsize % seglim) == 0); SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu", srp->srp_name, segsize, seglim); /* compute segment count, and recompute segment size */ if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) { segcnt = 1; } else { /* * The adjustments above were done in increments of * SKMEM_MIN_SEG_SIZE. If the object size is greater * than that, ensure that the segment size is a multiple * of the object size. */ if (objsize > SKMEM_MIN_SEG_SIZE) { ASSERT(seglim >= objsize); if ((seglim % objsize) != 0) { seglim += (seglim - objsize); } /* recompute segsize; see SK_ROUNDUP comment above */ segsize = SK_ROUNDUP(segsize, seglim); } segcnt = MAX(1, (segsize / seglim)); segsize /= segcnt; } SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu", srp->srp_name, segcnt, segsize); /* recompute object count to avoid wastage */ objcnt = (segsize * segcnt) / objsize; ASSERT(objcnt != 0); done: srp->srp_c_obj_size = (uint32_t)objsize; srp->srp_c_obj_cnt = (uint32_t)objcnt; srp->srp_c_seg_size = (uint32_t)segsize; srp->srp_seg_cnt = (uint32_t)segcnt; SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu", srp->srp_name, objsize, objcnt, segcnt, segsize); #if SK_LOG if (__improbable(sk_verbose != 0)) { char label[32]; (void) snprintf(label, sizeof(label), "REGION_%s:", skmem_region_id2name(srp->srp_id)); SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label, (uint32_t)srp->srp_r_obj_cnt, (uint32_t)srp->srp_r_obj_size, (uint32_t)srp->srp_c_obj_cnt, (uint32_t)srp->srp_c_obj_size); } #endif /* SK_LOG */ } /* * Create a region. 
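 *
 * A minimal usage sketch (illustrative only; my_seg_ctor, my_seg_dtor,
 * my_ctx and the parameter values are placeholders, not taken from an
 * actual client):
 *
 *	struct skmem_region_params srp;	 // assumed seeded from the
 *					 // default skmem_regions[] template
 *	srp.srp_r_obj_size = 2048;	 // requested object size
 *	srp.srp_r_obj_cnt = 1024;	 // requested object count
 *	skmem_region_params_config(&srp);
 *	skr = skmem_region_create("example", &srp, my_seg_ctor,
 *	    my_seg_dtor, my_ctx);
 *
 *	addr = skmem_region_alloc(skr, &maddr, NULL, NULL, SKMEM_NOSLEEP);
 *	...
 *	skmem_region_free(skr, addr, maddr);
 *	skmem_region_release(skr);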
*/ struct skmem_region * skmem_region_create(const char *name, struct skmem_region_params *srp, sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private) { boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO); uint32_t cflags = srp->srp_cflags; struct skmem_region *skr; uint32_t i; ASSERT(srp->srp_id < SKMEM_REGIONS); ASSERT(srp->srp_c_seg_size != 0 && (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0)); ASSERT(srp->srp_seg_cnt != 0); ASSERT(srp->srp_c_obj_cnt == 1 || (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0); ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size); skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO); skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size; skr->skr_seg_size = srp->srp_c_seg_size; skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt); skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size); if (!pseudo) { skr->skr_seg_max_cnt = srp->srp_seg_cnt; /* set alignment to CPU cache line size */ skr->skr_params.srp_align = skmem_cpu_cache_line_size(); /* allocate the allocated-address hash chain */ skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL; skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT; skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt, skr->skr_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_segment_hash); skr->skr_hash_mask = (skr->skr_hash_initial - 1); skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1; for (i = 0; i < (skr->skr_hash_mask + 1); i++) { TAILQ_INIT(&skr->skr_hash_table[i].sgb_head); } } else { /* this upper bound doesn't apply */ skr->skr_seg_max_cnt = 0; /* pick up value set by skmem_regions_params_config() */ skr->skr_params.srp_align = srp->srp_align; } skr->skr_r_obj_size = srp->srp_r_obj_size; skr->skr_r_obj_cnt = srp->srp_r_obj_cnt; skr->skr_c_obj_size = srp->srp_c_obj_size; skr->skr_c_obj_cnt = srp->srp_c_obj_cnt; skr->skr_params.srp_md_type = srp->srp_md_type; skr->skr_params.srp_md_subtype = srp->srp_md_subtype; skr->skr_params.srp_max_frags = srp->srp_max_frags; skr->skr_seg_ctor = ctor; skr->skr_seg_dtor = dtor; skr->skr_private = private; lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp, &skmem_region_lock_attr); TAILQ_INIT(&skr->skr_seg_free); RB_INIT(&skr->skr_seg_tfree); skr->skr_id = srp->srp_id; uuid_generate_random(skr->skr_uuid); (void) snprintf(skr->skr_name, sizeof(skr->skr_name), "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name); SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ", skr->skr_name, SK_KVA(skr)); /* sanity check */ ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) || !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY | SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK | SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT | SKMEM_REGION_CR_PUREDATA))); skr->skr_cflags = cflags; if (cflags & SKMEM_REGION_CR_NOREDIRECT) { skr->skr_mode |= SKR_MODE_NOREDIRECT; } if (cflags & SKMEM_REGION_CR_MMAPOK) { skr->skr_mode |= SKR_MODE_MMAPOK; } if ((cflags & SKMEM_REGION_CR_MMAPOK) && (cflags & SKMEM_REGION_CR_UREADONLY)) { skr->skr_mode |= SKR_MODE_UREADONLY; } if (cflags & SKMEM_REGION_CR_KREADONLY) { skr->skr_mode |= SKR_MODE_KREADONLY; } if (cflags & SKMEM_REGION_CR_PERSISTENT) { skr->skr_mode |= SKR_MODE_PERSISTENT; } if (cflags & SKMEM_REGION_CR_MONOLITHIC) { skr->skr_mode |= SKR_MODE_MONOLITHIC; } if (cflags & SKMEM_REGION_CR_NOMAGAZINES) { skr->skr_mode |= SKR_MODE_NOMAGAZINES; } if (cflags & SKMEM_REGION_CR_NOCACHE) { skr->skr_mode |= SKR_MODE_NOCACHE; } if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) { skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG; } if (cflags 
& SKMEM_REGION_CR_SHAREOK) { skr->skr_mode |= SKR_MODE_SHAREOK; } if (cflags & SKMEM_REGION_CR_IODIR_IN) { skr->skr_mode |= SKR_MODE_IODIR_IN; } if (cflags & SKMEM_REGION_CR_IODIR_OUT) { skr->skr_mode |= SKR_MODE_IODIR_OUT; } if (cflags & SKMEM_REGION_CR_GUARD) { skr->skr_mode |= SKR_MODE_GUARD; } if (cflags & SKMEM_REGION_CR_PUREDATA) { skr->skr_mode |= SKR_MODE_PUREDATA; } if (cflags & SKMEM_REGION_CR_PSEUDO) { skr->skr_mode |= SKR_MODE_PSEUDO; } if (cflags & SKMEM_REGION_CR_THREADSAFE) { skr->skr_mode |= SKR_MODE_THREADSAFE; } if (cflags & SKMEM_REGION_CR_MEMTAG) { skr->skr_mode |= SKR_MODE_MEMTAG; } #if XNU_TARGET_OS_OSX /* * Mark all regions as persistent except for the guard and Intrinsic * regions. * This is to ensure that kernel threads won't be faulting-in while * accessing these memory regions. We have observed various kinds of * kernel panics due to kernel threads faulting on non-wired memory * access when the VM subsystem is not in a state to swap-in the page. */ if (!((skr->skr_mode & SKR_MODE_PSEUDO) || (skr->skr_mode & SKR_MODE_GUARD))) { skr->skr_mode |= SKR_MODE_PERSISTENT; } #endif /* XNU_TARGET_OS_OSX */ /* SKR_MODE_UREADONLY only takes effect for user task mapping */ skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY); skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY); skr->skr_bufspec.purgeable = TRUE; skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE); skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG); skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN); skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT); skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA); skr->skr_bufspec.threadSafe = !!(skr->skr_mode & SKR_MODE_THREADSAFE); skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT); /* allocate segment bitmaps */ if (!(skr->skr_mode & SKR_MODE_PSEUDO)) { ASSERT(skr->skr_seg_max_cnt != 0); skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt); skr->skr_seg_bmap = sk_alloc_data(BITMAP_SIZE(skr->skr_seg_max_cnt), Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap); ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) == (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap))); /* mark all bitmaps as free (bit set) */ bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt); } /* * Populate the freelist by allocating all segments for the * region, which will be mapped but not faulted-in, and then * immediately insert each to the freelist. That will in * turn unmap the segment's memory object. 
SKR_LOCK(skr); if (skr->skr_mode & SKR_MODE_PSEUDO) { char zone_name[64]; (void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s", SKMEM_ZONE_PREFIX, name); skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size, ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE); } else { /* create a backing IOSKRegion object */ if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec, (IOSKSize)skr->skr_seg_size, (IOSKCount)skr->skr_seg_max_cnt)) == NULL) { SK_ERR("\"%s\": [%u * %u] cflags 0x%b skr_reg failed", skr->skr_name, (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags, SKMEM_REGION_CR_BITS); goto failed; } } ASSERT(skr->skr_seg_objs != 0); ++skr->skr_refcnt; /* for caller */ SKR_UNLOCK(skr); SKMEM_REGION_LOCK(); TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link); SKMEM_REGION_UNLOCK(); SK_DF(SK_VERB_MEM_REGION, " [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b", (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt, (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt, skr->skr_cflags, SKMEM_REGION_CR_BITS); return skr; failed: SKR_LOCK_ASSERT_HELD(skr); skmem_region_destroy(skr); return NULL; } /* * Destroy a region. */ static void skmem_region_destroy(struct skmem_region *skr) { struct skmem_region *mskr; SKR_LOCK_ASSERT_HELD(skr); SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx", skr->skr_name, SK_KVA(skr)); /* * Panic if we detect there are unfreed segments; the caller * destroying this region is responsible for ensuring that all * allocated segments have been freed prior to getting here. */ ASSERT(skr->skr_refcnt == 0); if (skr->skr_seginuse != 0) { panic("%s: '%s' (%p) not empty (%u unfreed)", __func__, skr->skr_name, (void *)skr, skr->skr_seginuse); /* NOTREACHED */ __builtin_unreachable(); } if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) { SKR_UNLOCK(skr); SKMEM_REGION_LOCK(); TAILQ_REMOVE(&skmem_region_head, skr, skr_link); SKMEM_REGION_UNLOCK(); SKR_LOCK(skr); ASSERT(skr->skr_refcnt == 0); } /* * Undo what's done earlier at region creation time. 
*/ skmem_region_depopulate(skr); ASSERT(TAILQ_EMPTY(&skr->skr_seg_free)); ASSERT(RB_EMPTY(&skr->skr_seg_tfree)); ASSERT(skr->skr_seg_free_cnt == 0); if (skr->skr_reg != NULL) { ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); IOSKRegionDestroy(skr->skr_reg); skr->skr_reg = NULL; } if (skr->skr_zreg != NULL) { ASSERT(skr->skr_mode & SKR_MODE_PSEUDO); zdestroy(skr->skr_zreg); skr->skr_zreg = NULL; } if (skr->skr_seg_bmap != NULL) { ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); #if (DEBUG || DEVELOPMENT) ASSERT(skr->skr_seg_bmap_len != 0); /* must have been set to vacant (bit set) by now */ assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt)); #endif /* DEBUG || DEVELOPMENT */ sk_free_data(skr->skr_seg_bmap, BITMAP_SIZE(skr->skr_seg_max_cnt)); skr->skr_seg_bmap = NULL; skr->skr_seg_bmap_len = 0; } ASSERT(skr->skr_seg_bmap_len == 0); if (skr->skr_hash_table != NULL) { ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); #if (DEBUG || DEVELOPMENT) for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) { ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head)); } #endif /* DEBUG || DEVELOPMENT */ sk_free_type_array(struct sksegment_bkt, skr->skr_hash_mask + 1, skr->skr_hash_table); skr->skr_hash_table = NULL; } if ((mskr = skr->skr_mirror) != NULL) { ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); skr->skr_mirror = NULL; mskr->skr_mode &= ~SKR_MODE_MIRRORED; } SKR_UNLOCK(skr); if (mskr != NULL) { skmem_region_release(mskr); } lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp); zfree(skr_zone, skr); } /* * Mirror mskr (slave) to skr (master). */ void skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr) { SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ", SK_KVA(skr), SK_KVA(mskr)); SKR_LOCK(skr); ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED)); ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED)); ASSERT(skr->skr_mirror == NULL); /* both regions must share identical parameters */ ASSERT(skr->skr_size == mskr->skr_size); ASSERT(skr->skr_seg_size == mskr->skr_seg_size); ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt); skr->skr_mirror = mskr; skmem_region_retain(mskr); mskr->skr_mode |= SKR_MODE_MIRRORED; SKR_UNLOCK(skr); } void skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm, bool attach) { int i; SKR_LOCK(skr); if (attach) { for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != NULL; i++) { ; } VERIFY(i < SKR_MAX_CACHES); ASSERT(skr->skr_cache[i] == NULL); skr->skr_mode |= SKR_MODE_SLAB; skr->skr_cache[i] = skm; skmem_region_retain_locked(skr); SKR_UNLOCK(skr); } else { ASSERT(skr->skr_mode & SKR_MODE_SLAB); for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != skm; i++) { ; } VERIFY(i < SKR_MAX_CACHES); ASSERT(skr->skr_cache[i] == skm); skr->skr_cache[i] = NULL; for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] == NULL; i++) { ; } if (i == SKR_MAX_CACHES) { skr->skr_mode &= ~SKR_MODE_SLAB; } if (!skmem_region_release_locked(skr)) { SKR_UNLOCK(skr); } } } /* * Common routines for skmem_region_{alloc,mirror_alloc}. 
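 *
 * The segment is chained into the allocated-address hash table keyed by
 * its starting address (see SKMEM_REGION_HASH), which is what allows
 * skmem_region_free() to map a client-supplied address back to its
 * sksegment later on.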
*/ static void * skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg) { struct sksegment_bkt *sgb; void *addr; SKR_LOCK_ASSERT_HELD(skr); ASSERT(sg->sg_md != NULL); ASSERT(sg->sg_start != 0 && sg->sg_end != 0); addr = (void *)sg->sg_start; sgb = SKMEM_REGION_HASH(skr, addr); ASSERT(sg->sg_link.tqe_next == NULL); ASSERT(sg->sg_link.tqe_prev == NULL); TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link); skr->skr_seginuse++; skr->skr_meminuse += skr->skr_seg_size; if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) { skr->skr_w_meminuse += skr->skr_seg_size; } skr->skr_alloc++; return addr; } /* * Allocate a segment from the region. */ void * skmem_region_alloc(struct skmem_region *skr, void **maddr, struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag) { struct sksegment *sg = NULL; struct sksegment *sg1 = NULL; void *addr = NULL, *addr1 = NULL; uint32_t retries = 0; VERIFY(!(skr->skr_mode & SKR_MODE_GUARD)); if (retsg != NULL) { *retsg = NULL; } if (retsgm != NULL) { *retsgm = NULL; } /* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */ VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) != (SKMEM_NOSLEEP | SKMEM_FAILOK)); SKR_LOCK(skr); while (sg == NULL) { /* see if there's a segment in the freelist */ sg = TAILQ_FIRST(&skr->skr_seg_free); if (sg == NULL) { /* see if we can grow the freelist */ sg = sksegment_freelist_grow(skr); if (sg != NULL) { break; } if (skr->skr_mode & SKR_MODE_SLAB) { SKR_UNLOCK(skr); /* * None found; it's possible that the slab * layer is caching extra amount, so ask * skmem_cache to reap/purge its caches. */ for (int i = 0; i < SKR_MAX_CACHES; i++) { if (skr->skr_cache[i] == NULL) { continue; } skmem_cache_reap_now(skr->skr_cache[i], TRUE); } SKR_LOCK(skr); /* * If we manage to get some freed, try again. */ if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) { continue; } } /* * Give up if this is a non-blocking allocation, * or if this is a blocking allocation but the * caller is willing to retry. */ if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) { break; } /* otherwise we wait until one is available */ ++skr->skr_seg_waiters; (void) msleep(&skr->skr_seg_free, &skr->skr_lock, (PZERO - 1), skr->skr_name, NULL); } } SKR_LOCK_ASSERT_HELD(skr); if (sg != NULL) { retry: /* * We have a segment; remove it from the freelist and * insert it into the allocated-address hash chain. * Note that this may return NULL if we can't allocate * the memory descriptor. */ if (sksegment_freelist_remove(skr, sg, skmflag, FALSE) == NULL) { ASSERT(sg->sg_state == SKSEG_STATE_DETACHED); ASSERT(sg->sg_md == NULL); ASSERT(sg->sg_start == 0 && sg->sg_end == 0); /* * If it's non-blocking allocation, simply just give * up and let the caller decide when to retry. Else, * it gets a bit complicated due to the contract we * have for blocking allocations with the client; the * most sensible thing to do here is to retry the * allocation ourselves. Note that we keep using the * same segment we originally got, since we only need * the memory descriptor to be allocated for it; thus * we make sure we don't release the region lock when * retrying allocation. Doing so is crucial when the * region is mirrored, since the segment indices on * both regions need to match. 
*/ if (skmflag & SKMEM_NOSLEEP) { SK_ERR("\"%s\": failed to allocate segment " "(non-sleeping mode)", skr->skr_name); sg = NULL; } else { if (++retries > SKMEM_WDT_MAXTIME) { panic_plain("\"%s\": failed to " "allocate segment (sleeping mode) " "after %u retries\n\n%s", skr->skr_name, SKMEM_WDT_MAXTIME, skmem_dump(skr)); /* NOTREACHED */ __builtin_unreachable(); } else { SK_ERR("\"%s\": failed to allocate " "segment (sleeping mode): %u " "retries", skr->skr_name, retries); } if (skr->skr_mode & SKR_MODE_SLAB) { /* * We can't get any memory descriptor * for this segment; reap extra cached * objects from the slab layer and hope * that we get lucky next time around. * * XXX adi@apple.com: perhaps also * trigger the zone allocator to do * its garbage collection here? */ skmem_cache_reap(); } delay(1 * USEC_PER_SEC); /* 1 sec */ goto retry; } } if (sg != NULL) { /* insert to allocated-address hash chain */ addr = skmem_region_alloc_common(skr, sg); } } if (sg == NULL) { VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)); if (skmflag & SKMEM_PANIC) { VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) == SKMEM_NOSLEEP); /* * If is a failed non-blocking alloc and the caller * insists that it must be successful, then panic. */ panic_plain("\"%s\": skr 0x%p unable to satisfy " "mandatory allocation\n", skr->skr_name, skr); /* NOTREACHED */ __builtin_unreachable(); } else { /* * Give up if this is a non-blocking allocation, * or one where the caller is willing to handle * allocation failures. */ goto done; } } ASSERT((mach_vm_address_t)addr == sg->sg_start); #if SK_LOG SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx", SK_KVA(skr), SK_KVA(sg)); if (skr->skr_mirror == NULL || !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) { SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)", sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end)); } else { SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored", sg->sg_index, SK_KVA(sg), SK_KVA(sg->sg_start), SK_KVA(sg->sg_end)); } #endif /* SK_LOG */ /* * If mirroring, allocate shadow object from slave region. */ if (skr->skr_mirror != NULL) { ASSERT(skr->skr_mirror != skr); ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED)); ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED); addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg, &sg1); ASSERT(addr1 != NULL); ASSERT(sg1 != NULL && sg1 != sg); ASSERT(sg1->sg_index == sg->sg_index); } done: SKR_UNLOCK(skr); /* return segment metadata to caller if asked (reference not needed) */ if (addr != NULL) { if (retsg != NULL) { *retsg = sg; } if (retsgm != NULL) { *retsgm = sg1; } } if (maddr != NULL) { *maddr = addr1; } return addr; } /* * Allocate a segment from a mirror region at the same index. While it * is somewhat a simplified variant of skmem_region_alloc, keeping it * separate allows us to avoid further convoluting that routine. */ static void * skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0, struct sksegment **retsg) { struct sksegment sg_key = { .sg_index = sg0->sg_index }; struct sksegment *sg = NULL; void *addr = NULL; ASSERT(skr->skr_mode & SKR_MODE_MIRRORED); ASSERT(skr->skr_mirror == NULL); ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC); if (retsg != NULL) { *retsg = NULL; } SKR_LOCK(skr); /* * See if we can find one in the freelist first. Otherwise, * create a new segment of the same index and add that to the * freelist. We would always get a segment since both regions * are synchronized when it comes to the indices of allocated * segments. 
*/ sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key); if (sg == NULL) { sg = sksegment_alloc_with_idx(skr, sg0->sg_index); VERIFY(sg != NULL); } VERIFY(sg->sg_index == sg0->sg_index); /* * We have a segment; remove it from the freelist and insert * it into the allocated-address hash chain. This either * succeeds or panics (SKMEM_PANIC) when a memory descriptor * can't be allocated. * * TODO: consider retrying IOBMD allocation attempts if needed. */ sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE); VERIFY(sg != NULL); /* insert to allocated-address hash chain */ addr = skmem_region_alloc_common(skr, sg); #if SK_LOG SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx", SK_KVA(skr), SK_KVA(sg)); SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)", sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end)); #endif /* SK_LOG */ SKR_UNLOCK(skr); /* return segment metadata to caller if asked (reference not needed) */ if (retsg != NULL) { *retsg = sg; } return addr; } /* * Free a segment to the region. */ void skmem_region_free(struct skmem_region *skr, void *addr, void *maddr) { struct sksegment_bkt *sgb; struct sksegment *sg, *tsg; VERIFY(!(skr->skr_mode & SKR_MODE_GUARD)); /* * Search the hash chain to find a matching segment for the * given address. If found, remove the segment from the * hash chain and insert it into the freelist. Otherwise, * we panic since the caller has given us a bogus address. */ SKR_LOCK(skr); sgb = SKMEM_REGION_HASH(skr, addr); TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) { ASSERT(sg->sg_start != 0 && sg->sg_end != 0); if (sg->sg_start == (mach_vm_address_t)addr) { TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link); sg->sg_link.tqe_next = NULL; sg->sg_link.tqe_prev = NULL; break; } } ASSERT(sg != NULL); if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) { ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size); skr->skr_w_meminuse -= skr->skr_seg_size; } sksegment_freelist_insert(skr, sg, FALSE); ASSERT(skr->skr_seginuse != 0); skr->skr_seginuse--; skr->skr_meminuse -= skr->skr_seg_size; skr->skr_free++; #if SK_LOG SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx", SK_KVA(skr), SK_KVA(sg)); if (skr->skr_mirror == NULL || !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) { SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)", sg->sg_index, SK_KVA(addr), SK_KVA((uintptr_t)addr + skr->skr_seg_size)); } else { SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored", sg->sg_index, SK_KVA(sg), SK_KVA(addr), SK_KVA((uintptr_t)addr + skr->skr_seg_size)); } #endif /* SK_LOG */ /* * If mirroring, also free shadow object in slave region. */ if (skr->skr_mirror != NULL) { ASSERT(maddr != NULL); ASSERT(skr->skr_mirror != skr); ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED)); ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED); skmem_region_free(skr->skr_mirror, maddr, NULL); } /* wake up any blocked threads waiting for a segment */ if (skr->skr_seg_waiters != 0) { SK_DF(SK_VERB_MEM_REGION, "sg 0x%llx waking up %u waiters", SK_KVA(sg), skr->skr_seg_waiters); skr->skr_seg_waiters = 0; wakeup(&skr->skr_seg_free); } SKR_UNLOCK(skr); } __attribute__((always_inline)) static inline void skmem_region_retain_locked(struct skmem_region *skr) { SKR_LOCK_ASSERT_HELD(skr); skr->skr_refcnt++; ASSERT(skr->skr_refcnt != 0); } /* * Retain a segment. 
*/ void skmem_region_retain(struct skmem_region *skr) { SKR_LOCK(skr); skmem_region_retain_locked(skr); SKR_UNLOCK(skr); } __attribute__((always_inline)) static inline boolean_t skmem_region_release_locked(struct skmem_region *skr) { SKR_LOCK_ASSERT_HELD(skr); ASSERT(skr->skr_refcnt != 0); if (--skr->skr_refcnt == 0) { skmem_region_destroy(skr); return TRUE; } return FALSE; } /* * Release (and potentially destroy) a segment. */ boolean_t skmem_region_release(struct skmem_region *skr) { boolean_t lastref; SKR_LOCK(skr); if (!(lastref = skmem_region_release_locked(skr))) { SKR_UNLOCK(skr); } return lastref; } /* * Depopulate the segment freelist. */ static void skmem_region_depopulate(struct skmem_region *skr) { struct sksegment *sg, *tsg; SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ", skr->skr_name, SK_KVA(skr)); SKR_LOCK_ASSERT_HELD(skr); ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO)); TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) { struct sksegment *sg0; uint32_t i; i = sg->sg_index; sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE); VERIFY(sg0 == sg); sksegment_destroy(skr, sg); ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ)); } } /* * Free tree segment compare routine. */ static int sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2) { return sg1->sg_index - sg2->sg_index; } /* * Create a segment. * * Upon success, clear the bit for the segment's index in skr_seg_bmap bitmap. */ static struct sksegment * sksegment_create(struct skmem_region *skr, uint32_t i) { struct sksegment *sg = NULL; bitmap_t *bmap; SKR_LOCK_ASSERT_HELD(skr); ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); ASSERT(i < skr->skr_seg_max_cnt); ASSERT(skr->skr_reg != NULL); ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size)); bmap = &skr->skr_seg_bmap[i / BMAPSZ]; ASSERT(bit_test(*bmap, i % BMAPSZ)); sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP); bzero(sg, sg_size); sg->sg_region = skr; sg->sg_index = i; sg->sg_state = SKSEG_STATE_DETACHED; /* claim it (clear bit) */ bit_clear(*bmap, i % BMAPSZ); SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b", i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode, SKR_MODE_BITS); return sg; } /* * Destroy a segment. * * Set the bit for the segment's index in skr_seg_bmap bitmap, * indicating that it is now vacant. */ static void sksegment_destroy(struct skmem_region *skr, struct sksegment *sg) { uint32_t i = sg->sg_index; bitmap_t *bmap; SKR_LOCK_ASSERT_HELD(skr); ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); ASSERT(skr == sg->sg_region); ASSERT(skr->skr_reg != NULL); ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED); ASSERT(i < skr->skr_seg_max_cnt); bmap = &skr->skr_seg_bmap[i / BMAPSZ]; ASSERT(!bit_test(*bmap, i % BMAPSZ)); SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b", i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode, SKR_MODE_BITS); /* * Undo what's done earlier at segment creation time. */ ASSERT(sg->sg_md == NULL); ASSERT(sg->sg_start == 0 && sg->sg_end == 0); ASSERT(sg->sg_state == SKSEG_STATE_DETACHED); /* release it (set bit) */ bit_set(*bmap, i % BMAPSZ); skmem_cache_free(skmem_sg_cache, sg); } /* * Insert a segment into freelist (freeing the segment). 
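 *
 * Unless the region is still being populated, this is where a segment's
 * backing memory is torn down: the client dtor runs first, the buffer is
 * detached from the IOSKRegion, unwired if the region is persistent,
 * discarded, and finally its memory descriptor is destroyed before the
 * segment goes back onto the freelist and the free tree.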
*/ static void sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg, boolean_t populating) { SKR_LOCK_ASSERT_HELD(skr); ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); ASSERT(sg->sg_type != SKSEG_TYPE_FREE); ASSERT(skr == sg->sg_region); ASSERT(skr->skr_reg != NULL); ASSERT(sg->sg_index < skr->skr_seg_max_cnt); /* * If the region is being populated, then we're done. */ if (__improbable(populating)) { ASSERT(sg->sg_md == NULL); ASSERT(sg->sg_start == 0 && sg->sg_end == 0); ASSERT(sg->sg_state == SKSEG_STATE_DETACHED); } else { IOSKMemoryBufferRef md; IOReturn err; ASSERT(sg->sg_md != NULL); ASSERT(sg->sg_start != 0 && sg->sg_end != 0); /* * Let the client remove the memory from IOMMU, and unwire it. */ if (skr->skr_seg_dtor != NULL) { skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private); } ASSERT(sg->sg_state == SKSEG_STATE_MAPPED || sg->sg_state == SKSEG_STATE_MAPPED_WIRED); IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md); VERIFY(sg->sg_md == md); /* if persistent, unwire this memory now */ if (skr->skr_mode & SKR_MODE_PERSISTENT) { err = IOSKMemoryUnwire(md); if (err != kIOReturnSuccess) { panic("Fail to unwire md %p, err %d", md, err); } } /* mark memory as empty/discarded for consistency */ err = IOSKMemoryDiscard(md); if (err != kIOReturnSuccess) { panic("Fail to discard md %p, err %d", md, err); } IOSKMemoryDestroy(md); sg->sg_md = NULL; sg->sg_start = sg->sg_end = 0; sg->sg_state = SKSEG_STATE_DETACHED; ASSERT(skr->skr_memtotal >= skr->skr_seg_size); skr->skr_memtotal -= skr->skr_seg_size; } sg->sg_type = SKSEG_TYPE_FREE; ASSERT(sg->sg_link.tqe_next == NULL); ASSERT(sg->sg_link.tqe_prev == NULL); TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link); ASSERT(sg->sg_node.rbe_left == NULL); ASSERT(sg->sg_node.rbe_right == NULL); ASSERT(sg->sg_node.rbe_parent == NULL); RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg); ++skr->skr_seg_free_cnt; ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt); } /* * Remove a segment from the freelist (allocating the segment). */ static struct sksegment * sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg, uint32_t skmflag, boolean_t purging) { #pragma unused(skmflag) mach_vm_address_t segstart; IOReturn err; SKR_LOCK_ASSERT_HELD(skr); ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); ASSERT(sg != NULL); ASSERT(skr == sg->sg_region); ASSERT(skr->skr_reg != NULL); ASSERT(sg->sg_type == SKSEG_TYPE_FREE); ASSERT(sg->sg_index < skr->skr_seg_max_cnt); #if (DEVELOPMENT || DEBUG) uint64_t mtbf = skmem_region_get_mtbf(); /* * MTBF doesn't apply when SKMEM_PANIC is set as caller would assert. */ if (__improbable(mtbf != 0 && !purging && (net_uptime_ms() % mtbf) == 0 && !(skmflag & SKMEM_PANIC))) { SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure", skr->skr_name, SK_KVA(skr), SK_KVA(sg)); net_update_uptime(); return NULL; } #endif /* (DEVELOPMENT || DEBUG) */ TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link); sg->sg_link.tqe_next = NULL; sg->sg_link.tqe_prev = NULL; RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg); sg->sg_node.rbe_left = NULL; sg->sg_node.rbe_right = NULL; sg->sg_node.rbe_parent = NULL; ASSERT(skr->skr_seg_free_cnt != 0); --skr->skr_seg_free_cnt; /* * If the region is being depopulated, then we're done. 
*/ if (__improbable(purging)) { ASSERT(sg->sg_md == NULL); ASSERT(sg->sg_start == 0 && sg->sg_end == 0); ASSERT(sg->sg_state == SKSEG_STATE_DETACHED); sg->sg_type = SKSEG_TYPE_DESTROYED; return sg; } ASSERT(sg->sg_md == NULL); ASSERT(sg->sg_start == 0 && sg->sg_end == 0); ASSERT(sg->sg_state == SKSEG_STATE_DETACHED); /* created as non-volatile (mapped) upon success */ if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size, &skr->skr_bufspec, &segstart)) == NULL) { ASSERT(sg->sg_type == SKSEG_TYPE_FREE); if (skmflag & SKMEM_PANIC) { /* if the caller insists for a success then panic */ panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable " "to satisfy mandatory allocation\n", skr->skr_name, skr, sg, sg->sg_index); /* NOTREACHED */ __builtin_unreachable(); } /* reinsert this segment to freelist */ ASSERT(sg->sg_link.tqe_next == NULL); ASSERT(sg->sg_link.tqe_prev == NULL); TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link); ASSERT(sg->sg_node.rbe_left == NULL); ASSERT(sg->sg_node.rbe_right == NULL); ASSERT(sg->sg_node.rbe_parent == NULL); RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg); ++skr->skr_seg_free_cnt; return NULL; } sg->sg_start = segstart; sg->sg_end = (segstart + skr->skr_seg_size); ASSERT(sg->sg_start != 0 && sg->sg_end != 0); /* mark memory as non-volatile just to be consistent */ err = IOSKMemoryReclaim(sg->sg_md); if (err != kIOReturnSuccess) { panic("Fail to reclaim md %p, err %d", sg->sg_md, err); } /* if persistent, wire down its memory now */ if (skr->skr_mode & SKR_MODE_PERSISTENT) { err = IOSKMemoryWire(sg->sg_md); if (err != kIOReturnSuccess) { panic("Fail to wire md %p, err %d", sg->sg_md, err); } } err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md); if (err != kIOReturnSuccess) { panic("Fail to set md %p, err %d", sg->sg_md, err); } /* * Let the client wire it and insert to IOMMU, if applicable. * Try to find out if it's wired and set the right state. */ if (skr->skr_seg_ctor != NULL) { skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private); } sg->sg_state = IOSKBufferIsWired(sg->sg_md) ? SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED; skr->skr_memtotal += skr->skr_seg_size; ASSERT(sg->sg_md != NULL); ASSERT(sg->sg_start != 0 && sg->sg_end != 0); sg->sg_type = SKSEG_TYPE_ALLOC; return sg; } /* * Find the first available index and allocate a segment at that index. */ static struct sksegment * sksegment_freelist_grow(struct skmem_region *skr) { struct sksegment *sg = NULL; uint32_t i, j, idx; SKR_LOCK_ASSERT_HELD(skr); ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO)); ASSERT(skr->skr_seg_bmap_len != 0); ASSERT(skr->skr_seg_max_cnt != 0); for (i = 0; i < skr->skr_seg_bmap_len; i++) { bitmap_t *bmap, mask; uint32_t end = (BMAPSZ - 1); if (i == (skr->skr_seg_bmap_len - 1)) { end = (skr->skr_seg_max_cnt - 1) % BMAPSZ; } bmap = &skr->skr_seg_bmap[i]; mask = BMASK64(0, end); j = ffsll((*bmap) & mask); if (j == 0) { continue; } --j; idx = (i * BMAPSZ) + j; sg = sksegment_alloc_with_idx(skr, idx); /* we're done */ break; } ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt)); return sg; } /* * Create a single segment at a specific index and add it to the freelist. 
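 *
 * This is used both by sksegment_freelist_grow(), which picks the first
 * vacant index, and by skmem_region_mirror_alloc(), which must obtain a
 * segment at the same index as its master region's segment.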
*/ static struct sksegment * sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx) { struct sksegment *sg; SKR_LOCK_ASSERT_HELD(skr); if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) { panic("%s: '%s' (%p) idx %u (out of %u) is already allocated", __func__, skr->skr_name, (void *)skr, idx, (skr->skr_seg_max_cnt - 1)); /* NOTREACHED */ __builtin_unreachable(); } /* must not fail, blocking alloc */ sg = sksegment_create(skr, idx); VERIFY(sg != NULL); VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)); /* populate the freelist */ sksegment_freelist_insert(skr, sg, TRUE); ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead)); #if (DEVELOPMENT || DEBUG) struct sksegment sg_key = { .sg_index = sg->sg_index }; ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key)); #endif /* (DEVELOPMENT || DEBUG) */ SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt); return sg; } /* * Rescale the regions's allocated-address hash table. */ static void skmem_region_hash_rescale(struct skmem_region *skr) { struct sksegment_bkt *old_table, *new_table; size_t old_size, new_size; uint32_t i, moved = 0; if (skr->skr_mode & SKR_MODE_PSEUDO) { ASSERT(skr->skr_hash_table == NULL); /* this is no-op for pseudo region */ return; } ASSERT(skr->skr_hash_table != NULL); /* insist that we are executing in the update thread call context */ ASSERT(sk_is_region_update_protected()); /* * To get small average lookup time (lookup depth near 1.0), the hash * table size should be roughly the same (not necessarily equivalent) * as the region size. */ new_size = MAX(skr->skr_hash_initial, (1 << (flsll(3 * skr->skr_seginuse + 4) - 2))); new_size = MIN(skr->skr_hash_limit, new_size); old_size = (skr->skr_hash_mask + 1); if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) { return; } new_table = sk_alloc_type_array(struct sksegment_bkt, new_size, Z_NOWAIT, skmem_tag_segment_hash); if (__improbable(new_table == NULL)) { return; } for (i = 0; i < new_size; i++) { TAILQ_INIT(&new_table[i].sgb_head); } SKR_LOCK(skr); old_size = (skr->skr_hash_mask + 1); old_table = skr->skr_hash_table; skr->skr_hash_mask = (uint32_t)(new_size - 1); skr->skr_hash_table = new_table; skr->skr_rescale++; for (i = 0; i < old_size; i++) { struct sksegment_bkt *sgb = &old_table[i]; struct sksegment_bkt *new_sgb; struct sksegment *sg; while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) { TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link); ASSERT(sg->sg_start != 0 && sg->sg_end != 0); new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start); TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link); ++moved; } ASSERT(TAILQ_EMPTY(&sgb->sgb_head)); } SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr), (uint32_t)old_size, (uint32_t)new_size, moved); SKR_UNLOCK(skr); sk_free_type_array(struct sksegment_bkt, old_size, old_table); } /* * Apply a function to operate on all regions. 
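 *
 * For example, the periodic update thread call (skmem_region_update_func()
 * below) invokes this as skmem_region_applyall(skmem_region_update) to
 * wake up segment waiters and rescale each region's hash table.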
/*
 * Apply a function to operate on all regions.
 */
static void
skmem_region_applyall(void (*func)(struct skmem_region *))
{
	struct skmem_region *skr;

	net_update_uptime();

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		func(skr);
	}
	SKMEM_REGION_UNLOCK();
}

static void
skmem_region_update(struct skmem_region *skr)
{
	SKMEM_REGION_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	SKR_LOCK(skr);

	/*
	 * If there are threads blocked waiting for an available
	 * segment, wake them up periodically so they can issue
	 * another skmem_cache_reap() to reclaim resources cached
	 * by skmem_cache.
	 */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);

	/*
	 * Rescale the hash table if needed.
	 */
	skmem_region_hash_rescale(skr);
}

/*
 * Thread call callback for update.
 */
static void
skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	protect = sk_region_update_protect();
	skmem_region_applyall(skmem_region_update);
	sk_region_update_unprotect(protect);

	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));
}

boolean_t
skmem_region_for_pp(skmem_region_id_t id)
{
	int i;

	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
		if (id == skmem_pp_region_ids[i]) {
			return TRUE;
		}
	}

	return FALSE;
}

void
skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
{
	bzero(sreg, sizeof(*sreg));

	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
	    "%s", skr->skr_name);
	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
	sreg->sreg_mode = skr->skr_mode;
	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
	sreg->sreg_c_seg_size = skr->skr_seg_size;
	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
	sreg->sreg_seg_objs = skr->skr_seg_objs;
	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
	sreg->sreg_align = skr->skr_align;
	sreg->sreg_max_frags = skr->skr_max_frags;
	sreg->sreg_meminuse = skr->skr_meminuse;
	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
	sreg->sreg_memtotal = skr->skr_memtotal;
	sreg->sreg_seginuse = skr->skr_seginuse;
	sreg->sreg_rescale = skr->skr_rescale;
	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
	sreg->sreg_alloc = skr->skr_alloc;
	sreg->sreg_free = skr->skr_free;
}

static size_t
skmem_region_mib_get_stats(struct skmem_region *skr, void *out, size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_region);
	struct sk_stats_region *sreg = out;

	if (out == NULL || len < actual_space) {
		goto done;
	}

	skmem_region_get_stats(skr, sreg);

done:
	return actual_space;
}

static int
skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_region *skr;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space;
	caddr_t buffer = NULL;
	caddr_t scan;
	int error = 0;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK,
		    skmem_tag_region_mib);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
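		/*
		 * A NULL oldptr is the standard sysctl size probe: the
		 * caller only wants to learn how much space to allocate.
		 * Leave the kernel buffer unallocated and fall through to
		 * accumulate actual_space below; SYSCTL_OUT() then reports
		 * that total back as the required length.
		 */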
		buffer_space = 0;
	}

	actual_space = 0;
	scan = buffer;

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		size_t size = skmem_region_mib_get_stats(skr, scan,
		    buffer_space);

		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SKMEM_REGION_UNLOCK();

	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}

#if SK_LOG
const char *
skmem_region_id2name(skmem_region_id_t id)
{
	const char *name;

	switch (id) {
	case SKMEM_REGION_SCHEMA:       name = "SCHEMA";        break;
	case SKMEM_REGION_RING:         name = "RING";          break;
	case SKMEM_REGION_BUF_DEF:      name = "BUF_DEF";       break;
	case SKMEM_REGION_BUF_LARGE:    name = "BUF_LARGE";     break;
	case SKMEM_REGION_RXBUF_DEF:    name = "RXBUF_DEF";     break;
	case SKMEM_REGION_RXBUF_LARGE:  name = "RXBUF_LARGE";   break;
	case SKMEM_REGION_TXBUF_DEF:    name = "TXBUF_DEF";     break;
	case SKMEM_REGION_TXBUF_LARGE:  name = "TXBUF_LARGE";   break;
	case SKMEM_REGION_UMD:          name = "UMD";           break;
	case SKMEM_REGION_TXAUSD:       name = "TXAUSD";        break;
	case SKMEM_REGION_RXFUSD:       name = "RXFUSD";        break;
	case SKMEM_REGION_USTATS:       name = "USTATS";        break;
	case SKMEM_REGION_FLOWADV:      name = "FLOWADV";       break;
	case SKMEM_REGION_NEXUSADV:     name = "NEXUSADV";      break;
	case SKMEM_REGION_SYSCTLS:      name = "SYSCTLS";       break;
	case SKMEM_REGION_GUARD_HEAD:   name = "HEADGUARD";     break;
	case SKMEM_REGION_GUARD_TAIL:   name = "TAILGUARD";     break;
	case SKMEM_REGION_KMD:          name = "KMD";           break;
	case SKMEM_REGION_RXKMD:        name = "RXKMD";         break;
	case SKMEM_REGION_TXKMD:        name = "TXKMD";         break;
	case SKMEM_REGION_TXAKSD:       name = "TXAKSD";        break;
	case SKMEM_REGION_RXFKSD:       name = "RXFKSD";        break;
	case SKMEM_REGION_KSTATS:       name = "KSTATS";        break;
	case SKMEM_REGION_KBFT:         name = "KBFT";          break;
	case SKMEM_REGION_UBFT:         name = "UBFT";          break;
	case SKMEM_REGION_RXKBFT:       name = "RXKBFT";        break;
	case SKMEM_REGION_TXKBFT:       name = "TXKBFT";        break;
	case SKMEM_REGION_INTRINSIC:    name = "INTRINSIC";     break;
	default:                        name = "UNKNOWN";       break;
	}

	return name;
}
#endif /* SK_LOG */

#if (DEVELOPMENT || DEBUG)
uint64_t
skmem_region_get_mtbf(void)
{
	return skmem_region_mtbf;
}

void
skmem_region_set_mtbf(uint64_t newval)
{
	if (newval < SKMEM_REGION_MTBF_MIN) {
		if (newval != 0) {
			newval = SKMEM_REGION_MTBF_MIN;
		}
	} else if (newval > SKMEM_REGION_MTBF_MAX) {
		newval = SKMEM_REGION_MTBF_MAX;
	}

	if (skmem_region_mtbf != newval) {
		os_atomic_store(&skmem_region_mtbf, newval, release);
		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
	}
}

static int
skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
    struct sysctl_req *req)
{
#pragma unused(oidp, arg1, arg2)
	int changed, error;
	uint64_t newval;

	_CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
	if ((error = sysctl_io_number(req, skmem_region_mtbf,
	    sizeof(uint64_t), &newval, &changed)) == 0) {
		if (changed) {
			skmem_region_set_mtbf(newval);
		}
	}

	return error;
}
#endif /* (DEVELOPMENT || DEBUG) */
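/*
 * Illustrative userland sketch (not compiled as part of this file) showing
 * how the statistics exported by skmem_region_mib_get_sysctl() above can be
 * read.  The MIB name "kern.skywalk.stats.region" is inferred from the
 * SYSCTL_PROC declaration in this file, and struct sk_stats_region is
 * assumed to come from the corresponding Skywalk statistics header; both
 * are assumptions, not guarantees.  The two-pass pattern mirrors what the
 * handler supports: a NULL buffer first to learn the required size, then
 * the actual read.
 *
 *	size_t len = 0;
 *	if (sysctlbyname("kern.skywalk.stats.region", NULL, &len, NULL, 0) != 0 ||
 *	    len == 0) {
 *		return;
 *	}
 *	struct sk_stats_region *sreg = malloc(len);
 *	if (sreg != NULL &&
 *	    sysctlbyname("kern.skywalk.stats.region", sreg, &len, NULL, 0) == 0) {
 *		for (size_t n = 0; n < len / sizeof(*sreg); n++) {
 *			printf("%s: seginuse %llu\n", sreg[n].sreg_name,
 *			    (unsigned long long)sreg[n].sreg_seginuse);
 *		}
 *	}
 *	free(sreg);
 */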