/*
 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if CONFIG_MACF #include #endif /* MAC */ #if MULTIPATH #include #include #endif /* MULTIPATH */ #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1))) #if DEBUG || DEVELOPMENT #define DEBUG_KERNEL_ADDRPERM(_v) (_v) #else #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v) #endif /* TODO: this should be in a header file somewhere */ extern char *proc_name_address(void *p); static u_int32_t so_cache_hw; /* High water mark for socache */ static u_int32_t so_cache_timeouts; /* number of timeouts */ static u_int32_t so_cache_max_freed; /* max freed per timeout */ static u_int32_t cached_sock_count = 0; STAILQ_HEAD(, socket) so_cache_head; int max_cached_sock_count = MAX_CACHED_SOCKETS; static uint64_t so_cache_time; static int socketinit_done; static struct zone *so_cache_zone; static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache"); static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp); #include static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev); static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev); static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev); static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev); static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev); static void filt_sockdetach(struct knote *kn); static int filt_sockev(struct knote *kn, long hint); static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev); static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev); static int sooptcopyin_timeval(struct sockopt *, struct timeval *); static int sooptcopyout_timeval(struct sockopt *, const struct timeval *); SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = { .f_isfd = 1, .f_attach = filt_sorattach, .f_detach = filt_sordetach, .f_event = filt_soread, .f_touch = filt_sortouch, .f_process = filt_sorprocess, }; SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = { .f_isfd = 1, .f_attach = filt_sowattach, .f_detach = filt_sowdetach, .f_event = filt_sowrite, .f_touch = filt_sowtouch, .f_process = filt_sowprocess, }; SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = { .f_isfd = 1, .f_attach = filt_sockattach, .f_detach = filt_sockdetach, .f_event = filt_sockev, .f_touch = filt_socktouch, .f_process = filt_sockprocess, }; SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = { .f_isfd = 1, .f_attach = filt_sorattach, .f_detach = filt_sordetach, .f_event = filt_soread, .f_touch = filt_sortouch, .f_process = filt_sorprocess, }; SYSCTL_DECL(_kern_ipc); #define EVEN_MORE_LOCKING_DEBUG 0 int socket_debug = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, ""); #if (DEBUG 
|| DEVELOPMENT) #define DEFAULT_SOSEND_ASSERT_PANIC 1 #else #define DEFAULT_SOSEND_ASSERT_PANIC 0 #endif /* (DEBUG || DEVELOPMENT) */ int sosend_assert_panic = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic, CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, ""); static unsigned long sodefunct_calls = 0; SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED, &sodefunct_calls, ""); ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM); so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0) #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2) #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1) #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3) #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1) #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3) #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8)) #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3) #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8)) #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES) int somaxconn = SOMAXCONN; SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, ""); /* Should we get a maximum also ??? */ static int sosendmaxchain = 65536; static int sosendminchain = 16384; static int sorecvmincopy = 16384; SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, ""); /* * Set to enable jumbo clusters (if available) for large writes when * the socket is marked with SOF_MULTIPAGES; see below. */ int sosendjcl = 1; SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, ""); /* * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large * writes on the socket for all protocols on any network interfaces, * depending upon sosendjcl above. Be extra careful when setting this * to 1, because sending down packets that cross physical pages down to * broken drivers (those that falsely assume that the physical pages * are contiguous) might lead to system panics or silent data corruption. * When set to 0, the system will respect SOF_MULTIPAGES, which is set * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES * capable. Set this to 1 only for testing/debugging purposes. */ int sosendjcl_ignore_capab = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, ""); /* * Set this to ignore SOF1_IF_2KCL and use big clusters for large * writes on the socket for all protocols on any network interfaces. * Be extra careful when setting this to 1, because sending down packets with * clusters larger that 2 KB might lead to system panics or data corruption. * When set to 0, the system will respect SOF1_IF_2KCL, which is set * on the outgoing interface * Set this to 1 for testing/debugging purposes only. 
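 *
 * Like the other knobs above, this is exposed as a sysctl under kern.ipc.
 * A minimal userland sketch for inspecting one of these values
 * (illustrative only, not part of this file):
 *
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	int v = 0;
 *	size_t len = sizeof(v);
 *	if (sysctlbyname("kern.ipc.sosendbigcl_ignore_capab",
 *	    &v, &len, NULL, 0) == 0)
 *		printf("sosendbigcl_ignore_capab=%d\n", v);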
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;

#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
 */
#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

#define SO_MAX_MSG_X 1024

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * for details.
*/ __private_extern__ u_int32_t sotcdb = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, &sotcdb, 0, ""); void socketinit(void) { _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t)); VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t))); #ifdef __LP64__ _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints)); _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif)); _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr)); _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen)); _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr)); _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen)); #else _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints)); _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif)); _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr)); _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen)); _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr)); _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen)); #endif if (socketinit_done) { printf("socketinit: already called...\n"); return; } socketinit_done = 1; PE_parse_boot_argn("socket_debug", &socket_debug, sizeof(socket_debug)); PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic, sizeof(sosend_assert_panic)); STAILQ_INIT(&so_cache_head); so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4 + get_inpcb_str_size() + 4 + get_tcp_str_size()); so_cache_zone = zone_create("socache zone", so_cache_zone_element_size, ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM); bzero(&soextbkidlestat, sizeof(struct soextbkidlestat)); soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC; soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME; soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT; in_pcbinit(); } static void cached_sock_alloc(struct socket **so, zalloc_flags_t how) { caddr_t temp; uintptr_t offset; lck_mtx_lock(&so_cache_mtx); if (!STAILQ_EMPTY(&so_cache_head)) { VERIFY(cached_sock_count > 0); *so = STAILQ_FIRST(&so_cache_head); STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent); STAILQ_NEXT((*so), so_cache_ent) = NULL; cached_sock_count--; lck_mtx_unlock(&so_cache_mtx); temp = (*so)->so_saved_pcb; bzero((caddr_t)*so, sizeof(struct socket)); (*so)->so_saved_pcb = temp; } else { lck_mtx_unlock(&so_cache_mtx); *so = zalloc_flags(so_cache_zone, how | Z_ZERO); /* * Define offsets for extra structures into our * single block of memory. Align extra structures * on longword boundaries. 
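 *
 * The resulting layout of one so_cache_zone element is, roughly, the
 * following (illustrative sketch; the sizes come from the code below
 * and from so_cache_zone_element_size in socketinit()):
 *
 *	+----------------------+  <- *so
 *	| struct socket        |
 *	+----------------------+  <- ALIGN(...) == so_saved_pcb
 *	| inpcb storage        |     get_inpcb_str_size() bytes
 *	+----------------------+  <- ALIGN(...) == inp_saved_ppcb
 *	| tcpcb storage        |     get_tcp_str_size() bytes
 *	+----------------------+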
*/ offset = (uintptr_t)*so; offset += sizeof(struct socket); offset = ALIGN(offset); (*so)->so_saved_pcb = (caddr_t)offset; offset += get_inpcb_str_size(); offset = ALIGN(offset); ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb = (caddr_t)offset; } OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1); } static void cached_sock_free(struct socket *so) { lck_mtx_lock(&so_cache_mtx); so_cache_time = net_uptime(); if (++cached_sock_count > max_cached_sock_count) { --cached_sock_count; lck_mtx_unlock(&so_cache_mtx); zfree(so_cache_zone, so); } else { if (so_cache_hw < cached_sock_count) { so_cache_hw = cached_sock_count; } STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent); so->cache_timestamp = so_cache_time; lck_mtx_unlock(&so_cache_mtx); } } void so_update_last_owner_locked(struct socket *so, proc_t self) { if (so->last_pid != 0) { /* * last_pid and last_upid should remain zero for sockets * created using sock_socket. The check above achieves that */ if (self == PROC_NULL) { self = current_proc(); } if (so->last_upid != proc_uniqueid(self) || so->last_pid != proc_pid(self)) { so->last_upid = proc_uniqueid(self); so->last_pid = proc_pid(self); proc_getexecutableuuid(self, so->last_uuid, sizeof(so->last_uuid)); if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { (*so->so_proto->pr_update_last_owner)(so, self, NULL); } } proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid)); } } void so_update_policy(struct socket *so) { if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { (void) inp_update_policy(sotoinpcb(so)); } } #if NECP static void so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr) { if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { inp_update_necp_policy(sotoinpcb(so), override_local_addr, override_remote_addr, 0); } } #endif /* NECP */ boolean_t so_cache_timer(void) { struct socket *p; int n_freed = 0; boolean_t rc = FALSE; lck_mtx_lock(&so_cache_mtx); so_cache_timeouts++; so_cache_time = net_uptime(); while (!STAILQ_EMPTY(&so_cache_head)) { VERIFY(cached_sock_count > 0); p = STAILQ_FIRST(&so_cache_head); if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT) { break; } STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent); --cached_sock_count; zfree(so_cache_zone, p); if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) { so_cache_max_freed++; break; } } /* Schedule again if there is more to cleanup */ if (!STAILQ_EMPTY(&so_cache_head)) { rc = TRUE; } lck_mtx_unlock(&so_cache_mtx); return rc; } /* * Get a socket structure from our zone, and initialize it. * We don't implement `waitok' yet (see comments in uipc_domain.c). * Note that it would probably be better to allocate socket * and PCB at the same time, but I'm not convinced that all * the protocols can be easily modified to do this. */ struct socket * soalloc(int waitok, int dom, int type) { zalloc_flags_t how = waitok ? 
Z_WAITOK : Z_NOWAIT; struct socket *so; if ((dom == PF_INET) && (type == SOCK_STREAM)) { cached_sock_alloc(&so, how); } else { so = zalloc_flags(socket_zone, how | Z_ZERO); } if (so != NULL) { so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt); /* * Increment the socket allocation statistics */ INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total); } return so; } int socreate_internal(int dom, struct socket **aso, int type, int proto, struct proc *p, uint32_t flags, struct proc *ep) { struct protosw *prp; struct socket *so; int error = 0; pid_t rpid = -1; #if TCPDEBUG extern int tcpconsdebug; #endif VERIFY(aso != NULL); *aso = NULL; if (proto != 0) { prp = pffindproto(dom, proto, type); } else { prp = pffindtype(dom, type); } if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) { if (pffinddomain(dom) == NULL) { return EAFNOSUPPORT; } if (proto != 0) { if (pffindprotonotype(dom, proto) != NULL) { return EPROTOTYPE; } } return EPROTONOSUPPORT; } if (prp->pr_type != type) { return EPROTOTYPE; } so = soalloc(1, dom, type); if (so == NULL) { return ENOBUFS; } switch (dom) { case PF_LOCAL: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total); break; case PF_INET: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total); if (type == SOCK_STREAM) { INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total); } else { INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total); } break; case PF_ROUTE: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total); break; case PF_NDRV: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total); break; case PF_KEY: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total); break; case PF_INET6: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total); if (type == SOCK_STREAM) { INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total); } else { INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total); } break; case PF_SYSTEM: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total); break; case PF_MULTIPATH: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total); break; default: INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total); break; } if (flags & SOCF_MPTCP) { so->so_state |= SS_NBIO; } TAILQ_INIT(&so->so_incomp); TAILQ_INIT(&so->so_comp); so->so_type = (short)type; so->so_family = prp->pr_domain->dom_family; so->so_protocol = prp->pr_protocol; so->last_upid = proc_uniqueid(p); so->last_pid = proc_pid(p); proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid)); proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid)); so->so_rpid = -1; uuid_clear(so->so_ruuid); if (ep != PROC_NULL && ep != p) { so->e_upid = proc_uniqueid(ep); so->e_pid = proc_pid(ep); proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); so->so_flags |= SOF_DELEGATED; if (ep->p_responsible_pid != so->e_pid) { rpid = ep->p_responsible_pid; so->so_rpid = rpid; proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid)); } } if (rpid < 0 && p->p_responsible_pid != so->last_pid) { rpid = p->p_responsible_pid; so->so_rpid = rpid; proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid)); } so->so_cred = kauth_cred_proc_ref(p); if (!suser(kauth_cred_get(), NULL)) { so->so_state |= SS_PRIV; } so->so_persona_id = current_persona_get_id(); so->so_proto = prp; so->so_rcv.sb_flags |= SB_RECV; so->so_rcv.sb_so = so->so_snd.sb_so = so; so->next_lock_lr = 0; so->next_unlock_lr = 0; /* * Attachment will create the per pcb lock if necessary and * 
increase refcount for creation, make sure it's done before * socket is inserted in lists. */ so->so_usecount++; error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); if (error != 0) { /* * Warning: * If so_pcb is not zero, the socket will be leaked, * so protocol attachment handler must be coded carefuly */ if (so->so_pcb != NULL) { os_log_error(OS_LOG_DEFAULT, "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d", error, dom, proto, type); } /* * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket */ so->so_state |= SS_NOFDREF; so->so_flags |= SOF_PCBCLEARING; VERIFY(so->so_usecount > 0); so->so_usecount--; sofreelastref(so, 1); /* will deallocate the socket */ return error; } /* * Note: needs so_pcb to be set after pru_attach */ if (prp->pr_update_last_owner != NULL) { (*prp->pr_update_last_owner)(so, p, ep); } os_atomic_inc(&prp->pr_domain->dom_refs, relaxed); /* Attach socket filters for this protocol */ sflt_initsock(so); #if TCPDEBUG if (tcpconsdebug == 2) { so->so_options |= SO_DEBUG; } #endif so_set_default_traffic_class(so); /* * If this thread or task is marked to create backgrounded sockets, * mark the socket as background. */ if (!(flags & SOCF_MPTCP) && proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) { socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); so->so_background_thread = current_thread(); } switch (dom) { /* * Don't mark Unix domain or system * eligible for defunct by default. */ case PF_LOCAL: case PF_SYSTEM: so->so_flags |= SOF_NODEFUNCT; break; default: break; } /* * Entitlements can't be checked at socket creation time except if the * application requested a feature guarded by a privilege (c.f., socket * delegation). * The priv(9) and the Sandboxing APIs are designed with the idea that * a privilege check should only be triggered by a userland request. * A privilege check at socket creation time is time consuming and * could trigger many authorisation error messages from the security * APIs. */ *aso = so; return 0; } /* * Returns: 0 Success * EAFNOSUPPORT * EPROTOTYPE * EPROTONOSUPPORT * ENOBUFS * :ENOBUFS[AF_UNIX] * :ENOBUFS[TCP] * :ENOMEM[TCP] * :??? [other protocol families, IPSEC] */ int socreate(int dom, struct socket **aso, int type, int proto) { return socreate_internal(dom, aso, type, proto, current_proc(), 0, PROC_NULL); } int socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid) { int error = 0; struct proc *ep = PROC_NULL; if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) { error = ESRCH; goto done; } error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep); /* * It might not be wise to hold the proc reference when calling * socreate_internal since it calls soalloc with M_WAITOK */ done: if (ep != PROC_NULL) { proc_rele(ep); } return error; } /* * Returns: 0 Success * :EINVAL Invalid argument [COMMON_START] * :EAFNOSUPPORT Address family not supported * :EADDRNOTAVAIL Address not available. * :EINVAL Invalid argument * :EAFNOSUPPORT Address family not supported [notdef] * :EACCES Permission denied * :EADDRINUSE Address in use * :EAGAIN Resource unavailable, try again * :EPERM Operation not permitted * :??? * :??? * * Notes: It's not possible to fully enumerate the return codes above, * since socket filter authors and protocol family authors may * not choose to limit their error returns to those listed, even * though this may result in some software operating incorrectly. 
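 *
 *		Note also that a socket filter may return EJUSTRETURN from
 *		its bind handler; in that case pru_bind is not called and
 *		sobindlock() reports success to its caller (see the
 *		EJUSTRETURN check at the end of the function below).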
* * The error codes which are enumerated above are those known to * be returned by the tcp_usr_bind function supplied. */ int sobindlock(struct socket *so, struct sockaddr *nam, int dolock) { struct proc *p = current_proc(); int error = 0; if (dolock) { socket_lock(so, 1); } so_update_last_owner_locked(so, p); so_update_policy(so); #if NECP so_update_necp_policy(so, nam, NULL); #endif /* NECP */ /* * If this is a bind request on a socket that has been marked * as inactive, reject it now before we go any further. */ if (so->so_flags & SOF_DEFUNCT) { error = EINVAL; SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n", __func__, proc_pid(p), proc_best_name(p), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); goto out; } /* Socket filter */ error = sflt_bind(so, nam); if (error == 0) { error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); } out: if (dolock) { socket_unlock(so, 1); } if (error == EJUSTRETURN) { error = 0; } return error; } void sodealloc(struct socket *so) { kauth_cred_unref(&so->so_cred); /* Remove any filters */ sflt_termsock(so); so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt); if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) { cached_sock_free(so); } else { zfree(socket_zone, so); } } /* * Returns: 0 Success * EINVAL * EOPNOTSUPP * :EINVAL[AF_UNIX] * :EINVAL[TCP] * :EADDRNOTAVAIL[TCP] Address not available. * :EINVAL[TCP] Invalid argument * :EAFNOSUPPORT[TCP] Address family not supported [notdef] * :EACCES[TCP] Permission denied * :EADDRINUSE[TCP] Address in use * :EAGAIN[TCP] Resource unavailable, try again * :EPERM[TCP] Operation not permitted * :??? * * Notes: Other returns depend on the protocol family; all * returns depend on what the filter author causes * their filter to return. */ int solisten(struct socket *so, int backlog) { struct proc *p = current_proc(); int error = 0; socket_lock(so, 1); so_update_last_owner_locked(so, p); so_update_policy(so); if (TAILQ_EMPTY(&so->so_comp)) { so->so_options |= SO_ACCEPTCONN; } #if NECP so_update_necp_policy(so, NULL, NULL); #endif /* NECP */ if (so->so_proto == NULL) { error = EINVAL; so->so_options &= ~SO_ACCEPTCONN; goto out; } if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) { error = EOPNOTSUPP; so->so_options &= ~SO_ACCEPTCONN; goto out; } /* * If the listen request is made on a socket that is not fully * disconnected, or on a socket that has been marked as inactive, * reject the request now. */ if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) || (so->so_flags & SOF_DEFUNCT)) { error = EINVAL; if (so->so_flags & SOF_DEFUNCT) { SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] " "(%d)\n", __func__, proc_pid(p), proc_best_name(p), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); } so->so_options &= ~SO_ACCEPTCONN; goto out; } if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) { error = EPERM; so->so_options &= ~SO_ACCEPTCONN; goto out; } error = sflt_listen(so); if (error == 0) { error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); } if (error) { if (error == EJUSTRETURN) { error = 0; } so->so_options &= ~SO_ACCEPTCONN; goto out; } /* * POSIX: The implementation may have an upper limit on the length of * the listen queue-either global or per accepting socket. If backlog * exceeds this limit, the length of the listen queue is set to the * limit. * * If listen() is called with a backlog argument value that is less * than 0, the function behaves as if it had been called with a backlog * argument value of 0. 
* * A backlog argument of 0 may allow the socket to accept connections, * in which case the length of the listen queue may be set to an * implementation-defined minimum value. */ if (backlog <= 0 || backlog > somaxconn) { backlog = somaxconn; } so->so_qlimit = (short)backlog; out: socket_unlock(so, 1); return error; } /* * The "accept list lock" protects the fields related to the listener queues * because we can unlock a socket to respect the lock ordering between * the listener socket and its clients sockets. The lock ordering is first to * acquire the client socket before the listener socket. * * The accept list lock serializes access to the following fields: * - of the listener socket: * - so_comp * - so_incomp * - so_qlen * - so_inqlen * - of client sockets that are in so_comp or so_incomp: * - so_head * - so_list * * As one can see the accept list lock protects the consistent of the * linkage of the client sockets. * * Note that those fields may be read without holding the accept list lock * for a preflight provided the accept list lock is taken when committing * to take an action based on the result of the preflight. The preflight * saves the cost of doing the unlock/lock dance. */ void so_acquire_accept_list(struct socket *head, struct socket *so) { lck_mtx_t *mutex_held; if (head->so_proto->pr_getlock == NULL) { return; } mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK); LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) { head->so_flags1 |= SOF1_ACCEPT_LIST_HELD; return; } if (so != NULL) { socket_unlock(so, 0); } while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) { so_accept_list_waits += 1; msleep((caddr_t)&head->so_incomp, mutex_held, PSOCK | PCATCH, __func__, NULL); } head->so_flags1 |= SOF1_ACCEPT_LIST_HELD; if (so != NULL) { socket_unlock(head, 0); socket_lock(so, 0); socket_lock(head, 0); } } void so_release_accept_list(struct socket *head) { if (head->so_proto->pr_getlock != NULL) { lck_mtx_t *mutex_held; mutex_held = (*head->so_proto->pr_getlock)(head, 0); LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD; wakeup((caddr_t)&head->so_incomp); } } void sofreelastref(struct socket *so, int dealloc) { struct socket *head = so->so_head; /* Assume socket is locked */ #if FLOW_DIVERT if (so->so_flags & SOF_FLOW_DIVERT) { flow_divert_detach(so); } #endif /* FLOW_DIVERT */ #if CONTENT_FILTER if ((so->so_flags & SOF_CONTENT_FILTER) != 0) { cfil_sock_detach(so); } #endif /* CONTENT_FILTER */ if (NEED_DGRAM_FLOW_TRACKING(so)) { soflow_detach(so); } if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) { selthreadclear(&so->so_snd.sb_sel); selthreadclear(&so->so_rcv.sb_sel); so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL); so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL); so->so_event = sonullevent; return; } if (head != NULL) { /* * Need to lock the listener when the protocol has * per socket locks */ if (head->so_proto->pr_getlock != NULL) { socket_lock(head, 1); so_acquire_accept_list(head, so); } if (so->so_state & SS_INCOMP) { so->so_state &= ~SS_INCOMP; TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; head->so_qlen--; so->so_head = NULL; if (head->so_proto->pr_getlock != NULL) { so_release_accept_list(head); socket_unlock(head, 1); } } else if (so->so_state & SS_COMP) { if (head->so_proto->pr_getlock != NULL) { so_release_accept_list(head); socket_unlock(head, 1); } /* * We must not decommission a socket that's * on the accept(2) queue. 
If we do, then * accept(2) may hang after select(2) indicated * that the listening socket was ready. */ selthreadclear(&so->so_snd.sb_sel); selthreadclear(&so->so_rcv.sb_sel); so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL); so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL); so->so_event = sonullevent; return; } else { if (head->so_proto->pr_getlock != NULL) { so_release_accept_list(head); socket_unlock(head, 1); } printf("sofree: not queued\n"); } } sowflush(so); sorflush(so); /* 3932268: disable upcall */ so->so_rcv.sb_flags &= ~SB_UPCALL; so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT); so->so_event = sonullevent; if (dealloc) { sodealloc(so); } } void soclose_wait_locked(struct socket *so) { lck_mtx_t *mutex_held; if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); /* * Double check here and return if there's no outstanding upcall; * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set. */ if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) { return; } so->so_rcv.sb_flags &= ~SB_UPCALL; so->so_snd.sb_flags &= ~SB_UPCALL; so->so_flags |= SOF_CLOSEWAIT; (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1), "soclose_wait_locked", NULL); LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); so->so_flags &= ~SOF_CLOSEWAIT; } /* * Close a socket on last file table reference removal. * Initiate disconnect if connected. * Free socket when disconnect complete. */ int soclose_locked(struct socket *so) { int error = 0; struct timespec ts; if (so->so_usecount == 0) { panic("soclose: so=%p refcount=0", so); /* NOTREACHED */ } sflt_notify(so, sock_evt_closing, NULL); if (so->so_upcallusecount) { soclose_wait_locked(so); } #if CONTENT_FILTER /* * We have to wait until the content filters are done */ if ((so->so_flags & SOF_CONTENT_FILTER) != 0) { cfil_sock_close_wait(so); cfil_sock_is_closed(so); cfil_sock_detach(so); } #endif /* CONTENT_FILTER */ if (NEED_DGRAM_FLOW_TRACKING(so)) { soflow_detach(so); } if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) { soresume(current_proc(), so, 1); so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED; } if ((so->so_options & SO_ACCEPTCONN)) { struct socket *sp, *sonext; int persocklock = 0; int incomp_overflow_only; /* * We do not want new connection to be added * to the connection queues */ so->so_options &= ~SO_ACCEPTCONN; /* * We can drop the lock on the listener once * we've acquired the incoming list */ if (so->so_proto->pr_getlock != NULL) { persocklock = 1; so_acquire_accept_list(so, NULL); socket_unlock(so, 0); } again: incomp_overflow_only = 1; TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) { /* * Radar 5350314 * skip sockets thrown away by tcpdropdropblreq * they will get cleanup by the garbage collection. * otherwise, remove the incomp socket from the queue * and let soabort trigger the appropriate cleanup. 
*/ if (sp->so_flags & SOF_OVERFLOW) { continue; } if (persocklock != 0) { socket_lock(sp, 1); } /* * Radar 27945981 * The extra reference for the list insure the * validity of the socket pointer when we perform the * unlock of the head above */ if (sp->so_state & SS_INCOMP) { sp->so_state &= ~SS_INCOMP; sp->so_head = NULL; TAILQ_REMOVE(&so->so_incomp, sp, so_list); so->so_incqlen--; so->so_qlen--; (void) soabort(sp); } else { panic("%s sp %p in so_incomp but !SS_INCOMP", __func__, sp); } if (persocklock != 0) { socket_unlock(sp, 1); } } TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) { /* Dequeue from so_comp since sofree() won't do it */ if (persocklock != 0) { socket_lock(sp, 1); } if (sp->so_state & SS_COMP) { sp->so_state &= ~SS_COMP; sp->so_head = NULL; TAILQ_REMOVE(&so->so_comp, sp, so_list); so->so_qlen--; (void) soabort(sp); } else { panic("%s sp %p in so_comp but !SS_COMP", __func__, sp); } if (persocklock) { socket_unlock(sp, 1); } } if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) { #if (DEBUG | DEVELOPMENT) panic("%s head %p so_comp not empty", __func__, so); #endif /* (DEVELOPMENT || DEBUG) */ goto again; } if (!TAILQ_EMPTY(&so->so_comp)) { #if (DEBUG | DEVELOPMENT) panic("%s head %p so_comp not empty", __func__, so); #endif /* (DEVELOPMENT || DEBUG) */ goto again; } if (persocklock) { socket_lock(so, 0); so_release_accept_list(so); } } if (so->so_pcb == NULL) { /* 3915887: mark the socket as ready for dealloc */ so->so_flags |= SOF_PCBCLEARING; goto discard; } if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnectlocked(so); if (error) { goto drop; } } if (so->so_options & SO_LINGER) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) { goto drop; } while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) { lck_mtx_t *mutex_held; if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } ts.tv_sec = (so->so_linger / 100); ts.tv_nsec = (so->so_linger % 100) * NSEC_PER_USEC * 1000 * 10; error = msleep((caddr_t)&so->so_timeo, mutex_held, PSOCK | PCATCH, "soclose", &ts); if (error) { /* * It's OK when the time fires, * don't report an error */ if (error == EWOULDBLOCK) { error = 0; } break; } } } } drop: if (so->so_usecount == 0) { panic("soclose: usecount is zero so=%p", so); /* NOTREACHED */ } if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) { int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) { error = error2; } } if (so->so_usecount <= 0) { panic("soclose: usecount is zero so=%p", so); /* NOTREACHED */ } discard: if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF)) { panic("soclose: NOFDREF"); /* NOTREACHED */ } so->so_state |= SS_NOFDREF; if ((so->so_flags & SOF_KNOTE) != 0) { KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED); } os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed); VERIFY(so->so_usecount > 0); so->so_usecount--; sofree(so); return error; } int soclose(struct socket *so) { int error = 0; socket_lock(so, 1); if (so->so_retaincnt == 0) { error = soclose_locked(so); } else { /* * if the FD is going away, but socket is * retained in kernel remove its reference */ so->so_usecount--; if (so->so_usecount < 2) { panic("soclose: retaincnt non null and so=%p " "usecount=%d\n", so, so->so_usecount); } } socket_unlock(so, 1); return error; } /* * Must be called at splnet... 
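 * (splnet is the historical BSD interrupt-priority convention; here the
 * caller is instead expected to hold the socket lock, as the comment and
 * the MORE_LOCKING_DEBUG assertion below indicate.)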
*/ /* Should already be locked */ int soabort(struct socket *so) { int error; #ifdef MORE_LOCKING_DEBUG lck_mtx_t *mutex_held; if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, 0); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); #endif if ((so->so_flags & SOF_ABORTED) == 0) { so->so_flags |= SOF_ABORTED; error = (*so->so_proto->pr_usrreqs->pru_abort)(so); if (error) { sofree(so); return error; } } return 0; } int soacceptlock(struct socket *so, struct sockaddr **nam, int dolock) { int error; if (dolock) { socket_lock(so, 1); } so_update_last_owner_locked(so, PROC_NULL); so_update_policy(so); #if NECP so_update_necp_policy(so, NULL, NULL); #endif /* NECP */ if ((so->so_state & SS_NOFDREF) == 0) { panic("soaccept: !NOFDREF"); } so->so_state &= ~SS_NOFDREF; error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); if (dolock) { socket_unlock(so, 1); } return error; } int soaccept(struct socket *so, struct sockaddr **nam) { return soacceptlock(so, nam, 1); } int soacceptfilter(struct socket *so, struct socket *head) { struct sockaddr *local = NULL, *remote = NULL; int error = 0; /* * Hold the lock even if this socket has not been made visible * to the filter(s). For sockets with global locks, this protects * against the head or peer going away */ socket_lock(so, 1); if (sogetaddr_locked(so, &remote, 1) != 0 || sogetaddr_locked(so, &local, 0) != 0) { so->so_state &= ~SS_NOFDREF; socket_unlock(so, 1); soclose(so); /* Out of resources; try it again next time */ error = ECONNABORTED; goto done; } error = sflt_accept(head, so, local, remote); /* * If we get EJUSTRETURN from one of the filters, mark this socket * as inactive and return it anyway. This newly accepted socket * will be disconnected later before we hand it off to the caller. */ if (error == EJUSTRETURN) { error = 0; (void) sosetdefunct(current_proc(), so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE); } if (error != 0) { /* * This may seem like a duplication to the above error * handling part when we return ECONNABORTED, except * the following is done while holding the lock since * the socket has been exposed to the filter(s) earlier. */ so->so_state &= ~SS_NOFDREF; socket_unlock(so, 1); soclose(so); /* Propagate socket filter's error code to the caller */ } else { socket_unlock(so, 1); } done: /* Callee checks for NULL pointer */ sock_freeaddr(remote); sock_freeaddr(local); return error; } /* * Returns: 0 Success * EOPNOTSUPP Operation not supported on socket * EISCONN Socket is connected * :EADDRNOTAVAIL Address not available. * :EINVAL Invalid argument * :EAFNOSUPPORT Address family not supported [notdef] * :EACCES Permission denied * :EADDRINUSE Address in use * :EAGAIN Resource unavailable, try again * :EPERM Operation not permitted * :??? [anything a filter writer might set] */ int soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) { int error; struct proc *p = current_proc(); tracker_metadata_t metadata = { }; if (dolock) { socket_lock(so, 1); } so_update_last_owner_locked(so, p); so_update_policy(so); /* * If this is a listening socket or if this is a previously-accepted * socket that has been marked as inactive, reject the connect request. 
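 *
 * For example (userland sketch, illustrative only): calling connect(2)
 * on a socket that is already listening is refused here with
 * EOPNOTSUPP:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *
 *	int s = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin;
 *	memset(&sin, 0, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	listen(s, 5);
 *	connect(s, (struct sockaddr *)&sin, sizeof(sin));
 *		returns -1 with errno set to EOPNOTSUPP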
*/ if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { error = EOPNOTSUPP; if (so->so_flags & SOF_DEFUNCT) { SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] " "(%d)\n", __func__, proc_pid(p), proc_best_name(p), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); } if (dolock) { socket_unlock(so, 1); } return error; } if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) { if (dolock) { socket_unlock(so, 1); } return EPERM; } /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. * This allows user to disconnect by connecting to, e.g., * a null address. */ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnectlocked(so)))) { error = EISCONN; } else { /* * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker. */ if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) { if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &metadata) == 0) { if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) { so->so_flags1 |= SOF1_KNOWN_TRACKER; } if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) { so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; } if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) { printf("connect() - failed necp_set_socket_domain_attributes"); } } } #if NECP /* Update NECP evaluation after setting any domain via the tracker checks */ so_update_necp_policy(so, NULL, nam); #endif /* NECP */ /* * Run connect filter before calling protocol: * - non-blocking connect returns before completion; */ error = sflt_connectout(so, nam); if (error != 0) { if (error == EJUSTRETURN) { error = 0; } } else { error = (*so->so_proto->pr_usrreqs->pru_connect) (so, nam, p); if (error != 0) { so->so_state &= ~SS_ISCONNECTING; } } } if (dolock) { socket_unlock(so, 1); } return error; } int soconnect(struct socket *so, struct sockaddr *nam) { return soconnectlock(so, nam, 1); } /* * Returns: 0 Success * :EINVAL[AF_UNIX] * :EPROTOTYPE[AF_UNIX] * :??? [other protocol families] * * Notes: is not supported by [TCP]. */ int soconnect2(struct socket *so1, struct socket *so2) { int error; socket_lock(so1, 1); if (so2->so_proto->pr_lock) { socket_lock(so2, 1); } error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); socket_unlock(so1, 1); if (so2->so_proto->pr_lock) { socket_unlock(so2, 1); } return error; } int soconnectxlocked(struct socket *so, struct sockaddr *src, struct sockaddr *dst, struct proc *p, uint32_t ifscope, sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg, uint32_t arglen, uio_t auio, user_ssize_t *bytes_written) { int error; tracker_metadata_t metadata = { }; so_update_last_owner_locked(so, p); so_update_policy(so); /* * If this is a listening socket or if this is a previously-accepted * socket that has been marked as inactive, reject the connect request. 
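 *
 * This is the backend of connectx(2), which can combine connection
 * setup with data (TCP Fast Open style).  A userland sketch of that
 * usage, assuming a TCP socket s and a filled-in sockaddr_in sin
 * (illustrative only; see the connectx(2) man page for the exact
 * interface):
 *
 *	sa_endpoints_t sae;
 *	sae_connid_t cid;
 *	memset(&sae, 0, sizeof(sae));
 *	sae.sae_dstaddr = (struct sockaddr *)&sin;
 *	sae.sae_dstaddrlen = sizeof(sin);
 *	connectx(s, &sae, SAE_ASSOCID_ANY,
 *	    CONNECT_RESUME_ON_READ_WRITE | CONNECT_DATA_IDEMPOTENT,
 *	    NULL, 0, NULL, &cid);
 *	send(s, buf, buflen, 0);
 *		the connection (and, where possible, the data) goes out here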
*/ if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { error = EOPNOTSUPP; if (so->so_flags & SOF_DEFUNCT) { SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] " "(%d)\n", __func__, proc_pid(p), proc_best_name(p), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); } return error; } if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) { return EPERM; } /* * If protocol is connection-based, can only connect once * unless PR_MULTICONN is set. Otherwise, if connected, * try to disconnect first. This allows user to disconnect * by connecting to, e.g., a null address. */ if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) && !(so->so_proto->pr_flags & PR_MULTICONN) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnectlocked(so)) != 0)) { error = EISCONN; } else { /* * For TCP, check if destination address is a tracker and mark the socket accordingly * (only if it hasn't been marked yet). */ if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) && !(so->so_flags1 & SOF1_KNOWN_TRACKER)) { if (tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &metadata) == 0) { if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) { so->so_flags1 |= SOF1_KNOWN_TRACKER; } if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) { so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; } if (necp_set_socket_domain_attributes(so, metadata.domain, metadata.domain_owner)) { printf("connectx() - failed necp_set_socket_domain_attributes"); } } } if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) && (flags & CONNECT_DATA_IDEMPOTENT)) { so->so_flags1 |= SOF1_DATA_IDEMPOTENT; if (flags & CONNECT_DATA_AUTHENTICATED) { so->so_flags1 |= SOF1_DATA_AUTHENTICATED; } } /* * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data. * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error) * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data * Case 3 allows user to combine write with connect even if they have * no use for TFO (such as regular TCP, and UDP). * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case) */ if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) && ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) { so->so_flags1 |= SOF1_PRECONNECT_DATA; } /* * If a user sets data idempotent and does not pass an uio, or * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset * SOF1_DATA_IDEMPOTENT. */ if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) && (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) { /* We should return EINVAL instead perhaps. */ so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT; } /* * Run connect filter before calling protocol: * - non-blocking connect returns before completion; */ error = sflt_connectout(so, dst); if (error != 0) { /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. 
*/ so->so_flags1 &= ~SOF1_PRECONNECT_DATA; if (error == EJUSTRETURN) { error = 0; } } else { error = (*so->so_proto->pr_usrreqs->pru_connectx) (so, src, dst, p, ifscope, aid, pcid, flags, arg, arglen, auio, bytes_written); if (error != 0) { so->so_state &= ~SS_ISCONNECTING; if (error != EINPROGRESS) { so->so_flags1 &= ~SOF1_PRECONNECT_DATA; } } } } return error; } int sodisconnectlocked(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto bad; } if (so->so_state & SS_ISDISCONNECTING) { error = EALREADY; goto bad; } error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); if (error == 0) { sflt_notify(so, sock_evt_disconnected, NULL); } bad: return error; } /* Locking version */ int sodisconnect(struct socket *so) { int error; socket_lock(so, 1); error = sodisconnectlocked(so); socket_unlock(so, 1); return error; } int sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid) { int error; /* * Call the protocol disconnectx handler; let it handle all * matters related to the connection state of this session. */ error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid); if (error == 0) { /* * The event applies only for the session, not for * the disconnection of individual subflows. */ if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) { sflt_notify(so, sock_evt_disconnected, NULL); } } return error; } int sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid) { int error; socket_lock(so, 1); error = sodisconnectxlocked(so, aid, cid); socket_unlock(so, 1); return error; } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) /* * sosendcheck will lock the socket buffer if it isn't locked and * verify that there is space for the data being inserted. * * Returns: 0 Success * EPIPE * sblock:EWOULDBLOCK * sblock:EINTR * sbwait:EBADF * sbwait:EINTR * [so_error]:??? */ int sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid, int32_t clen, int32_t atomic, int flags, int *sblocked) { int error = 0; int32_t space; int assumelock = 0; restart: if (*sblocked == 0) { if ((so->so_snd.sb_flags & SB_LOCK) != 0 && so->so_send_filt_thread != 0 && so->so_send_filt_thread == current_thread()) { /* * We're being called recursively from a filter, * allow this to continue. Radar 4150520. * Don't set sblocked because we don't want * to perform an unlock later. */ assumelock = 1; } else { error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) { if (so->so_flags & SOF_DEFUNCT) { goto defunct; } return error; } *sblocked = 1; } } /* * If a send attempt is made on a socket that has been marked * as inactive (disconnected), reject the request. 
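 *
 * The remaining checks below then fail with: EPIPE for half-closed
 * sends (SS_CANTSENDMORE), any pending so_error (which is returned and
 * cleared), ENOTCONN or EDESTADDRREQ for unconnected sockets, EMSGSIZE
 * when an atomic send exceeds the high-water mark, and EWOULDBLOCK
 * when the buffer is full on a non-blocking socket; otherwise the
 * thread blocks in sbwait() and restarts the checks.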
*/ if (so->so_flags & SOF_DEFUNCT) { defunct: error = EPIPE; SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n", __func__, proc_selfpid(), proc_best_name(current_proc()), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); return error; } if (so->so_state & SS_CANTSENDMORE) { #if CONTENT_FILTER /* * Can re-inject data of half closed connections */ if ((so->so_state & SS_ISDISCONNECTED) == 0 && so->so_snd.sb_cfil_thread == current_thread() && cfil_sock_data_pending(&so->so_snd) != 0) { CFIL_LOG(LOG_INFO, "so %llx ignore SS_CANTSENDMORE", (uint64_t)DEBUG_KERNEL_ADDRPERM(so)); } else #endif /* CONTENT_FILTER */ return EPIPE; } if (so->so_error) { error = so->so_error; so->so_error = 0; return error; } if ((so->so_state & SS_ISCONNECTED) == 0) { if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { if (((so->so_state & SS_ISCONFIRMING) == 0) && (resid != 0 || clen == 0) && !(so->so_flags1 & SOF1_PRECONNECT_DATA)) { return ENOTCONN; } } else if (addr == 0) { return (so->so_proto->pr_flags & PR_CONNREQUIRED) ? ENOTCONN : EDESTADDRREQ; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) { space += 1024; } if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { return EMSGSIZE; } if ((space < resid + clen && (atomic || (space < (int32_t)so->so_snd.sb_lowat) || space < clen)) || (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) { /* * don't block the connectx call when there's more data * than can be copied. */ if (so->so_flags1 & SOF1_PRECONNECT_DATA) { if (space == 0) { return EWOULDBLOCK; } if (space < (int32_t)so->so_snd.sb_lowat) { return 0; } } if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) || assumelock) { return EWOULDBLOCK; } sbunlock(&so->so_snd, TRUE); /* keep socket locked */ *sblocked = 0; error = sbwait(&so->so_snd); if (error) { if (so->so_flags & SOF_DEFUNCT) { goto defunct; } return error; } goto restart; } return 0; } /* * Send on a socket. * If send must go all at once and message is larger than * send buffering, then hard error. * Lock against other senders. * If must go all at once and not enough room now, then * inform user that this would block and do nothing. * Otherwise, if nonblocking, send as much as possible. * The data to be sent is described by "uio" if nonzero, * otherwise by the mbuf chain "top" (which must be null * if uio is not). Data provided in mbuf chain must be small * enough to send all at once. * * Returns nonzero on error, timeout or signal; callers * must check for short counts if EINTR/ERESTART are returned. * Data and control buffers are freed on return. * * Returns: 0 Success * EOPNOTSUPP * EINVAL * ENOBUFS * uiomove:EFAULT * sosendcheck:EPIPE * sosendcheck:EWOULDBLOCK * sosendcheck:EINTR * sosendcheck:EBADF * sosendcheck:EINTR * sosendcheck:??? [value from so_error] * :ECONNRESET[TCP] * :EINVAL[TCP] * :ENOBUFS[TCP] * :EADDRINUSE[TCP] * :EADDRNOTAVAIL[TCP] * :EAFNOSUPPORT[TCP] * :EACCES[TCP] * :EAGAIN[TCP] * :EPERM[TCP] * :EMSGSIZE[TCP] * :EHOSTUNREACH[TCP] * :ENETUNREACH[TCP] * :ENETDOWN[TCP] * :ENOMEM[TCP] * :ENOBUFS[TCP] * :???[TCP] [ignorable: mostly IPSEC/firewall/DLIL] * :EINVAL[AF_UNIX] * :EOPNOTSUPP[AF_UNIX] * :EPIPE[AF_UNIX] * :ENOTCONN[AF_UNIX] * :EISCONN[AF_UNIX] * :???[AF_UNIX] [whatever a filter author chooses] * :??? [whatever a filter author chooses] * * Notes: Other returns depend on the protocol family; all * returns depend on what the filter author causes * their filter to return. 
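 *
 * An analogous userland pattern for the "short counts" caveat above,
 * retrying after EINTR (illustrative sketch only, not part of this
 * file):
 *
 *	const char *p = buf;
 *	size_t left = buflen;
 *	while (left > 0) {
 *		ssize_t n = send(s, p, left, 0);
 *		if (n < 0) {
 *			if (errno == EINTR)
 *				continue;
 *			break;
 *		}
 *		p += (size_t)n;
 *		left -= (size_t)n;
 *	}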
*/ int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags) { struct mbuf **mp; struct mbuf *m, *freelist = NULL; struct soflow_hash_entry *dgram_flow_entry = NULL; user_ssize_t space, len, resid, orig_resid; int clen = 0, error, dontroute, sendflags; int atomic = sosendallatonce(so) || top; int sblocked = 0; struct proc *p = current_proc(); uint16_t headroom = 0; ssize_t mlen; boolean_t en_tracing = FALSE; if (uio != NULL) { resid = uio_resid(uio); } else { resid = top->m_pkthdr.len; } orig_resid = resid; KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid, so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); socket_lock(so, 1); if (NEED_DGRAM_FLOW_TRACKING(so)) { dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0); } /* * trace if tracing & network (vs. unix) sockets & and * non-loopback */ if (ENTR_SHOULDTRACE && (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) { struct inpcb *inp = sotoinpcb(so); if (inp->inp_last_outifp != NULL && !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { en_tracing = TRUE; KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START, VM_KERNEL_ADDRPERM(so), ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0), (int64_t)resid); } } /* * Re-injection should not affect process accounting */ if ((flags & MSG_SKIPCFIL) == 0) { so_update_last_owner_locked(so, p); so_update_policy(so); #if NECP so_update_necp_policy(so, NULL, addr); #endif /* NECP */ } if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) { error = EOPNOTSUPP; goto out_locked; } /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 * if we over-committed, and we must use a signed comparison * of space and resid. On the other hand, a negative resid * causes us to loop sending 0-length segments to the protocol. * * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets. * * Note: We limit resid to be a positive int value as we use * imin() to set bytes_to_copy -- radr://14558484 */ if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out_locked; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd); if (control != NULL) { clen = control->m_len; } if (soreserveheadroom != 0) { headroom = so->so_pktheadroom; } do { error = sosendcheck(so, addr, resid, clen, atomic, flags, &sblocked); if (error) { goto out_locked; } mp = ⊤ space = sbspace(&so->so_snd) - clen; space += ((flags & MSG_OOB) ? 1024 : 0); do { if (uio == NULL) { /* * Data is prepackaged in "top". */ resid = 0; if (flags & MSG_EOR) { top->m_flags |= M_EOR; } } else { int chainlength; int bytes_to_copy; boolean_t jumbocl; boolean_t bigcl; int bytes_to_alloc; bytes_to_copy = imin((int)resid, (int)space); bytes_to_alloc = bytes_to_copy; if (top == NULL) { bytes_to_alloc += headroom; } if (sosendminchain > 0) { chainlength = 0; } else { chainlength = sosendmaxchain; } /* * Use big 4 KB cluster when the outgoing interface * does not prefer 2 KB clusters */ bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab; /* * Attempt to use larger than system page-size * clusters for large writes only if there is * a jumbo cluster pool and if the socket is * marked accordingly. 
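 *
 * In other words, the allocation strategy below degrades in steps:
 * 16 KB jumbo clusters (when the interface and sysctls permit and the
 * write is large enough), then 4 KB big clusters, then 2 KB clusters,
 * and finally a single mbuf.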
*/ jumbocl = sosendjcl && njcl > 0 && ((so->so_flags & SOF_MULTIPAGES) || sosendjcl_ignore_capab) && bigcl; socket_unlock(so, 0); do { int num_needed; int hdrs_needed = (top == NULL) ? 1 : 0; /* * try to maintain a local cache of mbuf * clusters needed to complete this * write the list is further limited to * the number that are currently needed * to fill the socket this mechanism * allows a large number of mbufs/ * clusters to be grabbed under a single * mbuf lock... if we can't get any * clusters, than fall back to trying * for mbufs if we fail early (or * miscalcluate the number needed) make * sure to release any clusters we * haven't yet consumed. */ if (freelist == NULL && bytes_to_alloc > MBIGCLBYTES && jumbocl) { num_needed = bytes_to_alloc / M16KCLBYTES; if ((bytes_to_alloc - (num_needed * M16KCLBYTES)) >= MINCLSIZE) { num_needed++; } freelist = m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, M16KCLBYTES); /* * Fall back to 4K cluster size * if allocation failed */ } if (freelist == NULL && bytes_to_alloc > MCLBYTES && bigcl) { num_needed = bytes_to_alloc / MBIGCLBYTES; if ((bytes_to_alloc - (num_needed * MBIGCLBYTES)) >= MINCLSIZE) { num_needed++; } freelist = m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, MBIGCLBYTES); /* * Fall back to cluster size * if allocation failed */ } /* * Allocate a cluster as we want to * avoid to split the data in more * that one segment and using MINCLSIZE * would lead us to allocate two mbufs */ if (soreserveheadroom != 0 && freelist == NULL && ((top == NULL && bytes_to_alloc > _MHLEN) || bytes_to_alloc > _MLEN)) { num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) / MCLBYTES; freelist = m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, MCLBYTES); /* * Fall back to a single mbuf * if allocation failed */ } else if (freelist == NULL && bytes_to_alloc > MINCLSIZE) { num_needed = bytes_to_alloc / MCLBYTES; if ((bytes_to_alloc - (num_needed * MCLBYTES)) >= MINCLSIZE) { num_needed++; } freelist = m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, MCLBYTES); /* * Fall back to a single mbuf * if allocation failed */ } /* * For datagram protocols, leave * headroom for protocol headers * in the first cluster of the chain */ if (freelist != NULL && atomic && top == NULL && headroom > 0) { freelist->m_data += headroom; } /* * Fall back to regular mbufs without * reserving the socket headroom */ if (freelist == NULL) { if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) { if (top == NULL) { MGETHDR(freelist, M_WAIT, MT_DATA); } else { MGET(freelist, M_WAIT, MT_DATA); } } if (freelist == NULL) { error = ENOBUFS; socket_lock(so, 0); goto out_locked; } /* * For datagram protocols, * leave room for protocol * headers in first mbuf. 
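 *
 * Why the headroom matters, sketched (not code from this file; "hdrlen"
 * is hypothetical): with the payload placed at the tail of the header
 * mbuf by MH_ALIGN() below, a datagram protocol can later prepend its
 * header in place instead of allocating a separate mbuf:
 *
 *	if (M_LEADINGSPACE(m) >= hdrlen) {
 *		m->m_data -= hdrlen;	// grow the mbuf backwards
 *		m->m_len += hdrlen;
 *		// build the protocol header at mtod(m, caddr_t)
 *	}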
*/ if (atomic && top == NULL && bytes_to_copy > 0 && bytes_to_copy < MHLEN) { MH_ALIGN(freelist, bytes_to_copy); } } m = freelist; freelist = m->m_next; m->m_next = NULL; if ((m->m_flags & M_EXT)) { mlen = m->m_ext.ext_size - M_LEADINGSPACE(m); } else if ((m->m_flags & M_PKTHDR)) { mlen = MHLEN - M_LEADINGSPACE(m); m_add_crumb(m, PKT_CRUMB_SOSEND); } else { mlen = MLEN - M_LEADINGSPACE(m); } len = imin((int)mlen, bytes_to_copy); chainlength += len; space -= len; error = uiomove(mtod(m, caddr_t), (int)len, uio); resid = uio_resid(uio); m->m_len = (int32_t)len; *mp = m; top->m_pkthdr.len += len; if (error) { break; } mp = &m->m_next; if (resid <= 0) { if (flags & MSG_EOR) { top->m_flags |= M_EOR; } break; } bytes_to_copy = imin((int)resid, (int)space); } while (space > 0 && (chainlength < sosendmaxchain || atomic || resid < MINCLSIZE)); socket_lock(so, 0); if (error) { goto out_locked; } } if (dontroute) { so->so_options |= SO_DONTROUTE; } /* * Compute flags here, for pru_send and NKEs * * If the user set MSG_EOF, the protocol * understands this flag and nothing left to * send then use PRU_SEND_EOF instead of PRU_SEND. */ sendflags = (flags & MSG_OOB) ? PRUS_OOB : ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; if ((flags & MSG_SKIPCFIL) == 0) { /* * Socket filter processing */ error = sflt_data_out(so, addr, &top, &control, (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0); if (error) { if (error == EJUSTRETURN) { error = 0; goto packet_consumed; } goto out_locked; } #if CONTENT_FILTER /* * Content filter processing */ error = cfil_sock_data_out(so, addr, top, control, sendflags, dgram_flow_entry); if (error) { if (error == EJUSTRETURN) { error = 0; goto packet_consumed; } goto out_locked; } #endif /* CONTENT_FILTER */ } error = (*so->so_proto->pr_usrreqs->pru_send) (so, sendflags, top, addr, control, p); packet_consumed: if (dontroute) { so->so_options &= ~SO_DONTROUTE; } clen = 0; control = NULL; top = NULL; mp = ⊤ if (error) { goto out_locked; } } while (resid && space > 0); } while (resid); out_locked: if (resid > orig_resid) { char pname[MAXCOMLEN] = {}; pid_t current_pid = proc_pid(current_proc()); proc_name(current_pid, pname, sizeof(pname)); if (sosend_assert_panic != 0) { panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d", so, resid, orig_resid, pname, current_pid); } else { os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d", so->so_gencnt, resid, orig_resid, pname, current_pid); } } if (sblocked) { sbunlock(&so->so_snd, FALSE); /* will unlock socket */ } else { socket_unlock(so, 1); } if (top != NULL) { m_freem(top); } if (control != NULL) { m_freem(control); } if (freelist != NULL) { m_freem_list(freelist); } if (dgram_flow_entry != NULL) { soflow_free_flow(dgram_flow_entry); } soclearfastopen(so); if (en_tracing) { /* resid passed here is the bytes left in uio */ KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END, VM_KERNEL_ADDRPERM(so), ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0), (int64_t)(orig_resid - resid)); } KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc, space, error); return error; } int sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags) { struct mbuf *m0 = NULL, *control_end = NULL; socket_lock_assert_owned(so); /* * top must points to mbuf chain to be sent. 
* If control is not NULL, top must be packet header */ VERIFY(top != NULL && (control == NULL || top->m_flags & M_PKTHDR)); /* * If control is not passed in, see if we can get it * from top. */ if (control == NULL && (top->m_flags & M_PKTHDR) == 0) { // Locate start of control if present and start of data for (m0 = top; m0 != NULL; m0 = m0->m_next) { if (m0->m_flags & M_PKTHDR) { top = m0; break; } else if (m0->m_type == MT_CONTROL) { if (control == NULL) { // Found start of control control = m0; } if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) { // Found end of control control_end = m0; } } } if (control_end != NULL) { control_end->m_next = NULL; } } int error = (*so->so_proto->pr_usrreqs->pru_send) (so, sendflags, top, addr, control, current_proc()); return error; } static struct mbuf * mbuf_detach_control_from_list(struct mbuf **mp) { struct mbuf *control = NULL; struct mbuf *m = *mp; if (m->m_type == MT_CONTROL) { struct mbuf *control_end; struct mbuf *n; n = control_end = control = m; /* * Break the chain per mbuf type */ while (n != NULL && n->m_type == MT_CONTROL) { control_end = n; n = n->m_next; } control_end->m_next = NULL; *mp = n; } VERIFY(*mp != NULL); return control; } /* * Supported only connected sockets (no address) without ancillary data * (control mbuf) for atomic protocols */ int sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags) { struct mbuf *m; struct soflow_hash_entry *dgram_flow_entry = NULL; int error, dontroute; int atomic = sosendallatonce(so); int sblocked = 0; struct proc *p = current_proc(); struct mbuf *top = pktlist; bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL); KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt, so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); if (so->so_type != SOCK_DGRAM) { error = EINVAL; os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d", error); goto out; } if (atomic == 0) { error = EINVAL; os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d", error); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d", error); goto out; } if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) { error = EINVAL; os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d", flags, error); goto out; } socket_lock(so, 1); so_update_last_owner_locked(so, p); so_update_policy(so); if (NEED_DGRAM_FLOW_TRACKING(so)) { dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0); } #if NECP so_update_necp_policy(so, NULL, NULL); #endif /* NECP */ dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (dontroute) { so->so_options |= SO_DONTROUTE; } OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd); error = sosendcheck(so, NULL, 0, 0, atomic, flags, &sblocked); if (error) { os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d", error); goto release; } if (!skip_filt) { struct mbuf **prevnextp = NULL; for (m = top; m != NULL; m = m->m_nextpkt) { struct mbuf *control = NULL; struct mbuf *last_control = NULL; struct mbuf *nextpkt; /* * Remove packet from the list of packets */ nextpkt = m->m_nextpkt; if (prevnextp != NULL) { *prevnextp = nextpkt; } else { top = nextpkt; } m->m_nextpkt = NULL; /* * Break the chain per mbuf type */ if (m->m_type == MT_CONTROL) { control = mbuf_detach_control_from_list(&m); } /* * 
Socket filter processing */ error = sflt_data_out(so, NULL, &m, &control, 0); if (error != 0 && error != EJUSTRETURN) { os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d", error); goto release; } #if CONTENT_FILTER if (error == 0) { /* * Content filter processing */ error = cfil_sock_data_out(so, NULL, m, control, 0, dgram_flow_entry); if (error != 0 && error != EJUSTRETURN) { os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d", error); goto release; } } #endif /* CONTENT_FILTER */ if (error == EJUSTRETURN) { /* * When swallowed by a filter, the packet is not * in the list anymore */ error = 0; } else { /* * Rebuild the mbuf chain of the packet */ if (control != NULL) { last_control->m_next = m; m = control; } /* * Reinsert the packet in the list of packets */ m->m_nextpkt = nextpkt; if (prevnextp != NULL) { *prevnextp = m; } else { top = m; } prevnextp = &m->m_nextpkt; } } } if (top != NULL) { if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) { error = (*so->so_proto->pr_usrreqs->pru_send_list) (so, top, pktcnt, flags); if (error != 0) { os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d", error); } top = NULL; } else { *pktcnt = 0; for (m = top; m != NULL; m = top) { struct mbuf *control = NULL; top = m->m_nextpkt; m->m_nextpkt = NULL; /* * Break the chain per mbuf type */ if (m->m_type == MT_CONTROL) { control = mbuf_detach_control_from_list(&m); } error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, control, current_proc()); if (error != 0) { os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d", error); goto release; } *pktcnt += 1; } } } release: if (dontroute) { so->so_options &= ~SO_DONTROUTE; } if (sblocked) { sbunlock(&so->so_snd, FALSE); /* will unlock socket */ } else { socket_unlock(so, 1); } out: if (top != NULL) { os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d", error); m_freem_list(top); } if (dgram_flow_entry != NULL) { soflow_free_flow(dgram_flow_entry); } KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid, so->so_snd.sb_cc, 0, error); return error; } /* * May return ERESTART when packet is dropped by MAC policy check */ static int soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa, struct mbuf **maddrp, int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait) { int error = 0; struct mbuf *m = *mp; struct mbuf *nextrecord = *nextrecordp; KASSERT(m->m_type == MT_SONAME, ("receive 1a")); #if CONFIG_MACF_SOCKET_SUBSET /* * Call the MAC framework for policy checking if we're in * the user process context and the socket isn't connected. */ if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) { struct mbuf *m0 = m; /* * Dequeue this record (temporarily) from the receive * list since we're about to drop the socket's lock * where a new record may arrive and be appended to * the list. Upon MAC policy failure, the record * will be freed. Otherwise, we'll add it back to * the head of the list. We cannot rely on SB_LOCK * because append operation uses the socket's lock. */ do { m->m_nextpkt = NULL; sbfree(&so->so_rcv, m); m = m->m_next; } while (m != NULL); m = m0; so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a"); socket_unlock(so, 0); error = mac_socket_check_received(kauth_cred_get(), so, mtod(m, struct sockaddr *)); if (error != 0) { /* * MAC policy failure; free this record and * process the next record (or block until * one is available). 
We have adjusted sb_cc * and sb_mbcnt above so there is no need to * call sbfree() again. */ m_freem(m); /* * Clear SB_LOCK but don't unlock the socket. * Process the next record or wait for one. */ socket_lock(so, 0); sbunlock(&so->so_rcv, TRUE); /* stay locked */ error = ERESTART; goto done; } socket_lock(so, 0); /* * If the socket has been defunct'd, drop it. */ if (so->so_flags & SOF_DEFUNCT) { m_freem(m); error = ENOTCONN; goto done; } /* * Re-adjust the socket receive list and re-enqueue * the record in front of any packets which may have * been appended while we dropped the lock. */ for (m = m0; m->m_next != NULL; m = m->m_next) { sballoc(&so->so_rcv, m); } sballoc(&so->so_rcv, m); if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_lastrecord = m0; so->so_rcv.sb_mbtail = m; } m = m0; nextrecord = m->m_nextpkt = so->so_rcv.sb_mb; so->so_rcv.sb_mb = m; SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b"); } #endif /* CONFIG_MACF_SOCKET_SUBSET */ if (psa != NULL) { *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait); if ((*psa == NULL) && (flags & MSG_NEEDSA)) { error = EWOULDBLOCK; goto done; } } else if (maddrp != NULL) { *maddrp = m; } if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); if (m->m_next == NULL && so->so_rcv.sb_cc != 0) { panic("%s: about to create invalid socketbuf", __func__); /* NOTREACHED */ } if (maddrp == NULL) { MFREE(m, so->so_rcv.sb_mb); } else { so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; } m = so->so_rcv.sb_mb; if (m != NULL) { m->m_nextpkt = nextrecord; } else { so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } } done: *mp = m; *nextrecordp = nextrecord; return error; } /* * When peeking SCM_RIGHTS, the actual file descriptors are not yet created * so clear the data portion in order not to leak the file pointers */ static void sopeek_scm_rights(struct mbuf *rights) { struct cmsghdr *cm = mtod(rights, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { VERIFY(cm->cmsg_len <= rights->m_len); memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm)); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization. */ static int soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags, struct mbuf **mp, struct mbuf **nextrecordp) { int error = 0; struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; struct sockbuf *sb_rcv = &so->so_rcv; struct mbuf **msgpcm = NULL; struct mbuf *m = *mp; struct mbuf *nextrecord = *nextrecordp; struct protosw *pr = so->so_proto; /* * Externalizing the control messages would require us to * drop the socket's lock below. Once we re-acquire the * lock, the mbuf chain might change. In order to preserve * consistency, we unlink all control messages from the * first mbuf chain in one shot and link them separately * onto a different chain. */ do { if (flags & MSG_PEEK) { if (controlp != NULL) { if (*controlp == NULL) { msgpcm = controlp; } *controlp = m_copy(m, 0, m->m_len); /* * If we failed to allocate an mbuf, * release any previously allocated * mbufs for control data. Return * an error. Keep the mbufs in the * socket as this is using * MSG_PEEK flag. 
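 *
 * When the copy does succeed, sopeek_scm_rights() (called just below)
 * scrubs any SCM_RIGHTS payload in the peeked copy, since the
 * descriptors have not been created yet.  From user space that looks
 * roughly like this (illustrative only; "unix_fd" is a hypothetical
 * AF_UNIX socket):
 *
 *	char data[64], cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = { 0 };
 *
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	// MSG_PEEK: cmsg_level/cmsg_type/cmsg_len are visible, but the
 *	// SCM_RIGHTS payload reads as zeros and no descriptors appear.
 *	(void) recvmsg(unix_fd, &msg, MSG_PEEK);
 *	// A later recvmsg() without MSG_PEEK externalizes the rights.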
*/ if (*controlp == NULL) { m_freem(*msgpcm); error = ENOBUFS; goto done; } if (pr->pr_domain->dom_externalize != NULL) { sopeek_scm_rights(*controlp); } controlp = &(*controlp)->m_next; } m = m->m_next; } else { m->m_nextpkt = NULL; sbfree(sb_rcv, m); sb_rcv->sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = sb_rcv->sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if (!(flags & MSG_PEEK)) { if (sb_rcv->sb_mb != NULL) { sb_rcv->sb_mb->m_nextpkt = nextrecord; } else { sb_rcv->sb_mb = nextrecord; SB_EMPTY_FIXUP(sb_rcv); } if (nextrecord == NULL) { sb_rcv->sb_lastrecord = m; } } SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl"); SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl"); while (cm != NULL) { int cmsg_level; int cmsg_type; cmn = cm->m_next; cm->m_next = NULL; cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level; cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type; /* * Call the protocol to externalize SCM_RIGHTS message * and return the modified message to the caller upon * success. Otherwise, all other control messages are * returned unmodified to the caller. Note that we * only get into this loop if MSG_PEEK is not set. */ if (pr->pr_domain->dom_externalize != NULL && cmsg_level == SOL_SOCKET && cmsg_type == SCM_RIGHTS) { /* * Release socket lock: see 3903171. This * would also allow more records to be appended * to the socket buffer. We still have SB_LOCK * set on it, so we can be sure that the head * of the mbuf chain won't change. */ socket_unlock(so, 0); error = (*pr->pr_domain->dom_externalize)(cm); socket_lock(so, 0); } else { error = 0; } if (controlp != NULL && error == 0) { *controlp = cm; controlp = &(*controlp)->m_next; } else { (void) m_free(cm); } cm = cmn; } /* * Update the value of nextrecord in case we received new * records when the socket was unlocked above for * externalizing SCM_RIGHTS. */ if (m != NULL) { nextrecord = sb_rcv->sb_mb->m_nextpkt; } else { nextrecord = sb_rcv->sb_mb; } done: *mp = m; *nextrecordp = nextrecord; return error; } /* * If we have less data than requested, block awaiting more * (subject to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_WAITALL is set, and it is possible to do the entire * receive operation at once if we block (resid <= hiwat). * 3. MSG_DONTWAIT is not set * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. */ static boolean_t so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags) { struct protosw *pr = so->so_proto; /* No mbufs in the receive-queue? Wait! */ if (m == NULL) { return true; } /* Not enough data in the receive socket-buffer - we may have to wait */ if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) { /* * Application did set the lowater-mark, so we should wait for * this data to be present. */ if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) { return true; } /* * Application wants all the data - so let's try to do the * receive-operation at once by waiting for everything to * be there. */ if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) { return true; } } return false; } /* * Implement receive operations on a socket. * We depend on the way that records are added to the sockbuf * by sbappend*. 
In particular, each record (mbufs linked through m_next) * must begin with an address if the protocol so specifies, * followed by an optional mbuf or mbufs containing ancillary data, * and then zero or more mbufs of data. * In order to avoid blocking network interrupts for the entire time here, * we splx() while doing the actual copy to user space. * Although the sockbuf is locked, new data may still be appended, * and thus we must maintain consistency of the sockbuf during that time. * * The caller may receive the data as a single mbuf chain by supplying * an mbuf **mp0 for use in returning the chain. The uio is then used * only for the count in uio_resid. * * Returns: 0 Success * ENOBUFS * ENOTCONN * EWOULDBLOCK * uiomove:EFAULT * sblock:EWOULDBLOCK * sblock:EINTR * sbwait:EBADF * sbwait:EINTR * sodelayed_copy:EFAULT * :EINVAL[TCP] * :EWOULDBLOCK[TCP] * :??? * dom_externalize>:EMSGSIZE[AF_UNIX] * dom_externalize>:ENOBUFS[AF_UNIX] * dom_externalize>:??? * * Notes: Additional return values from calls through and * dom_externalize> depend on protocols other than * TCP or AF_UNIX, which are documented above. */ int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp, *ml = NULL; struct mbuf *nextrecord, *free_list; int flags, error, offset; user_ssize_t len; struct protosw *pr = so->so_proto; int moff, type = 0; user_ssize_t orig_resid = uio_resid(uio); user_ssize_t delayed_copy_len; int can_delay; struct proc *p = current_proc(); boolean_t en_tracing = FALSE; /* * Sanity check on the length passed by caller as we are making 'int' * comparisons */ if (orig_resid < 0 || orig_resid > INT_MAX) { return EINVAL; } KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); socket_lock(so, 1); so_update_last_owner_locked(so, p); so_update_policy(so); #ifdef MORE_LOCKING_DEBUG if (so->so_usecount == 1) { panic("%s: so=%x no other reference on socket", __func__, so); /* NOTREACHED */ } #endif mp = mp0; if (psa != NULL) { *psa = NULL; } if (controlp != NULL) { *controlp = NULL; } if (flagsp != NULL) { flags = *flagsp & ~MSG_EOR; } else { flags = 0; } /* * If a recv attempt is made on a previously-accepted socket * that has been marked as inactive (disconnected), reject * the request. */ if (so->so_flags & SOF_DEFUNCT) { struct sockbuf *sb = &so->so_rcv; error = ENOTCONN; SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n", __func__, proc_pid(p), proc_best_name(p), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); /* * This socket should have been disconnected and flushed * prior to being returned from sodefunct(); there should * be no data on its receive list, so panic otherwise. */ if (so->so_state & SS_DEFUNCT) { sb_empty_assert(sb, __func__); } socket_unlock(so, 1); return error; } if ((so->so_flags1 & SOF1_PRECONNECT_DATA) && pr->pr_usrreqs->pru_preconnect) { /* * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not * calling write() right after this. *If* the app calls a read * we do not want to block this read indefinetely. Thus, * we trigger a connect so that the session gets initiated. */ error = (*pr->pr_usrreqs->pru_preconnect)(so); if (error) { socket_unlock(so, 1); return error; } } if (ENTR_SHOULDTRACE && (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) { /* * enable energy tracing for inet sockets that go over * non-loopback interfaces only. 
*/ struct inpcb *inp = sotoinpcb(so); if (inp->inp_last_outifp != NULL && !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { en_tracing = TRUE; KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START, VM_KERNEL_ADDRPERM(so), ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0), (int64_t)orig_resid); } } /* * When SO_WANTOOBFLAG is set we try to get out-of-band data * regardless of the flags argument. Here is the case were * out-of-band data is not inline. */ if ((flags & MSG_OOB) || ((so->so_options & SO_WANTOOBFLAG) != 0 && (so->so_options & SO_OOBINLINE) == 0 && (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) { m = m_get(M_WAIT, MT_DATA); if (m == NULL) { socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, ENOBUFS, 0, 0, 0, 0); return ENOBUFS; } error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) { goto bad; } socket_unlock(so, 0); do { error = uiomove(mtod(m, caddr_t), imin((int)uio_resid(uio), m->m_len), uio); m = m_free(m); } while (uio_resid(uio) && error == 0 && m != NULL); socket_lock(so, 0); bad: if (m != NULL) { m_freem(m); } if ((so->so_options & SO_WANTOOBFLAG) != 0) { if (error == EWOULDBLOCK || error == EINVAL) { /* * Let's try to get normal data: * EWOULDBLOCK: out-of-band data not * receive yet. EINVAL: out-of-band data * already read. */ error = 0; goto nooob; } else if (error == 0 && flagsp != NULL) { *flagsp |= MSG_OOB; } } socket_unlock(so, 1); if (en_tracing) { KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, VM_KERNEL_ADDRPERM(so), 0, (int64_t)(orig_resid - uio_resid(uio))); } KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, 0, 0, 0, 0); return error; } nooob: if (mp != NULL) { *mp = NULL; } if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) { (*pr->pr_usrreqs->pru_rcvd)(so, 0); } free_list = NULL; delayed_copy_len = 0; restart: #ifdef MORE_LOCKING_DEBUG if (so->so_usecount <= 1) { printf("soreceive: sblock so=0x%llx ref=%d on socket\n", (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount); } #endif /* * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE) * and if so just return to the caller. This could happen when * soreceive() is called by a socket upcall function during the * time the socket is freed. The socket buffer would have been * locked across the upcall, therefore we cannot put this thread * to sleep (else we will deadlock) or return EWOULDBLOCK (else * we may livelock), because the lock on the socket buffer will * only be released when the upcall routine returns to its caller. * Because the socket has been officially closed, there can be * no further read on it. * * A multipath subflow socket would have its SS_NOFDREF set by * default, so check for SOF_MP_SUBFLOW socket flag; when the * socket is closed for real, SOF_MP_SUBFLOW would be cleared. 
*/ if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) { socket_unlock(so, 1); return 0; } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) { socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, 0, 0, 0, 0); if (en_tracing) { KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, VM_KERNEL_ADDRPERM(so), 0, (int64_t)(orig_resid - uio_resid(uio))); } return error; } m = so->so_rcv.sb_mb; if (so_should_wait(so, uio, m, flags)) { /* * Panic if we notice inconsistencies in the socket's * receive list; both sb_mb and sb_cc should correctly * reflect the contents of the list, otherwise we may * end up with false positives during select() or poll() * which could put the application in a bad state. */ SB_MB_CHECK(&so->so_rcv); if (so->so_error) { if (m != NULL) { goto dontblock; } error = so->so_error; if ((flags & MSG_PEEK) == 0) { so->so_error = 0; } goto release; } if (so->so_state & SS_CANTRCVMORE) { #if CONTENT_FILTER /* * Deal with half closed connections */ if ((so->so_state & SS_ISDISCONNECTED) == 0 && cfil_sock_data_pending(&so->so_rcv) != 0) { CFIL_LOG(LOG_INFO, "so %llx ignore SS_CANTRCVMORE", (uint64_t)DEBUG_KERNEL_ADDRPERM(so)); } else #endif /* CONTENT_FILTER */ if (m != NULL) { goto dontblock; } else { goto release; } } for (; m != NULL; m = m->m_next) { if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } } if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; goto release; } if (uio_resid(uio) == 0) { goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT | MSG_NBIO))) { error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ #if EVEN_MORE_LOCKING_DEBUG if (socket_debug) { printf("Waiting for socket data\n"); } #endif /* * Depending on the protocol (e.g. TCP), the following * might cause the socket lock to be dropped and later * be reacquired, and more data could have arrived and * have been appended to the receive socket buffer by * the time it returns. Therefore, we only sleep in * sbwait() below if and only if the wait-condition is still * true. 
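 *
 * In the abstract, the pattern used here is (names below are
 * illustrative, not kernel interfaces):
 *
 *	while (need_to_wait(sb)) {
 *		tell_protocol_data_consumed(so);	// may drop the lock
 *		if (!need_to_wait(sb))			// re-check: data may
 *			break;				// have arrived meanwhile
 *		sleep_on(sb);				// sbwait()
 *	}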
*/ if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) { (*pr->pr_usrreqs->pru_rcvd)(so, flags); } error = 0; if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) { error = sbwait(&so->so_rcv); } #if EVEN_MORE_LOCKING_DEBUG if (socket_debug) { printf("SORECEIVE - sbwait returned %d\n", error); } #endif if (so->so_usecount < 1) { panic("%s: after 2nd sblock so=%p ref=%d on socket", __func__, so, so->so_usecount); /* NOTREACHED */ } if (error) { socket_unlock(so, 1); KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, 0, 0, 0, 0); if (en_tracing) { KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, VM_KERNEL_ADDRPERM(so), 0, (int64_t)(orig_resid - uio_resid(uio))); } return error; } goto restart; } dontblock: OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord, mp0 == NULL); if (error == ERESTART) { goto restart; } else if (error != 0) { goto release; } orig_resid = 0; } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization. */ if (m != NULL && m->m_type == MT_CONTROL) { error = soreceive_ctl(so, controlp, flags, &m, &nextrecord); if (error != 0) { goto release; } orig_resid = 0; } if (m != NULL) { if (!(flags & MSG_PEEK)) { /* * We get here because m points to an mbuf following * any MT_SONAME or MT_CONTROL mbufs which have been * processed above. In any case, m should be pointing * to the head of the mbuf chain, and the nextrecord * should be either NULL or equal to m->m_nextpkt. * See comments above about SB_LOCK. */ if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord) { panic("%s: post-control !sync so=%p m=%p " "nextrecord=%p\n", __func__, so, m, nextrecord); /* NOTREACHED */ } if (nextrecord == NULL) { so->so_rcv.sb_lastrecord = m; } } type = m->m_type; if (type == MT_OOBDATA) { flags |= MSG_OOB; } } else { if (!(flags & MSG_PEEK)) { SB_EMPTY_FIXUP(&so->so_rcv); } } SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); moff = 0; offset = 0; if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) { can_delay = 1; } else { can_delay = 0; } while (m != NULL && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) { if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) { break; } } else if (type == MT_OOBDATA) { break; } if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { break; } /* * Make sure to allways set MSG_OOB event when getting * out of band data inline. */ if ((so->so_options & SO_WANTOOBFLAG) != 0 && (so->so_options & SO_OOBINLINE) != 0 && (so->so_state & SS_RCVATMARK) != 0) { flags |= MSG_OOB; } so->so_state &= ~SS_RCVATMARK; len = uio_resid(uio) - delayed_copy_len; if (so->so_oobmark && len > so->so_oobmark - offset) { len = so->so_oobmark - offset; } if (len > m->m_len - moff) { len = m->m_len - moff; } /* * If mp is set, just pass back the mbufs. * Otherwise copy them out via the uio, then free. * Sockbuf must be consistent here (points to current mbuf, * it points to next record) when we drop priority; * we must note any additions to the sockbuf when we * block interrupts again. 
*/ if (mp == NULL) { SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); if (can_delay && len == m->m_len) { /* * only delay the copy if we're consuming the * mbuf and we're NOT in MSG_PEEK mode * and we have enough data to make it worthwile * to drop and retake the lock... can_delay * reflects the state of the 2 latter * constraints moff should always be zero * in these cases */ delayed_copy_len += len; } else { if (delayed_copy_len) { error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); if (error) { goto release; } /* * can only get here if MSG_PEEK is not * set therefore, m should point at the * head of the rcv queue; if it doesn't, * it means something drastically * changed while we were out from behind * the lock in sodelayed_copy. perhaps * a RST on the stream. in any event, * the stream has been interrupted. it's * probably best just to return whatever * data we've moved and let the caller * sort it out... */ if (m != so->so_rcv.sb_mb) { break; } } socket_unlock(so, 0); error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); socket_lock(so, 0); if (error) { goto release; } } } else { uio_setresid(uio, (uio_resid(uio) - len)); } if (len == m->m_len - moff) { if (m->m_flags & M_EOR) { flags |= MSG_EOR; } if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); m->m_nextpkt = NULL; if (mp != NULL) { *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { if (free_list == NULL) { free_list = m; } else { ml->m_next = m; } ml = m; so->so_rcv.sb_mb = m = m->m_next; ml->m_next = NULL; } if (m != NULL) { m->m_nextpkt = nextrecord; if (nextrecord == NULL) { so->so_rcv.sb_lastrecord = m; } } else { so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); } } else { if (flags & MSG_PEEK) { moff += len; } else { if (mp != NULL) { int copy_flag; if (flags & MSG_DONTWAIT) { copy_flag = M_DONTWAIT; } else { copy_flag = M_WAIT; } *mp = m_copym(m, 0, (int)len, copy_flag); /* * Failed to allocate an mbuf? * Adjust uio_resid back, it was * adjusted down by len bytes which * we didn't copy over. */ if (*mp == NULL) { uio_setresid(uio, (uio_resid(uio) + len)); break; } } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; } } if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_state |= SS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) { break; } } } if (flags & MSG_EOR) { break; } /* * If the MSG_WAITALL or MSG_WAITSTREAM flag is set * (for non-atomic socket), we must not quit until * "uio->uio_resid == 0" or an error termination. * If a signal/timeout occurs, return with a short * count but without error. Keep sockbuf locked * against other readers. */ while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL && (uio_resid(uio) - delayed_copy_len) > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || ((so->so_state & SS_CANTRCVMORE) #if CONTENT_FILTER && cfil_sock_data_pending(&so->so_rcv) == 0 #endif /* CONTENT_FILTER */ )) { goto release; } /* * Depending on the protocol (e.g. TCP), the following * might cause the socket lock to be dropped and later * be reacquired, and more data could have arrived and * have been appended to the receive socket buffer by * the time it returns. 
Therefore, we only sleep in * sbwait() below if and only if the socket buffer is * empty, in order to avoid a false sleep. */ if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) { (*pr->pr_usrreqs->pru_rcvd)(so, flags); } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) { error = 0; goto release; } /* * have to wait until after we get back from the sbwait * to do the copy because we will drop the lock if we * have enough data that has been delayed... by dropping * the lock we open up a window allowing the netisr * thread to process the incoming packets and to change * the state of this socket... we're issuing the sbwait * because the socket is empty and we're expecting the * netisr thread to wake us up when more packets arrive; * if we allow that processing to happen and then sbwait * we could stall forever with packets sitting in the * socket if no further packets arrive from the remote * side. * * we want to copy before we've collected all the data * to satisfy this request to allow the copy to overlap * the incoming packet processing on an MP system */ if (delayed_copy_len > sorecvmincopy && (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) { error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); if (error) { goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) { nextrecord = m->m_nextpkt; } SB_MB_CHECK(&so->so_rcv); } } #ifdef MORE_LOCKING_DEBUG if (so->so_usecount <= 1) { panic("%s: after big while so=%p ref=%d on socket", __func__, so, so->so_usecount); /* NOTREACHED */ } #endif if (m != NULL && pr->pr_flags & PR_ATOMIC) { if (so->so_options & SO_DONTTRUNC) { flags |= MSG_RCVMORE; } else { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) { (void) sbdroprecord(&so->so_rcv); } } } /* * pru_rcvd below (for TCP) may cause more data to be received * if the socket lock is dropped prior to sending the ACK; some * legacy OpenTransport applications don't handle this well * (if it receives less data than requested while MSG_HAVEMORE * is set), and so we set the flag now based on what we know * prior to calling pru_rcvd. */ if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) { flags |= MSG_HAVEMORE; } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { so->so_rcv.sb_mb = nextrecord; /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. 
*/ if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) { so->so_rcv.sb_lastrecord = nextrecord; } SB_MB_CHECK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) { (*pr->pr_usrreqs->pru_rcvd)(so, flags); } } if (delayed_copy_len) { error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); if (error) { goto release; } } if (free_list != NULL) { m_freem_list(free_list); free_list = NULL; } if (orig_resid == uio_resid(uio) && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ goto restart; } if (flagsp != NULL) { *flagsp |= flags; } release: #ifdef MORE_LOCKING_DEBUG if (so->so_usecount <= 1) { panic("%s: release so=%p ref=%d on socket", __func__, so, so->so_usecount); /* NOTREACHED */ } #endif if (delayed_copy_len) { error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); } if (free_list != NULL) { m_freem_list(free_list); } sbunlock(&so->so_rcv, FALSE); /* will unlock socket */ if (en_tracing) { KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, VM_KERNEL_ADDRPERM(so), ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0), (int64_t)(orig_resid - uio_resid(uio))); } KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio), so->so_rcv.sb_cc, 0, error); return error; } /* * Returns: 0 Success * uiomove:EFAULT */ static int sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, user_ssize_t *resid) { int error = 0; struct mbuf *m; m = *free_list; socket_unlock(so, 0); while (m != NULL && error == 0) { error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio); m = m->m_next; } m_freem_list(*free_list); *free_list = NULL; *resid = 0; socket_lock(so, 0); return error; } int soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; struct mbuf *nextrecord; int flags, error; struct protosw *pr = so->so_proto; struct proc *p = current_proc(); u_int npkts = 0; struct mbuf *free_list = NULL; int sblocked = 0; /* * Sanity check on the parameters passed by caller */ if (mp0 == NULL || pktcntp == NULL) { return EINVAL; } if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) { return EINVAL; } mp = mp0; *mp0 = NULL; if (controlp != NULL) { *controlp = NULL; } if (maddrp != NULL) { *maddrp = NULL; } if (flagsp != NULL) { flags = *flagsp; } else { flags = 0; } KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so, *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); socket_lock(so, 1); so_update_last_owner_locked(so, p); so_update_policy(so); #if NECP so_update_necp_policy(so, NULL, NULL); #endif /* NECP */ /* * If a recv attempt is made on a previously-accepted socket * that has been marked as inactive (disconnected), reject * the request. */ if (so->so_flags & SOF_DEFUNCT) { struct sockbuf *sb = &so->so_rcv; error = ENOTCONN; SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n", __func__, proc_pid(p), proc_best_name(p), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), error); /* * This socket should have been disconnected and flushed * prior to being returned from sodefunct(); there should * be no data on its receive list, so panic otherwise. 
*/ if (so->so_state & SS_DEFUNCT) { sb_empty_assert(sb, __func__); } goto release; } *mp = NULL; restart: /* * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE) * and if so just return to the caller. This could happen when * soreceive() is called by a socket upcall function during the * time the socket is freed. The socket buffer would have been * locked across the upcall, therefore we cannot put this thread * to sleep (else we will deadlock) or return EWOULDBLOCK (else * we may livelock), because the lock on the socket buffer will * only be released when the upcall routine returns to its caller. * Because the socket has been officially closed, there can be * no further read on it. */ if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == (SS_NOFDREF | SS_CANTRCVMORE)) { error = 0; goto out; } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) { goto out; } sblocked = 1; m = so->so_rcv.sb_mb; /* * Block awaiting more datagram if needed */ if (m == NULL || ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) { /* * Panic if we notice inconsistencies in the socket's * receive list; both sb_mb and sb_cc should correctly * reflect the contents of the list, otherwise we may * end up with false positives during select() or poll() * which could put the application in a bad state. */ SB_MB_CHECK(&so->so_rcv); if (so->so_error) { if (m != NULL) { goto dontblock; } error = so->so_error; if ((flags & MSG_PEEK) == 0) { so->so_error = 0; } goto release; } if (so->so_state & SS_CANTRCVMORE) { if (m != NULL) { goto dontblock; } else { goto release; } } for (; m != NULL; m = m->m_next) { if (m->m_flags & M_EOR) { m = so->so_rcv.sb_mb; goto dontblock; } } if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { error = ENOTCONN; goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT | MSG_NBIO))) { error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv, TRUE); /* keep socket locked */ sblocked = 0; error = sbwait(&so->so_rcv); if (error != 0) { goto release; } goto restart; } dontblock: m = so->so_rcv.sb_mb; if (m == NULL) { goto release; } OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { struct mbuf *maddr = NULL; error = soreceive_addr(p, so, NULL, &maddr, flags, &m, &nextrecord, 1); if (error == ERESTART) { goto restart; } else if (error != 0) { goto release; } if (maddr != NULL) { maddr->m_nextpkt = NULL; maddr->m_next = NULL; if (maddrp != NULL) { *maddrp = maddr; maddrp = &maddr->m_nextpkt; } else { maddr->m_next = free_list; free_list = maddr; } } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. * We call into the protocol to perform externalization. 
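 *
 * From user space, one such record comes back through a single
 * recvmsg(2) as address + control + data (illustrative only; "udp_fd"
 * and the buffer sizes are hypothetical):
 *
 *	struct sockaddr_storage from;
 *	char payload[2048], cbuf[256];
 *	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
 *	struct msghdr msg = {
 *		.msg_name = &from, .msg_namelen = sizeof(from),
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	ssize_t n = recvmsg(udp_fd, &msg, 0);
 *
 *	for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm != NULL;
 *	    cm = CMSG_NXTHDR(&msg, cm)) {
 *		// one cmsghdr per externalized MT_CONTROL message
 *	}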
*/ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *control = NULL; error = soreceive_ctl(so, &control, flags, &m, &nextrecord); if (error != 0) { goto release; } if (control != NULL) { control->m_nextpkt = NULL; control->m_next = NULL; if (controlp != NULL) { *controlp = control; controlp = &control->m_nextpkt; } else { control->m_next = free_list; free_list = control; } } } /* * Link the packet to the list */ if (m != NULL) { if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type); } m->m_nextpkt = NULL; *mp = m; mp = &m->m_nextpkt; } while (m != NULL) { sbfree(&so->so_rcv, m); m = m->m_next; } so->so_rcv.sb_mb = nextrecord; /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) { so->so_rcv.sb_lastrecord = nextrecord; } SB_MB_CHECK(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); npkts += 1; /* * We continue as long as all those conditions as we have less packets * than requested and the socket buffer is not empty */ if (npkts < *pktcntp) { if (so->so_rcv.sb_mb != NULL) { goto dontblock; } if ((flags & MSG_WAITALL) != 0) { goto restart; } } if (flagsp != NULL) { *flagsp |= flags; } release: /* * pru_rcvd may cause more data to be received if the socket lock * is dropped so we set MSG_HAVEMORE now based on what we know. * That way the caller won't be surprised if it receives less data * than requested. */ if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) { flags |= MSG_HAVEMORE; } if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { (*pr->pr_usrreqs->pru_rcvd)(so, flags); } if (sblocked) { sbunlock(&so->so_rcv, FALSE); /* will unlock socket */ } else { socket_unlock(so, 1); } out: *pktcntp = npkts; /* * Amortize the cost of freeing the mbufs */ if (free_list != NULL) { m_freem_list(free_list); } KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error, 0, 0, 0, 0); return error; } static int so_statistics_event_to_nstat_event(int64_t *input_options, uint64_t *nstat_event) { int error = 0; switch (*input_options) { case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK: *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK; break; case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK: *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK; break; case SO_STATISTICS_EVENT_ATTRIBUTION_CHANGE: *nstat_event = NSTAT_EVENT_SRC_ATTRIBUTION_CHANGE; break; #if (DEBUG || DEVELOPMENT) case SO_STATISTICS_EVENT_RESERVED_2: *nstat_event = NSTAT_EVENT_SRC_RESERVED_2; break; #endif /* (DEBUG || DEVELOPMENT) */ default: error = EINVAL; break; } return error; } /* * Returns: 0 Success * EINVAL * ENOTCONN * :EINVAL * :EADDRNOTAVAIL[TCP] * :ENOBUFS[TCP] * :EMSGSIZE[TCP] * :EHOSTUNREACH[TCP] * :ENETUNREACH[TCP] * :ENETDOWN[TCP] * :ENOMEM[TCP] * :EACCES[TCP] * :EMSGSIZE[TCP] * :ENOBUFS[TCP] * :???[TCP] [ignorable: mostly IPSEC/firewall/DLIL] * :??? 
[other protocol families] */ int soshutdown(struct socket *so, int how) { int error; KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0); switch (how) { case SHUT_RD: case SHUT_WR: case SHUT_RDWR: socket_lock(so, 1); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { error = ENOTCONN; } else { error = soshutdownlock(so, how); } socket_unlock(so, 1); break; default: error = EINVAL; break; } KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0); return error; } int soshutdownlock_final(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error = 0; sflt_notify(so, sock_evt_shutdown, &how); if (how != SHUT_WR) { if ((so->so_state & SS_CANTRCVMORE) != 0) { /* read already shut down */ error = ENOTCONN; goto done; } sorflush(so); } if (how != SHUT_RD) { if ((so->so_state & SS_CANTSENDMORE) != 0) { /* write already shut down */ error = ENOTCONN; goto done; } error = (*pr->pr_usrreqs->pru_shutdown)(so); } done: KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0); return error; } int soshutdownlock(struct socket *so, int how) { int error = 0; #if CONTENT_FILTER /* * A content filter may delay the actual shutdown until it * has processed the pending data */ if (so->so_flags & SOF_CONTENT_FILTER) { error = cfil_sock_shutdown(so, &how); if (error == EJUSTRETURN) { error = 0; goto done; } else if (error != 0) { goto done; } } #endif /* CONTENT_FILTER */ error = soshutdownlock_final(so, how); done: return error; } void sowflush(struct socket *so) { struct sockbuf *sb = &so->so_snd; /* * Obtain lock on the socket buffer (SB_LOCK). This is required * to prevent the socket buffer from being unexpectedly altered * while it is used by another thread in socket send/receive. * * sblock() must not fail here, hence the assertion. */ (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT); VERIFY(sb->sb_flags & SB_LOCK); sb->sb_flags &= ~(SB_SEL | SB_UPCALL); sb->sb_flags |= SB_DROP; sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sbunlock(sb, TRUE); /* keep socket locked */ selthreadclear(&sb->sb_sel); sbrelease(sb); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct sockbuf asb; #ifdef notyet lck_mtx_t *mutex_held; /* * XXX: This code is currently commented out, because we may get here * as part of sofreelastref(), and at that time, pr_getlock() may no * longer be able to return us the lock; this will be fixed in future. */ if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, 0); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); #endif /* notyet */ sflt_notify(so, sock_evt_flush_read, NULL); socantrcvmore(so); /* * Obtain lock on the socket buffer (SB_LOCK). This is required * to prevent the socket buffer from being unexpectedly altered * while it is used by another thread in socket send/receive. * * sblock() must not fail here, hence the assertion. */ (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT); VERIFY(sb->sb_flags & SB_LOCK); /* * Copy only the relevant fields from "sb" to "asb" which we * need for sbrelease() to function. In particular, skip * sb_sel as it contains the wait queue linkage, which would * wreak havoc if we were to issue selthreadclear() on "asb". * Make sure to not carry over SB_LOCK in "asb", as we need * to acquire it later as part of sbrelease(). 
*/ bzero(&asb, sizeof(asb)); asb.sb_cc = sb->sb_cc; asb.sb_hiwat = sb->sb_hiwat; asb.sb_mbcnt = sb->sb_mbcnt; asb.sb_mbmax = sb->sb_mbmax; asb.sb_ctl = sb->sb_ctl; asb.sb_lowat = sb->sb_lowat; asb.sb_mb = sb->sb_mb; asb.sb_mbtail = sb->sb_mbtail; asb.sb_lastrecord = sb->sb_lastrecord; asb.sb_so = sb->sb_so; asb.sb_flags = sb->sb_flags; asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL); asb.sb_flags |= SB_DROP; /* * Ideally we'd bzero() these and preserve the ones we need; * but to do that we'd need to shuffle things around in the * sockbuf, and we can't do it now because there are KEXTS * that are directly referring to the socket structure. * * Setting SB_DROP acts as a barrier to prevent further appends. * Clearing SB_SEL is done for selthreadclear() below. */ sb->sb_cc = 0; sb->sb_hiwat = 0; sb->sb_mbcnt = 0; sb->sb_mbmax = 0; sb->sb_ctl = 0; sb->sb_lowat = 0; sb->sb_mb = NULL; sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; sb->sb_timeo.tv_sec = 0; sb->sb_timeo.tv_usec = 0; sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~(SB_SEL | SB_UPCALL); sb->sb_flags |= SB_DROP; sbunlock(sb, TRUE); /* keep socket locked */ /* * Note that selthreadclear() is called on the original "sb" and * not the local "asb" because of the way wait queue linkage is * implemented. Given that selwakeup() may be triggered, SB_SEL * should no longer be set (cleared above.) */ selthreadclear(&sb->sb_sel); if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) { (*pr->pr_domain->dom_dispose)(asb.sb_mb); } sbrelease(&asb); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in * an additional variant to handle the case where the option value needs * to be some kind of integer, but not a specific size. * In addition to their use here, these functions are also called by the * protocol-level pr_ctloutput() routines. * * Returns: 0 Success * EINVAL * copyin:EFAULT */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, * but if we don't get the minimum length the caller * wants, we return EINVAL. On success, sopt->sopt_valsize * is set to however much we actually retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) { return EINVAL; } if (valsize > len) { sopt->sopt_valsize = valsize = len; } if (sopt->sopt_p != kernproc) { return copyin(sopt->sopt_val, buf, valsize); } bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize); return 0; } /* * sooptcopyin_timeval * Copy in a timeval value into tv_p, and take into account whether the * the calling process is 64-bit or 32-bit. Moved the sanity checking * code here so that we can verify the 64-bit tv_sec value before we lose * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec. 
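 *
 * Caller-visible contract (user-space sketch; "fd" is a hypothetical
 * socket): the option value must be a full struct timeval for the
 * caller's ABI, tv_sec must be non-negative and fit in a long, and
 * tv_usec must lie in [0, 1000000); otherwise setsockopt(2) fails with
 * EINVAL or EDOM respectively:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	(void) setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 *	struct timeval bad = { .tv_sec = -1, .tv_usec = 1000000 };
 *	// rejected with EDOM by the range checks below
 *	(void) setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &bad, sizeof(bad));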
*/ static int sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p) { int error; if (proc_is64bit(sopt->sopt_p)) { struct user64_timeval tv64; if (sopt->sopt_valsize < sizeof(tv64)) { return EINVAL; } sopt->sopt_valsize = sizeof(tv64); if (sopt->sopt_p != kernproc) { error = copyin(sopt->sopt_val, &tv64, sizeof(tv64)); if (error != 0) { return error; } } else { bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64, sizeof(tv64)); } if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) { return EDOM; } tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec; tv_p->tv_usec = tv64.tv_usec; } else { struct user32_timeval tv32; if (sopt->sopt_valsize < sizeof(tv32)) { return EINVAL; } sopt->sopt_valsize = sizeof(tv32); if (sopt->sopt_p != kernproc) { error = copyin(sopt->sopt_val, &tv32, sizeof(tv32)); if (error != 0) { return error; } } else { bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32, sizeof(tv32)); } #ifndef __LP64__ /* * K64todo "comparison is always false due to * limited range of data type" */ if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) { return EDOM; } #endif tv_p->tv_sec = tv32.tv_sec; tv_p->tv_usec = tv32.tv_usec; } return 0; } int soopt_cred_check(struct socket *so, int priv, boolean_t allow_root, boolean_t ignore_delegate) { kauth_cred_t cred = NULL; proc_t ep = PROC_NULL; uid_t uid; int error = 0; if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) { ep = proc_find(so->e_pid); if (ep) { cred = kauth_cred_proc_ref(ep); } } uid = kauth_cred_getuid(cred ? cred : so->so_cred); /* uid is 0 for root */ if (uid != 0 || !allow_root) { error = priv_check_cred(cred ? cred : so->so_cred, priv, 0); } if (cred) { kauth_cred_unref(&cred); } if (ep != PROC_NULL) { proc_rele(ep); } return error; } /* * Returns: 0 Success * EINVAL * ENOPROTOOPT * ENOBUFS * EDOM * sooptcopyin:EINVAL * sooptcopyin:EFAULT * sooptcopyin_timeval:EINVAL * sooptcopyin_timeval:EFAULT * sooptcopyin_timeval:EDOM * :EOPNOTSUPP[AF_UNIX] * :???w * sflt_attach_private:??? [whatever a filter author chooses] * :??? [whatever a filter author chooses] * * Notes: Other returns depend on the protocol family; all * returns depend on what the filter author causes * their filter to return. */ int sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) { int error, optval; int64_t long_optval; struct linger l; struct timeval tv; if (sopt->sopt_dir != SOPT_SET) { sopt->sopt_dir = SOPT_SET; } if (dolock) { socket_lock(so, 1); } if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == (SS_CANTRCVMORE | SS_CANTSENDMORE) && (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) { /* the socket has been shutdown, no more sockopt's */ error = EINVAL; goto out; } error = sflt_setsockopt(so, sopt); if (error != 0) { if (error == EJUSTRETURN) { error = 0; } goto out; } if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { error = (*so->so_proto->pr_ctloutput)(so, sopt); goto out; } error = ENOPROTOOPT; } else { /* * Allow socket-level (SOL_SOCKET) options to be filtered by * the protocol layer, if needed. A zero value returned from * the handler means use default socket-level processing as * done by the rest of this routine. Otherwise, any other * return value indicates that the option is unsupported. 
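 *
 * When the handler returns zero, the option is handled below.  For
 * example (user-space sketch; "fd" is hypothetical), the Darwin-specific
 * SO_LINGER_SEC takes l_linger in seconds and is converted to ticks
 * (l_linger * hz) before being stored in so_linger:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 2 };	// 2 seconds
 *	if (setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l)) == -1)
 *		perror("SO_LINGER_SEC");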
*/ if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs-> pru_socheckopt(so, sopt)) != 0) { goto out; } error = 0; switch (sopt->sopt_name) { case SO_LINGER: case SO_LINGER_SEC: { error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l)); if (error != 0) { goto out; } /* Make sure to use sane values */ if (sopt->sopt_name == SO_LINGER) { so->so_linger = (short)l.l_linger; } else { so->so_linger = (short)((long)l.l_linger * hz); } if (l.l_onoff != 0) { so->so_options |= SO_LINGER; } else { so->so_options &= ~SO_LINGER; } break; } case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_TIMESTAMP_MONOTONIC: case SO_TIMESTAMP_CONTINUOUS: case SO_DONTTRUNC: case SO_WANTMORE: case SO_WANTOOBFLAG: case SO_NOWAKEFROMSLEEP: case SO_NOAPNFALLBK: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval) { so->so_options |= sopt->sopt_name; } else { so->so_options &= ~sopt->sopt_name; } #if SKYWALK inp_update_netns_flags(so); #endif /* SKYWALK */ break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } /* * Values < 1 make no sense for any of these * options, so disallow them. */ if (optval < 1) { error = EINVAL; goto out; } switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: { struct sockbuf *sb = (sopt->sopt_name == SO_SNDBUF) ? &so->so_snd : &so->so_rcv; if (sbreserve(sb, (u_int32_t)optval) == 0) { error = ENOBUFS; goto out; } sb->sb_flags |= SB_USRSIZE; sb->sb_flags &= ~SB_AUTOSIZE; sb->sb_idealsize = (u_int32_t)optval; break; } /* * Make sure the low-water is never greater than * the high-water. */ case SO_SNDLOWAT: { int space = sbspace(&so->so_snd); uint32_t hiwat = so->so_snd.sb_hiwat; if (so->so_snd.sb_flags & SB_UNIX) { struct unpcb *unp = (struct unpcb *)(so->so_pcb); if (unp != NULL && unp->unp_conn != NULL) { struct socket *so2 = unp->unp_conn->unp_socket; hiwat += unp->unp_conn->unp_cc; space = sbspace(&so2->so_rcv); } } so->so_snd.sb_lowat = (optval > hiwat) ? hiwat : optval; if (space >= so->so_snd.sb_lowat) { sowwakeup(so); } break; } case SO_RCVLOWAT: { int64_t data_len; so->so_rcv.sb_lowat = (optval > so->so_rcv.sb_hiwat) ? 
so->so_rcv.sb_hiwat : optval; if (so->so_rcv.sb_flags & SB_UNIX) { struct unpcb *unp = (struct unpcb *)(so->so_pcb); if (unp != NULL && unp->unp_conn != NULL) { struct socket *so2 = unp->unp_conn->unp_socket; data_len = so2->so_snd.sb_cc - so2->so_snd.sb_ctl; } else { data_len = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; } } else { data_len = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; } if (data_len >= so->so_rcv.sb_lowat) { sorwakeup(so); } break; } } break; case SO_SNDTIMEO: case SO_RCVTIMEO: error = sooptcopyin_timeval(sopt, &tv); if (error != 0) { goto out; } switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = tv; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = tv; break; } break; case SO_NKE: { struct so_nke nke; error = sooptcopyin(sopt, &nke, sizeof(nke), sizeof(nke)); if (error != 0) { goto out; } error = sflt_attach_internal(so, nke.nke_handle); break; } case SO_NOSIGPIPE: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { so->so_flags |= SOF_NOSIGPIPE; } else { so->so_flags &= ~SOF_NOSIGPIPE; } break; case SO_NOADDRERR: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { so->so_flags |= SOF_NOADDRAVAIL; } else { so->so_flags &= ~SOF_NOADDRAVAIL; } break; case SO_REUSESHAREUID: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { so->so_flags |= SOF_REUSESHAREUID; } else { so->so_flags &= ~SOF_REUSESHAREUID; } break; case SO_NOTIFYCONFLICT: if (kauth_cred_issuser(kauth_cred_get()) == 0) { error = EPERM; goto out; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { so->so_flags |= SOF_NOTIFYCONFLICT; } else { so->so_flags &= ~SOF_NOTIFYCONFLICT; } break; case SO_RESTRICTIONS: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } error = so_set_restrictions(so, optval); break; case SO_AWDL_UNRESTRICTED: if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EOPNOTSUPP; goto out; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL, false, false); if (error == 0) { inp_set_awdl_unrestricted( sotoinpcb(so)); } } else { inp_clear_awdl_unrestricted(sotoinpcb(so)); } break; case SO_INTCOPROC_ALLOW: if (SOCK_DOM(so) != PF_INET6) { error = EOPNOTSUPP; goto out; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0 && inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) { error = soopt_cred_check(so, PRIV_NET_RESTRICTED_INTCOPROC, false, false); if (error == 0) { inp_set_intcoproc_allowed( sotoinpcb(so)); } } else if (optval == 0) { inp_clear_intcoproc_allowed(sotoinpcb(so)); } break; case SO_LABEL: error = EOPNOTSUPP; break; case SO_UPCALLCLOSEWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { so->so_flags |= SOF_UPCALLCLOSEWAIT; } else { so->so_flags &= ~SOF_UPCALLCLOSEWAIT; } break; case SO_RANDOMPORT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval != 0) { so->so_flags |= SOF_BINDRANDOMPORT; } else { so->so_flags &= ~SOF_BINDRANDOMPORT; } break; case SO_NP_EXTENSIONS: { struct so_np_extensions sonpx; error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx)); 
if (error != 0) { goto out; } if (sonpx.npx_mask & ~SONPX_MASK_VALID) { error = EINVAL; goto out; } /* * Only one bit defined for now */ if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) { if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) { so->so_flags |= SOF_NPX_SETOPTSHUT; } else { so->so_flags &= ~SOF_NPX_SETOPTSHUT; } } break; } case SO_TRAFFIC_CLASS: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval >= SO_TC_NET_SERVICE_OFFSET) { int netsvc = optval - SO_TC_NET_SERVICE_OFFSET; error = so_set_net_service_type(so, netsvc); goto out; } error = so_set_traffic_class(so, optval); if (error != 0) { goto out; } so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE; so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC; break; } case SO_RECV_TRAFFIC_CLASS: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval == 0) { so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS; } else { so->so_flags |= SOF_RECV_TRAFFIC_CLASS; } break; } #if (DEVELOPMENT || DEBUG) case SO_TRAFFIC_CLASS_DBG: { struct so_tcdbg so_tcdbg; error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg), sizeof(struct so_tcdbg)); if (error != 0) { goto out; } error = so_set_tcdbg(so, &so_tcdbg); if (error != 0) { goto out; } break; } #endif /* (DEVELOPMENT || DEBUG) */ case SO_PRIVILEGED_TRAFFIC_CLASS: error = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0); if (error != 0) { goto out; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval == 0) { so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS; } else { so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS; } break; #if (DEVELOPMENT || DEBUG) case SO_DEFUNCTIT: error = sosetdefunct(current_proc(), so, 0, FALSE); if (error == 0) { error = sodefunct(current_proc(), so, 0); } break; #endif /* (DEVELOPMENT || DEBUG) */ case SO_DEFUNCTOK: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0 || (so->so_flags & SOF_DEFUNCT)) { if (error == 0) { error = EBADF; } goto out; } /* * Any process can set SO_DEFUNCTOK (clear * SOF_NODEFUNCT), but only root can clear * SO_DEFUNCTOK (set SOF_NODEFUNCT). */ if (optval == 0 && kauth_cred_issuser(kauth_cred_get()) == 0) { error = EPERM; goto out; } if (optval) { so->so_flags &= ~SOF_NODEFUNCT; } else { so->so_flags |= SOF_NODEFUNCT; } if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { char s[MAX_IPv6_STR_LEN]; char d[MAX_IPv6_STR_LEN]; struct inpcb *inp = sotoinpcb(so); SODEFUNCTLOG("%s[%d, %s]: so 0x%llu " "[%s %s:%d -> %s:%d] is now marked " "as %seligible for " "defunct\n", __func__, proc_selfpid(), proc_best_name(current_proc()), so->so_gencnt, (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr), s, sizeof(s)), ntohs(inp->in6p_lport), inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ? (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr, d, sizeof(d)), ntohs(inp->in6p_fport), (so->so_flags & SOF_NODEFUNCT) ? "not " : ""); } else { SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] " "is now marked as %seligible for " "defunct\n", __func__, proc_selfpid(), proc_best_name(current_proc()), so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), (so->so_flags & SOF_NODEFUNCT) ? 
"not " : ""); } break; case SO_ISDEFUNCT: /* This option is not settable */ error = EINVAL; break; case SO_OPPORTUNISTIC: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error == 0) { error = so_set_opportunistic(so, optval); } break; case SO_FLUSH: /* This option is handled by lower layer(s) */ error = 0; break; case SO_RECV_ANYIF: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error == 0) { error = so_set_recv_anyif(so, optval); } break; case SO_TRAFFIC_MGT_BACKGROUND: { /* This option is handled by lower layer(s) */ error = 0; break; } #if FLOW_DIVERT case SO_FLOW_DIVERT_TOKEN: error = flow_divert_token_set(so, sopt); break; #endif /* FLOW_DIVERT */ case SO_DELEGATED: if ((error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval))) != 0) { break; } error = so_set_effective_pid(so, optval, sopt->sopt_p, true); break; case SO_DELEGATED_UUID: { uuid_t euuid; if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid), sizeof(euuid))) != 0) { break; } error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true); break; } #if NECP case SO_NECP_ATTRIBUTES: if (SOCK_DOM(so) == PF_MULTIPATH) { /* Handled by MPTCP itself */ break; } if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } error = necp_set_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt); break; case SO_NECP_CLIENTUUID: { if (SOCK_DOM(so) == PF_MULTIPATH) { /* Handled by MPTCP itself */ break; } if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } struct inpcb *inp = sotoinpcb(so); if (!uuid_is_null(inp->necp_client_uuid)) { // Clear out the old client UUID if present necp_inpcb_remove_cb(inp); } error = sooptcopyin(sopt, &inp->necp_client_uuid, sizeof(uuid_t), sizeof(uuid_t)); if (error != 0) { goto out; } if (uuid_is_null(inp->necp_client_uuid)) { error = EINVAL; goto out; } pid_t current_pid = proc_pid(current_proc()); error = necp_client_register_socket_flow(current_pid, inp->necp_client_uuid, inp); if (error != 0) { uuid_clear(inp->necp_client_uuid); goto out; } if (inp->inp_lport != 0) { // There is a bound local port, so this is not // a fresh socket. Assign to the client. 
necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp); } break; } case SO_NECP_LISTENUUID: { if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } struct inpcb *inp = sotoinpcb(so); if (!uuid_is_null(inp->necp_client_uuid)) { error = EINVAL; goto out; } error = sooptcopyin(sopt, &inp->necp_client_uuid, sizeof(uuid_t), sizeof(uuid_t)); if (error != 0) { goto out; } if (uuid_is_null(inp->necp_client_uuid)) { error = EINVAL; goto out; } error = necp_client_register_socket_listener(proc_pid(current_proc()), inp->necp_client_uuid, inp); if (error != 0) { uuid_clear(inp->necp_client_uuid); goto out; } // Mark that the port registration is held by NECP inp->inp_flags2 |= INP2_EXTERNAL_PORT; break; } case SO_RESOLVER_SIGNATURE: { if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt); break; } #endif /* NECP */ case SO_EXTENDED_BK_IDLE: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error == 0) { error = so_set_extended_bk_idle(so, optval); } break; case SO_MARK_CELLFALLBACK: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval < 0) { error = EINVAL; goto out; } if (optval == 0) { so->so_flags1 &= ~SOF1_CELLFALLBACK; } else { so->so_flags1 |= SOF1_CELLFALLBACK; } break; case SO_MARK_CELLFALLBACK_UUID: { struct so_mark_cellfallback_uuid_args args; error = sooptcopyin(sopt, &args, sizeof(args), sizeof(args)); if (error != 0) { goto out; } error = nstat_userland_mark_rnf_override(args.flow_uuid, args.flow_cellfallback); break; } case SO_FALLBACK_MODE: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval < SO_FALLBACK_MODE_NONE || optval > SO_FALLBACK_MODE_PREFER) { error = EINVAL; goto out; } so->so_fallback_mode = (u_int8_t)optval; break; case SO_MARK_KNOWN_TRACKER: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval < 0) { error = EINVAL; goto out; } if (optval == 0) { so->so_flags1 &= ~SOF1_KNOWN_TRACKER; } else { so->so_flags1 |= SOF1_KNOWN_TRACKER; } break; } case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval < 0) { error = EINVAL; goto out; } if (optval == 0) { so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED; } else { so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED; } break; } case SO_MARK_APPROVED_APP_DOMAIN: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval < 0) { error = EINVAL; goto out; } if (optval == 0) { so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN; } else { so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; } break; } case SO_STATISTICS_EVENT: error = sooptcopyin(sopt, &long_optval, sizeof(long_optval), sizeof(long_optval)); if (error != 0) { goto out; } u_int64_t nstat_event = 0; error = so_statistics_event_to_nstat_event( &long_optval, &nstat_event); if (error != 0) { goto out; } nstat_pcb_event(sotoinpcb(so), nstat_event); break; case SO_NET_SERVICE_TYPE: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } error = so_set_net_service_type(so, optval); break; } case SO_QOSMARKING_POLICY_OVERRIDE: error = priv_check_cred(kauth_cred_get(), PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0); if (error != 0) { goto out; } error = sooptcopyin(sopt, &optval, 
sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval == 0) { so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE; } else { so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE; } break; case SO_MPKL_SEND_INFO: { struct so_mpkl_send_info so_mpkl_send_info; error = sooptcopyin(sopt, &so_mpkl_send_info, sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info)); if (error != 0) { goto out; } uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid); so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto; if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) { so->so_flags1 &= ~SOF1_MPKL_SEND_INFO; } else { so->so_flags1 |= SOF1_MPKL_SEND_INFO; } break; } case SO_WANT_KEV_SOCKET_CLOSED: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval == 0) { so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED; } else { so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED; } break; } case SO_MARK_WAKE_PKT: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval == 0) { so->so_flags &= ~SOF_MARK_WAKE_PKT; } else { so->so_flags |= SOF_MARK_WAKE_PKT; } break; } case SO_RECV_WAKE_PKT: { error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) { goto out; } if (optval == 0) { so->so_flags &= ~SOF_RECV_WAKE_PKT; } else { so->so_flags |= SOF_RECV_WAKE_PKT; } break; } case SO_APPLICATION_ID: { so_application_id_t application_id = { 0 }; if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } error = sooptcopyin(sopt, &application_id, sizeof(application_id), sizeof(application_id)); if (error != 0) { goto out; } // The user needs to match if (kauth_cred_getuid(so->so_cred) != application_id.uid) { error = EINVAL; printf("setsockopt: SO_APPLICATION_ID - wrong uid"); goto out; } error = so_set_effective_uuid(so, application_id.effective_uuid, sopt->sopt_p, true); if (error != 0) { printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid"); goto out; } if (application_id.persona_id != PERSONA_ID_NONE) { so->so_persona_id = application_id.persona_id; } break; } default: error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { (void) so->so_proto->pr_ctloutput(so, sopt); } } out: if (dolock) { socket_unlock(so, 1); } return error; } /* Helper routines for getsockopt */ int sooptcopyout(struct sockopt *sopt, void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, * possibly truncated to fit in the user's buffer. * Traditional behavior is that we always tell the user * precisely how much we copied, rather than something useful * like the total amount we had available for her. * Note that this interface is not idempotent; the entire answer must be generated ahead of time.
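 *
 * For example (numbers illustrative): if a getsockopt(2) caller passes
 * a 2-byte buffer for an option whose value is a 4-byte int, valsize is
 * clamped to 2, only the first 2 bytes of the value are copied out, and
 * sopt_valsize reports 2 back to the caller rather than the 4 bytes
 * that were available.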
*/ valsize = MIN(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != USER_ADDR_NULL) { if (sopt->sopt_p != kernproc) { error = copyout(buf, sopt->sopt_val, valsize); } else { bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize); } } return error; } static int sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p) { int error; size_t len; struct user64_timeval tv64 = {}; struct user32_timeval tv32 = {}; const void * val; size_t valsize; error = 0; if (proc_is64bit(sopt->sopt_p)) { len = sizeof(tv64); tv64.tv_sec = tv_p->tv_sec; tv64.tv_usec = tv_p->tv_usec; val = &tv64; } else { len = sizeof(tv32); tv32.tv_sec = (user32_time_t)tv_p->tv_sec; tv32.tv_usec = tv_p->tv_usec; val = &tv32; } valsize = MIN(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != USER_ADDR_NULL) { if (sopt->sopt_p != kernproc) { error = copyout(val, sopt->sopt_val, valsize); } else { bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize); } } return error; } /* * Return: 0 Success * ENOPROTOOPT * :EOPNOTSUPP[AF_UNIX] * :??? * :??? */ int sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) { int error, optval; struct linger l; struct timeval tv; if (sopt->sopt_dir != SOPT_GET) { sopt->sopt_dir = SOPT_GET; } if (dolock) { socket_lock(so, 1); } error = sflt_getsockopt(so, sopt); if (error != 0) { if (error == EJUSTRETURN) { error = 0; } goto out; } if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { error = (*so->so_proto->pr_ctloutput)(so, sopt); goto out; } error = ENOPROTOOPT; } else { /* * Allow socket-level (SOL_SOCKET) options to be filtered by * the protocol layer, if needed. A zero value returned from * the handler means use default socket-level processing as * done by the rest of this routine. Otherwise, any other * return value indicates that the option is unsupported. */ if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs-> pru_socheckopt(so, sopt)) != 0) { goto out; } error = 0; switch (sopt->sopt_name) { case SO_LINGER: case SO_LINGER_SEC: l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0); l.l_linger = (sopt->sopt_name == SO_LINGER) ? 
so->so_linger : so->so_linger / hz; error = sooptcopyout(sopt, &l, sizeof(l)); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_TIMESTAMP_MONOTONIC: case SO_TIMESTAMP_CONTINUOUS: case SO_DONTTRUNC: case SO_WANTMORE: case SO_WANTOOBFLAG: case SO_NOWAKEFROMSLEEP: case SO_NOAPNFALLBK: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case SO_TYPE: optval = so->so_type; goto integer; case SO_NREAD: if (so->so_proto->pr_flags & PR_ATOMIC) { int pkt_total; struct mbuf *m1; pkt_total = 0; m1 = so->so_rcv.sb_mb; while (m1 != NULL) { if (m_has_mtype(m1, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { pkt_total += m1->m_len; } m1 = m1->m_next; } optval = pkt_total; } else { optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; } goto integer; case SO_NUMRCVPKT: if (so->so_proto->pr_flags & PR_ATOMIC) { int cnt = 0; struct mbuf *m1; m1 = so->so_rcv.sb_mb; while (m1 != NULL) { cnt += 1; m1 = m1->m_nextpkt; } optval = cnt; goto integer; } else { error = ENOPROTOOPT; break; } case SO_NWRITE: optval = so->so_snd.sb_cc; goto integer; case SO_ERROR: optval = so->so_error; so->so_error = 0; goto integer; case SO_SNDBUF: { u_int32_t hiwat = so->so_snd.sb_hiwat; if (so->so_snd.sb_flags & SB_UNIX) { struct unpcb *unp = (struct unpcb *)(so->so_pcb); if (unp != NULL && unp->unp_conn != NULL) { hiwat += unp->unp_conn->unp_cc; } } optval = hiwat; goto integer; } case SO_RCVBUF: optval = so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = (sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); error = sooptcopyout_timeval(sopt, &tv); break; case SO_NOSIGPIPE: optval = (so->so_flags & SOF_NOSIGPIPE); goto integer; case SO_NOADDRERR: optval = (so->so_flags & SOF_NOADDRAVAIL); goto integer; case SO_REUSESHAREUID: optval = (so->so_flags & SOF_REUSESHAREUID); goto integer; case SO_NOTIFYCONFLICT: optval = (so->so_flags & SOF_NOTIFYCONFLICT); goto integer; case SO_RESTRICTIONS: optval = so_get_restrictions(so); goto integer; case SO_AWDL_UNRESTRICTED: if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { optval = inp_get_awdl_unrestricted( sotoinpcb(so)); goto integer; } else { error = EOPNOTSUPP; } break; case SO_INTCOPROC_ALLOW: if (SOCK_DOM(so) == PF_INET6) { optval = inp_get_intcoproc_allowed( sotoinpcb(so)); goto integer; } else { error = EOPNOTSUPP; } break; case SO_LABEL: error = EOPNOTSUPP; break; case SO_PEERLABEL: error = EOPNOTSUPP; break; #ifdef __APPLE_API_PRIVATE case SO_UPCALLCLOSEWAIT: optval = (so->so_flags & SOF_UPCALLCLOSEWAIT); goto integer; #endif case SO_RANDOMPORT: optval = (so->so_flags & SOF_BINDRANDOMPORT); goto integer; case SO_NP_EXTENSIONS: { struct so_np_extensions sonpx = {}; sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? 
SONPX_SETOPTSHUT : 0; sonpx.npx_mask = SONPX_MASK_VALID; error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions)); break; } case SO_TRAFFIC_CLASS: optval = so->so_traffic_class; goto integer; case SO_RECV_TRAFFIC_CLASS: optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS); goto integer; #if (DEVELOPMENT || DEBUG) case SO_TRAFFIC_CLASS_DBG: error = sogetopt_tcdbg(so, sopt); break; #endif /* (DEVELOPMENT || DEBUG) */ case SO_PRIVILEGED_TRAFFIC_CLASS: optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS); goto integer; case SO_DEFUNCTOK: optval = !(so->so_flags & SOF_NODEFUNCT); goto integer; case SO_ISDEFUNCT: optval = (so->so_flags & SOF_DEFUNCT); goto integer; case SO_OPPORTUNISTIC: optval = so_get_opportunistic(so); goto integer; case SO_FLUSH: /* This option is not gettable */ error = EINVAL; break; case SO_RECV_ANYIF: optval = so_get_recv_anyif(so); goto integer; case SO_TRAFFIC_MGT_BACKGROUND: /* This option is handled by lower layer(s) */ if (so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { (void) so->so_proto->pr_ctloutput(so, sopt); } break; #if FLOW_DIVERT case SO_FLOW_DIVERT_TOKEN: error = flow_divert_token_get(so, sopt); break; #endif /* FLOW_DIVERT */ #if NECP case SO_NECP_ATTRIBUTES: if (SOCK_DOM(so) == PF_MULTIPATH) { /* Handled by MPTCP itself */ break; } if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } error = necp_get_socket_attributes(&sotoinpcb(so)->inp_necp_attributes, sopt); break; case SO_NECP_CLIENTUUID: { uuid_t *ncu; if (SOCK_DOM(so) == PF_MULTIPATH) { ncu = &mpsotomppcb(so)->necp_client_uuid; } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { ncu = &sotoinpcb(so)->necp_client_uuid; } else { error = EINVAL; goto out; } error = sooptcopyout(sopt, ncu, sizeof(uuid_t)); break; } case SO_NECP_LISTENUUID: { uuid_t *nlu; if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) { nlu = &sotoinpcb(so)->necp_client_uuid; } else { error = ENOENT; goto out; } } else { error = EINVAL; goto out; } error = sooptcopyout(sopt, nlu, sizeof(uuid_t)); break; } case SO_RESOLVER_SIGNATURE: { if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt); break; } #endif /* NECP */ #if CONTENT_FILTER case SO_CFIL_SOCK_ID: { cfil_sock_id_t sock_id; sock_id = cfil_sock_id_from_socket(so); error = sooptcopyout(sopt, &sock_id, sizeof(cfil_sock_id_t)); break; } #endif /* CONTENT_FILTER */ case SO_EXTENDED_BK_IDLE: optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED); goto integer; case SO_MARK_CELLFALLBACK: optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0) ? 1 : 0; goto integer; case SO_FALLBACK_MODE: optval = so->so_fallback_mode; goto integer; case SO_MARK_KNOWN_TRACKER: { optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0) ? 1 : 0; goto integer; } case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: { optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0) ? 1 : 0; goto integer; } case SO_MARK_APPROVED_APP_DOMAIN: { optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0) ? 
1 : 0; goto integer; } case SO_NET_SERVICE_TYPE: { if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) { optval = so->so_netsvctype; } else { optval = NET_SERVICE_TYPE_BE; } goto integer; } case SO_NETSVC_MARKING_LEVEL: optval = so_get_netsvc_marking_level(so); goto integer; case SO_MPKL_SEND_INFO: { struct so_mpkl_send_info so_mpkl_send_info; uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid); so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto; error = sooptcopyout(sopt, &so_mpkl_send_info, sizeof(struct so_mpkl_send_info)); break; } case SO_MARK_WAKE_PKT: optval = (so->so_flags & SOF_MARK_WAKE_PKT); goto integer; case SO_RECV_WAKE_PKT: optval = (so->so_flags & SOF_RECV_WAKE_PKT); goto integer; case SO_APPLICATION_ID: { if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { error = EINVAL; goto out; } so_application_id_t application_id = { 0 }; application_id.uid = kauth_cred_getuid(so->so_cred); uuid_copy(application_id.effective_uuid, !uuid_is_null(so->e_uuid) ? so->e_uuid : so->last_uuid); application_id.persona_id = so->so_persona_id; error = sooptcopyout(sopt, &application_id, sizeof(so_application_id_t)); break; } default: error = ENOPROTOOPT; break; } } out: if (dolock) { socket_unlock(so, 1); } return error; } /* * The size limits on our soopt_getm is different from that on FreeBSD. * We limit the size of options to MCLBYTES. This will have to change * if we need to define options that need more space than MCLBYTES. */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = (int)sopt->sopt_valsize; int how; if (sopt_size <= 0 || sopt_size > MCLBYTES) { return EMSGSIZE; } how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT; MGET(m, how, MT_DATA); if (m == NULL) { return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, how); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size > 0) { MGET(m, how, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, how); if ((m->m_flags & M_EXT) == 0) { m_freem(*mp); m_freem(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return 0; } /* copyin sopt data into mbuf chain */ int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == USER_ADDR_NULL) { return 0; } while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_p != kernproc) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return error; } } else { bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), mtod(m, char *), m->m_len); } sopt->sopt_valsize -= m->m_len; sopt->sopt_val += m->m_len; m = m->m_next; } /* should be allocated enoughly at ip6_sooptmcopyin() */ if (m != NULL) { panic("soopt_mcopyin"); /* NOTREACHED */ } return 0; } /* copyout mbuf chain data into soopt */ int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == USER_ADDR_NULL) { return 0; } while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_p != kernproc) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return error; } } else { bcopy(mtod(m, char *), CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len); } sopt->sopt_valsize -= m->m_len; sopt->sopt_val += 
m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return EINVAL; } sopt->sopt_valsize = valsize; return 0; } void sohasoutofband(struct socket *so) { if (so->so_pgid < 0) { gsignal(-so->so_pgid, SIGURG); } else if (so->so_pgid > 0) { proc_signal(so->so_pgid, SIGURG); } selwakeup(&so->so_rcv.sb_sel); if (so->so_rcv.sb_flags & SB_KNOTE) { KNOTE(&so->so_rcv.sb_sel.si_note, (NOTE_OOB | SO_FILT_HINT_LOCKED)); } } int sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql) { #pragma unused(cred) struct proc *p = current_proc(); int revents = 0; socket_lock(so, 1); so_update_last_owner_locked(so, PROC_NULL); so_update_policy(so); if (events & (POLLIN | POLLRDNORM)) { if (soreadable(so)) { revents |= events & (POLLIN | POLLRDNORM); } } if (events & (POLLOUT | POLLWRNORM)) { if (sowriteable(so)) { revents |= events & (POLLOUT | POLLWRNORM); } } if (events & (POLLPRI | POLLRDBAND)) { if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { revents |= events & (POLLPRI | POLLRDBAND); } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { /* * Darwin sets the flag first, * BSD calls selrecord first */ so->so_rcv.sb_flags |= SB_SEL; selrecord(p, &so->so_rcv.sb_sel, wql); } if (events & (POLLOUT | POLLWRNORM)) { /* * Darwin sets the flag first, * BSD calls selrecord first */ so->so_snd.sb_flags |= SB_SEL; selrecord(p, &so->so_snd.sb_sel, wql); } } socket_unlock(so, 1); return revents; } int soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(fp); int result; socket_lock(so, 1); so_update_last_owner_locked(so, PROC_NULL); so_update_policy(so); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_filtid = EVFILTID_SOREAD; break; case EVFILT_WRITE: kn->kn_filtid = EVFILTID_SOWRITE; break; case EVFILT_SOCK: kn->kn_filtid = EVFILTID_SCK; break; case EVFILT_EXCEPT: kn->kn_filtid = EVFILTID_SOEXCEPT; break; default: socket_unlock(so, 1); knote_set_error(kn, EINVAL); return 0; } /* * call the appropriate sub-filter attach * with the socket still locked */ result = knote_fops(kn)->f_attach(kn, kev); socket_unlock(so, 1); return result; } static int filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so) { int retval = 0; int64_t data = 0; if (so->so_options & SO_ACCEPTCONN) { /* * Radar 6615193 handle the listen case dynamically * for kqueue read filter. This allows to call listen() * after registering the kqueue EVFILT_READ. */ retval = !TAILQ_EMPTY(&so->so_comp); data = so->so_qlen; goto out; } /* socket isn't a listener */ /* * NOTE_LOWAT specifies new low water mark in data, i.e. * the bytes of protocol data. We therefore exclude any * control bytes. */ data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; if (kn->kn_sfflags & NOTE_OOB) { if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { kn->kn_fflags |= NOTE_OOB; data -= so->so_oobmark; retval = 1; goto out; } } if ((so->so_state & SS_CANTRCVMORE) #if CONTENT_FILTER && cfil_sock_data_pending(&so->so_rcv) == 0 #endif /* CONTENT_FILTER */ ) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; retval = 1; goto out; } if (so->so_error) { /* temporary udp error */ retval = 1; goto out; } int64_t lowwat = so->so_rcv.sb_lowat; /* * Ensure that when NOTE_LOWAT is used, the derived * low water mark is bounded by socket's rcv buf's * high and low water mark values. 
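 *
 * Illustrative userland request (assumed, not part of this file): a
 * kqueue client can ask for a 128-byte read low-water mark with
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *
 * and the clamping below keeps the effective mark from exceeding
 * sb_hiwat or dropping below the socket's own sb_lowat.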
*/ if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_sdata > so->so_rcv.sb_hiwat) { lowwat = so->so_rcv.sb_hiwat; } else if (kn->kn_sdata > lowwat) { lowwat = kn->kn_sdata; } } /* * While the `data` field is the amount of data to read, * 0-sized packets need to wake up the kqueue, see 58140856, * so we need to take control bytes into account too. */ retval = (so->so_rcv.sb_cc >= lowwat); out: if (retval && kev) { knote_fill_kevent(kn, kev, data); } return retval; } static int filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); /* socket locked */ /* * If the caller explicitly asked for OOB results (e.g. poll()) * from EVFILT_READ, then save that off in the hookid field * and reserve the kn_flags EV_OOBAND bit for output only. */ if (kn->kn_filter == EVFILT_READ && kn->kn_flags & EV_OOBAND) { kn->kn_flags &= ~EV_OOBAND; kn->kn_hook32 = EV_OOBAND; } else { kn->kn_hook32 = 0; } if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) { so->so_rcv.sb_flags |= SB_KNOTE; } /* indicate if event is already fired */ return filt_soread_common(kn, NULL, so); } static void filt_sordetach(struct knote *kn) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); socket_lock(so, 1); if (so->so_rcv.sb_flags & SB_KNOTE) { if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) { so->so_rcv.sb_flags &= ~SB_KNOTE; } } socket_unlock(so, 1); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int retval; if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_lock(so, 1); } retval = filt_soread_common(kn, NULL, so); if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_unlock(so, 1); } return retval; } static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int retval; socket_lock(so, 1); /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; /* determine if changes result in fired events */ retval = filt_soread_common(kn, NULL, so); socket_unlock(so, 1); return retval; } static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int retval; socket_lock(so, 1); retval = filt_soread_common(kn, kev, so); socket_unlock(so, 1); return retval; } int so_wait_for_if_feedback(struct socket *so) { if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) && (so->so_state & SS_ISCONNECTED)) { struct inpcb *inp = sotoinpcb(so); if (INP_WAIT_FOR_IF_FEEDBACK(inp)) { return 1; } } return 0; } static int filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so) { int ret = 0; int64_t data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; ret = 1; goto out; } if (so->so_error) { /* temporary udp error */ ret = 1; goto out; } if (!socanwrite(so)) { ret = 0; goto out; } if (so->so_flags1 & SOF1_PRECONNECT_DATA) { ret = 1; goto out; } int64_t lowwat = so->so_snd.sb_lowat; const int64_t hiwat = so->so_snd.sb_hiwat; /* * Deal with connected UNIX domain sockets which * rely on the fact that the sender's socket buffer is * actually the receiver's socket buffer. 
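 *
 * Worked example (numbers illustrative): if sbspace() on this socket's
 * send buffer reports 8192 bytes but the peer's receive buffer has only
 * 512 bytes left, the MIN() below reports 512, so the filter does not
 * claim more writable space than the peer can actually absorb.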
*/ if (SOCK_DOM(so) == PF_LOCAL) { struct unpcb *unp = sotounpcb(so); if (unp != NULL && unp->unp_conn != NULL && unp->unp_conn->unp_socket != NULL) { struct socket *so2 = unp->unp_conn->unp_socket; /* * At this point we know that `so' is locked * and that `unp_conn` isn't going to change. * However, we don't lock `so2` because doing so * may require unlocking `so' * (see unp_get_locks_in_order()). * * Two cases can happen: * * 1) we return 1 and tell the application that * it can write. Meanwhile, another thread * fills up the socket buffer. This will either * lead to a blocking send or EWOULDBLOCK * which the application should deal with. * 2) we return 0 and tell the application that * the socket is not writable. Meanwhile, * another thread depletes the receive socket * buffer. In this case the application will * be woken up by sb_notify(). * * MIN() is required because otherwise sosendcheck() * may return EWOULDBLOCK since it only considers * so->so_snd. */ data = MIN(data, sbspace(&so2->so_rcv)); } } if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_sdata > hiwat) { lowwat = hiwat; } else if (kn->kn_sdata > lowwat) { lowwat = kn->kn_sdata; } } if (data > 0 && data >= lowwat) { if ((so->so_flags & SOF_NOTSENT_LOWAT) #if (DEBUG || DEVELOPMENT) && so_notsent_lowat_check == 1 #endif /* DEBUG || DEVELOPMENT */ ) { if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) && so->so_type == SOCK_STREAM) { ret = tcp_notsent_lowat_check(so); } #if MPTCP else if ((SOCK_DOM(so) == PF_MULTIPATH) && (SOCK_PROTO(so) == IPPROTO_TCP)) { ret = mptcp_notsent_lowat_check(so); } #endif else { ret = 1; goto out; } } else { ret = 1; } } if (so_wait_for_if_feedback(so)) { ret = 0; } out: if (ret && kev) { knote_fill_kevent(kn, kev, data); } return ret; } static int filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); /* socket locked */ if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) { so->so_snd.sb_flags |= SB_KNOTE; } /* determine if its already fired */ return filt_sowrite_common(kn, NULL, so); } static void filt_sowdetach(struct knote *kn) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); socket_lock(so, 1); if (so->so_snd.sb_flags & SB_KNOTE) { if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) { so->so_snd.sb_flags &= ~SB_KNOTE; } } socket_unlock(so, 1); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int ret; if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_lock(so, 1); } ret = filt_sowrite_common(kn, NULL, so); if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_unlock(so, 1); } return ret; } static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int ret; socket_lock(so, 1); /*save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; /* determine if these changes result in a triggered event */ ret = filt_sowrite_common(kn, NULL, so); socket_unlock(so, 1); return ret; } static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int ret; socket_lock(so, 1); ret = filt_sowrite_common(kn, kev, so); socket_unlock(so, 1); return ret; } static int filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so, long ev_hint) { int ret = 0; int64_t data = 0; uint32_t level_trigger = 0; if (ev_hint & SO_FILT_HINT_CONNRESET) { kn->kn_fflags 
|= NOTE_CONNRESET; } if (ev_hint & SO_FILT_HINT_TIMEOUT) { kn->kn_fflags |= NOTE_TIMEOUT; } if (ev_hint & SO_FILT_HINT_NOSRCADDR) { kn->kn_fflags |= NOTE_NOSRCADDR; } if (ev_hint & SO_FILT_HINT_IFDENIED) { kn->kn_fflags |= NOTE_IFDENIED; } if (ev_hint & SO_FILT_HINT_KEEPALIVE) { kn->kn_fflags |= NOTE_KEEPALIVE; } if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) { kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO; } if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) { kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO; } if ((ev_hint & SO_FILT_HINT_CONNECTED) || (so->so_state & SS_ISCONNECTED)) { kn->kn_fflags |= NOTE_CONNECTED; level_trigger |= NOTE_CONNECTED; } if ((ev_hint & SO_FILT_HINT_DISCONNECTED) || (so->so_state & SS_ISDISCONNECTED)) { kn->kn_fflags |= NOTE_DISCONNECTED; level_trigger |= NOTE_DISCONNECTED; } if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) { if (so->so_proto != NULL && (so->so_proto->pr_flags & PR_EVCONNINFO)) { kn->kn_fflags |= NOTE_CONNINFO_UPDATED; } } if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) || tcp_notify_ack_active(so)) { kn->kn_fflags |= NOTE_NOTIFY_ACK; } if (ev_hint & SO_FILT_HINT_WAKE_PKT) { kn->kn_fflags |= NOTE_WAKE_PKT; } if ((so->so_state & SS_CANTRCVMORE) #if CONTENT_FILTER && cfil_sock_data_pending(&so->so_rcv) == 0 #endif /* CONTENT_FILTER */ ) { kn->kn_fflags |= NOTE_READCLOSED; level_trigger |= NOTE_READCLOSED; } if (so->so_state & SS_CANTSENDMORE) { kn->kn_fflags |= NOTE_WRITECLOSED; level_trigger |= NOTE_WRITECLOSED; } if ((ev_hint & SO_FILT_HINT_SUSPEND) || (so->so_flags & SOF_SUSPENDED)) { kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME); /* If resume event was delivered before, reset it */ kn->kn_hook32 &= ~NOTE_RESUME; kn->kn_fflags |= NOTE_SUSPEND; level_trigger |= NOTE_SUSPEND; } if ((ev_hint & SO_FILT_HINT_RESUME) || (so->so_flags & SOF_SUSPENDED) == 0) { kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME); /* If suspend event was delivered before, reset it */ kn->kn_hook32 &= ~NOTE_SUSPEND; kn->kn_fflags |= NOTE_RESUME; level_trigger |= NOTE_RESUME; } if (so->so_error != 0) { ret = 1; data = so->so_error; kn->kn_flags |= EV_EOF; } else { u_int32_t data32 = 0; get_sockev_state(so, &data32); data = data32; } /* Reset any events that are not requested on this knote */ kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK); level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK); /* Find the level triggered events that are already delivered */ level_trigger &= kn->kn_hook32; level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK; /* Do not deliver level triggered events more than once */ if ((kn->kn_fflags & ~level_trigger) != 0) { ret = 1; } if (ret && kev) { /* * Store the state of the events being delivered. This * state can be used to deliver level triggered events * at least once and still avoid waking up the application * multiple times as long as the event is active.
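 *
 * Concretely: once NOTE_CONNECTED has been delivered for a connected
 * socket, that bit is remembered in kn_hook32 and masked out of later
 * scans, so the application is not woken again for a condition it has
 * already been told about; the remembered bit is only reset when the
 * knote's interest set changes (see filt_socktouch).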
*/ if (kn->kn_fflags != 0) { kn->kn_hook32 |= (kn->kn_fflags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); } /* * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver * only one of them and remember the last one that was * delivered last */ if (kn->kn_fflags & NOTE_SUSPEND) { kn->kn_hook32 &= ~NOTE_RESUME; } if (kn->kn_fflags & NOTE_RESUME) { kn->kn_hook32 &= ~NOTE_SUSPEND; } knote_fill_kevent(kn, kev, data); } return ret; } static int filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); /* socket locked */ kn->kn_hook32 = 0; if (KNOTE_ATTACH(&so->so_klist, kn)) { so->so_flags |= SOF_KNOTE; } /* determine if event already fired */ return filt_sockev_common(kn, NULL, so, 0); } static void filt_sockdetach(struct knote *kn) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); socket_lock(so, 1); if ((so->so_flags & SOF_KNOTE) != 0) { if (KNOTE_DETACH(&so->so_klist, kn)) { so->so_flags &= ~SOF_KNOTE; } } socket_unlock(so, 1); } static int filt_sockev(struct knote *kn, long hint) { int ret = 0, locked = 0; struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); long ev_hint = (hint & SO_FILT_HINT_EV); if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_lock(so, 1); locked = 1; } ret = filt_sockev_common(kn, NULL, so, ev_hint); if (locked) { socket_unlock(so, 1); } return ret; } /* * filt_socktouch - update event state */ static int filt_socktouch( struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); uint32_t changed_flags; int ret; socket_lock(so, 1); /* save off the [result] data and fflags */ changed_flags = (kn->kn_sfflags ^ kn->kn_hook32); /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; /* restrict the current results to the (smaller?) set of new interest */ /* * For compatibility with previous implementations, we leave kn_fflags * as they were before. */ //kn->kn_fflags &= kev->fflags; /* * Since we keep track of events that are already * delivered, if any of those events are not requested * anymore the state related to them can be reset */ kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); /* determine if we have events to deliver */ ret = filt_sockev_common(kn, NULL, so, 0); socket_unlock(so, 1); return ret; } /* * filt_sockprocess - query event fired state and return data */ static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev) { struct socket *so = (struct socket *)fp_get_data(kn->kn_fp); int ret = 0; socket_lock(so, 1); ret = filt_sockev_common(kn, kev, so, 0); socket_unlock(so, 1); return ret; } void get_sockev_state(struct socket *so, u_int32_t *statep) { u_int32_t state = *(statep); /* * If the state variable is already used by a previous event, * reset it. */ if (state != 0) { return; } if (so->so_state & SS_ISCONNECTED) { state |= SOCKEV_CONNECTED; } else { state &= ~(SOCKEV_CONNECTED); } state |= ((so->so_state & SS_ISDISCONNECTED) ? 
SOCKEV_DISCONNECTED : 0); *(statep) = state; } #define SO_LOCK_HISTORY_STR_LEN \ (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1) __private_extern__ const char * solockhistory_nr(struct socket *so) { size_t n = 0; int i; static char lock_history_str[SO_LOCK_HISTORY_STR_LEN]; bzero(lock_history_str, sizeof(lock_history_str)); for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) { n += scnprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ", so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX], so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]); } return lock_history_str; } lck_mtx_t * socket_getlock(struct socket *so, int flags) { if (so->so_proto->pr_getlock != NULL) { return (*so->so_proto->pr_getlock)(so, flags); } else { return so->so_proto->pr_domain->dom_mtx; } } void socket_lock(struct socket *so, int refcount) { void *lr_saved; lr_saved = __builtin_return_address(0); if (so->so_proto->pr_lock) { (*so->so_proto->pr_lock)(so, refcount, lr_saved); } else { #ifdef MORE_LOCKING_DEBUG LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx, LCK_MTX_ASSERT_NOTOWNED); #endif lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); if (refcount) { so->so_usecount++; } so->lock_lr[so->next_lock_lr] = lr_saved; so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX; } } void socket_lock_assert_owned(struct socket *so) { lck_mtx_t *mutex_held; if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, 0); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); } int socket_try_lock(struct socket *so) { lck_mtx_t *mtx; if (so->so_proto->pr_getlock != NULL) { mtx = (*so->so_proto->pr_getlock)(so, 0); } else { mtx = so->so_proto->pr_domain->dom_mtx; } return lck_mtx_try_lock(mtx); } void socket_unlock(struct socket *so, int refcount) { void *lr_saved; lck_mtx_t *mutex_held; lr_saved = __builtin_return_address(0); if (so == NULL || so->so_proto == NULL) { panic("%s: null so_proto so=%p", __func__, so); /* NOTREACHED */ } if (so->so_proto->pr_unlock) { (*so->so_proto->pr_unlock)(so, refcount, lr_saved); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; #ifdef MORE_LOCKING_DEBUG LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); #endif so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX; if (refcount) { if (so->so_usecount <= 0) { panic("%s: bad refcount=%d so=%p (%d, %d, %d) " "lrh=%s", __func__, so->so_usecount, so, SOCK_DOM(so), so->so_type, SOCK_PROTO(so), solockhistory_nr(so)); /* NOTREACHED */ } so->so_usecount--; if (so->so_usecount == 0) { sofreelastref(so, 1); } } lck_mtx_unlock(mutex_held); } } /* Called with socket locked, will unlock socket */ void sofree(struct socket *so) { lck_mtx_t *mutex_held; if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, 0); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); sofreelastref(so, 0); } void soreference(struct socket *so) { socket_lock(so, 1); /* locks & take one reference on socket */ socket_unlock(so, 0); /* unlock only */ } void sodereference(struct socket *so) { socket_lock(so, 0); socket_unlock(so, 1); } /* * Set or clear SOF_MULTIPAGES on the socket to enable or disable the * possibility of using jumbo clusters. Caller must ensure to hold * the socket lock. 
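 *
 * Illustrative caller pattern (assumed, not taken from this file):
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);		// interface can take jumbo clusters
 *	socket_unlock(so, 1);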
*/ void somultipages(struct socket *so, boolean_t set) { if (set) { so->so_flags |= SOF_MULTIPAGES; } else { so->so_flags &= ~SOF_MULTIPAGES; } } void soif2kcl(struct socket *so, boolean_t set) { if (set) { so->so_flags1 |= SOF1_IF_2KCL; } else { so->so_flags1 &= ~SOF1_IF_2KCL; } } int so_isdstlocal(struct socket *so) { struct inpcb *inp = (struct inpcb *)so->so_pcb; if (SOCK_DOM(so) == PF_INET) { return inaddr_local(inp->inp_faddr); } else if (SOCK_DOM(so) == PF_INET6) { return in6addr_local(&inp->in6p_faddr); } return 0; } int sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) { struct sockbuf *rcv, *snd; int err = 0, defunct; rcv = &so->so_rcv; snd = &so->so_snd; defunct = (so->so_flags & SOF_DEFUNCT); if (defunct) { if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) { panic("%s: SB_DROP not set", __func__); /* NOTREACHED */ } goto done; } if (so->so_flags & SOF_NODEFUNCT) { if (noforce) { err = EOPNOTSUPP; if (p != PROC_NULL) { SODEFUNCTLOG("%s[%d, %s]: (target pid %d " "name %s level %d) so 0x%llu [%d,%d] " "is not eligible for defunct " "(%d)\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), err); } return err; } so->so_flags &= ~SOF_NODEFUNCT; if (p != PROC_NULL) { SODEFUNCTLOG("%s[%d, %s]: (target pid %d " "name %s level %d) so 0x%llu [%d,%d] " "defunct by force " "(%d)\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), err); } } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) { struct inpcb *inp = (struct inpcb *)so->so_pcb; struct ifnet *ifp = inp->inp_last_outifp; if (ifp && IFNET_IS_CELLULAR(ifp)) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell); } else if (so->so_flags & SOF_DELEGATED) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd); } else if (soextbkidlestat.so_xbkidle_time == 0) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime); } else if (noforce && p != PROC_NULL) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active); so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG; so->so_extended_bk_start = net_uptime(); OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag); inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY); err = EOPNOTSUPP; SODEFUNCTLOG("%s[%d, %s]: (target pid %d " "name %s level %d) so 0x%llu [%d,%d] " "extend bk idle " "(%d)\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), err); return err; } else { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced); } } so->so_flags |= SOF_DEFUNCT; /* Prevent further data from being appended to the socket buffers */ snd->sb_flags |= SB_DROP; rcv->sb_flags |= SB_DROP; /* Flush any existing data in the socket buffers */ if (rcv->sb_cc != 0) { rcv->sb_flags &= ~SB_SEL; selthreadclear(&rcv->sb_sel); sbrelease(rcv); } if (snd->sb_cc != 0) { snd->sb_flags &= ~SB_SEL; selthreadclear(&snd->sb_sel); sbrelease(snd); } done: if (p != PROC_NULL) { SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " "so 0x%llu [%d,%d] %s defunct%s\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), defunct ? "is already" : "marked as", (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? 
" extbkidle" : ""); } return err; } int sodefunct(struct proc *p, struct socket *so, int level) { struct sockbuf *rcv, *snd; if (!(so->so_flags & SOF_DEFUNCT)) { panic("%s improperly called", __func__); /* NOTREACHED */ } if (so->so_state & SS_DEFUNCT) { goto done; } rcv = &so->so_rcv; snd = &so->so_snd; if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { char s[MAX_IPv6_STR_LEN]; char d[MAX_IPv6_STR_LEN]; struct inpcb *inp = sotoinpcb(so); if (p != PROC_NULL) { SODEFUNCTLOG( "%s[%d, %s]: (target pid %d name %s level %d) " "so 0x%llu [%s %s:%d -> %s:%d] is now defunct " "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, " " snd_fl 0x%x]\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, so->so_gencnt, (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr), s, sizeof(s)), ntohs(inp->in6p_lport), inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ? (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr, d, sizeof(d)), ntohs(inp->in6p_fport), (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags, snd->sb_flags); } } else if (p != PROC_NULL) { SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, " "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags, snd->sb_flags); } /* * First tell the protocol the flow is defunct */ (void) (*so->so_proto->pr_usrreqs->pru_defunct)(so); /* * Unwedge threads blocked on sbwait() and sb_lock(). */ sbwakeup(rcv); sbwakeup(snd); so->so_flags1 |= SOF1_DEFUNCTINPROG; if (rcv->sb_flags & SB_LOCK) { sbunlock(rcv, TRUE); /* keep socket locked */ } if (snd->sb_flags & SB_LOCK) { sbunlock(snd, TRUE); /* keep socket locked */ } /* * Flush the buffers and disconnect. We explicitly call shutdown * on both data directions to ensure that SS_CANT{RCV,SEND}MORE * states are set for the socket. This would also flush out data * hanging off the receive list of this socket. */ (void) soshutdownlock_final(so, SHUT_RD); (void) soshutdownlock_final(so, SHUT_WR); (void) sodisconnectlocked(so); /* * Explicitly handle connectionless-protocol disconnection * and release any remaining data in the socket buffers. 
 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}

int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		struct fileproc *fp;
		int count = 0;

		/*
		 * Unlock socket to avoid lock ordering issue with
		 * the proc fd table lock
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			struct socket *so2;

			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}
			so2 = (struct socket *)fp_get_data(fp);
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
				count++;
			}
			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
				break;
			}
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return error;
}

static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);

	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * Return value tells whether the socket is still in extended background idle
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt, SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp_get_data(fp);
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
#if SKYWALK
		inp_update_netns_flags(so);
#endif /* SKYWALK */
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, an SO_RESTRICT_DENY_CELLULAR
	 * restriction issued on the socket has a higher precedence than
	 * INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
*/ nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR); noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE); noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED); so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN | SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED)); nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR); noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE); noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED); /* we can only set, not clear restrictions */ if ((nocell_new - nocell_old) == 0 && (noexpensive_new - noexpensive_old) == 0 && (noconstrained_new - noconstrained_old) == 0) { return 0; } if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { if (nocell_new - nocell_old != 0) { /* * if deny cellular is now set, do what's needed * for INPCB */ inp_set_nocellular(sotoinpcb(so)); } if (noexpensive_new - noexpensive_old != 0) { inp_set_noexpensive(sotoinpcb(so)); } if (noconstrained_new - noconstrained_old != 0) { inp_set_noconstrained(sotoinpcb(so)); } } if (SOCK_DOM(so) == PF_MULTIPATH) { mptcp_set_restrictions(so); } return 0; } uint32_t so_get_restrictions(struct socket *so) { return so->so_restrictions & (SO_RESTRICT_DENY_IN | SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE); } int so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred) { struct proc *ep = PROC_NULL; int error = 0; /* pid 0 is reserved for kernel */ if (epid == 0) { error = EINVAL; goto done; } /* * If this is an in-kernel socket, prevent its delegate * association from changing unless the socket option is * coming from within the kernel itself. */ if (so->last_pid == 0 && p != kernproc) { error = EACCES; goto done; } /* * If this is issued by a process that's recorded as the * real owner of the socket, or if the pid is the same as * the process's own pid, then proceed. Otherwise ensure * that the issuing process has the necessary privileges. */ if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) { if ((error = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) { error = EACCES; goto done; } } /* Find the process that corresponds to the effective pid */ if ((ep = proc_find(epid)) == PROC_NULL) { error = ESRCH; goto done; } /* * If a process tries to delegate the socket to itself, then * there's really nothing to do; treat it as a way for the * delegate association to be cleared. Note that we check * the passed-in proc rather than calling proc_selfpid(), * as we need to check the process issuing the socket option * which could be kernproc. Given that we don't allow 0 for * effective pid, it means that a delegated in-kernel socket * stays delegated during its lifetime (which is probably OK.) 
*/ if (epid == proc_pid(p)) { so->so_flags &= ~SOF_DELEGATED; so->e_upid = 0; so->e_pid = 0; uuid_clear(so->e_uuid); } else { so->so_flags |= SOF_DELEGATED; so->e_upid = proc_uniqueid(ep); so->e_pid = proc_pid(ep); proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); #if defined(XNU_TARGET_OS_OSX) if (ep->p_responsible_pid != so->e_pid) { proc_t rp = proc_find(ep->p_responsible_pid); if (rp != PROC_NULL) { proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid)); so->so_rpid = ep->p_responsible_pid; proc_rele(rp); } else { uuid_clear(so->so_ruuid); so->so_rpid = -1; } } #endif } if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { (*so->so_proto->pr_update_last_owner)(so, NULL, ep); } done: if (error == 0 && net_io_policy_log) { uuid_string_t buf; uuid_unparse(so->e_uuid, buf); log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) " "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf, ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : "")); } else if (error != 0 && net_io_policy_log) { log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) " "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" : proc_name_address(ep), error); } /* Update this socket's policy upon success */ if (error == 0) { so->so_policy_gencnt *= -1; so_update_policy(so); #if NECP so_update_necp_policy(so, NULL, NULL); #endif /* NECP */ } if (ep != PROC_NULL) { proc_rele(ep); } return error; } int so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred) { uuid_string_t buf; uuid_t uuid; int error = 0; /* UUID must not be all-zeroes (reserved for kernel) */ if (uuid_is_null(euuid)) { error = EINVAL; goto done; } /* * If this is an in-kernel socket, prevent its delegate * association from changing unless the socket option is * coming from within the kernel itself. */ if (so->last_pid == 0 && p != kernproc) { error = EACCES; goto done; } /* Get the UUID of the issuing process */ proc_getexecutableuuid(p, uuid, sizeof(uuid)); /* * If this is issued by a process that's recorded as the * real owner of the socket, or if the uuid is the same as * the process's own uuid, then proceed. Otherwise ensure * that the issuing process has the necessary privileges. */ if (check_cred && (uuid_compare(euuid, so->last_uuid) != 0 || uuid_compare(euuid, uuid) != 0)) { if ((error = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) { error = EACCES; goto done; } } /* * If a process tries to delegate the socket to itself, then * there's really nothing to do; treat it as a way for the * delegate association to be cleared. Note that we check * the uuid of the passed-in proc rather than that of the * current process, as we need to check the process issuing * the socket option which could be kernproc itself. Given * that we don't allow 0 for effective uuid, it means that * a delegated in-kernel socket stays delegated during its * lifetime (which is okay.) */ if (uuid_compare(euuid, uuid) == 0) { so->so_flags &= ~SOF_DELEGATED; so->e_upid = 0; so->e_pid = 0; uuid_clear(so->e_uuid); } else { so->so_flags |= SOF_DELEGATED; /* * Unlike so_set_effective_pid(), we only have the UUID * here and the process ID is not known. Inherit the * real {pid,upid} of the socket. 
*/ so->e_upid = so->last_upid; so->e_pid = so->last_pid; uuid_copy(so->e_uuid, euuid); } /* * The following will clear the effective process name as it's the same * as the real process */ if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { (*so->so_proto->pr_update_last_owner)(so, NULL, NULL); } done: if (error == 0 && net_io_policy_log) { uuid_unparse(so->e_uuid, buf); log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d " "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), so->e_pid, buf, ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : "")); } else if (error != 0 && net_io_policy_log) { uuid_unparse(euuid, buf); log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s " "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), buf, error); } /* Update this socket's policy upon success */ if (error == 0) { so->so_policy_gencnt *= -1; so_update_policy(so); #if NECP so_update_necp_policy(so, NULL, NULL); #endif /* NECP */ } return error; } void netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data, uint32_t ev_datalen) { struct kev_msg ev_msg; /* * A netpolicy event always starts with a netpolicy_event_data * structure, but the caller can provide for a longer event * structure to post, depending on the event code. */ VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data)); bzero(&ev_msg, sizeof(ev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS; ev_msg.event_code = ev_code; ev_msg.dv[0].data_ptr = ev_data; ev_msg.dv[0].data_length = ev_datalen; kev_post_msg(&ev_msg); } void socket_post_kev_msg(uint32_t ev_code, struct kev_socket_event_data *ev_data, uint32_t ev_datalen) { struct kev_msg ev_msg; bzero(&ev_msg, sizeof(ev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS; ev_msg.event_code = ev_code; ev_msg.dv[0].data_ptr = ev_data; ev_msg.dv[0].data_length = ev_datalen; kev_post_msg(&ev_msg); } void socket_post_kev_msg_closed(struct socket *so) { struct kev_socket_closed ev = {}; struct sockaddr *socksa = NULL, *peersa = NULL; int err; if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) { return; } err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa); if (err == 0) { err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &peersa); if (err == 0) { memcpy(&ev.ev_data.kev_sockname, socksa, min(socksa->sa_len, sizeof(ev.ev_data.kev_sockname))); memcpy(&ev.ev_data.kev_peername, peersa, min(peersa->sa_len, sizeof(ev.ev_data.kev_peername))); socket_post_kev_msg(KEV_SOCKET_CLOSED, &ev.ev_data, sizeof(ev)); } } free_sockaddr(socksa); free_sockaddr(peersa); } __attribute__((noinline, cold, not_tail_called, noreturn)) __private_extern__ int assfail(const char *a, const char *f, int l) { panic("assertion failed: %s, file: %s, line: %d", a, f, l); /* NOTREACHED */ __builtin_unreachable(); }
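
/*
 * Minimal illustrative sketch (not part of the original code) of how the
 * return value of so_check_extended_bk_idle_time() is meant to be read:
 * nonzero means the socket is still within its extended background idle
 * window, zero means the window expired and the socket was forced defunct.
 * The helper name is hypothetical; a real caller would typically be a
 * timer path that already holds the appropriate references.
 */
static void __unused
so_xbkidle_check_example(struct socket *so)
{
	socket_lock(so, 1);
	if (so_check_extended_bk_idle_time(so) == 0) {
		/* Idle window expired; the socket has been made defunct. */
	}
	socket_unlock(so, 1);
}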
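
/*
 * Minimal illustrative sketch (not part of the original code) of the
 * deny-restriction trapdoor documented in so_set_restrictions(): once a
 * deny bit is set it cannot be cleared for the lifetime of the socket.
 * The helper name is hypothetical and the socket lock is assumed to be
 * held by the caller.
 */
static void __unused
so_restriction_trapdoor_example(struct socket *so)
{
	/* A framework denies cellular on behalf of the application... */
	(void) so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);

	/* ...and a later call without the bit set does not clear it. */
	(void) so_set_restrictions(so, 0);

	/* The deny bit is still reported. */
	VERIFY(so_get_restrictions(so) & SO_RESTRICT_DENY_CELLULAR);
}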
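
/*
 * Minimal illustrative sketch (not part of the original code) of the
 * "delegate to self" behaviour described in so_set_effective_pid():
 * passing the caller's own pid clears the SOF_DELEGATED association
 * rather than establishing a new one.  The helper name is hypothetical;
 * the socket is assumed to be locked, and the caller is assumed to be the
 * socket's recorded owner or to hold the delegate privilege, otherwise
 * EACCES is returned.
 */
static int __unused
so_clear_delegation_example(struct socket *so, struct proc *p)
{
	/* epid == proc_pid(p) is treated as clearing the delegation */
	return so_set_effective_pid(so, proc_pid(p), p, TRUE);
}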
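
/*
 * Minimal illustrative sketch (not part of the original code) of the
 * contract enforced by netpolicy_post_msg(): the posted event must begin
 * with a struct netpolicy_event_data, optionally followed by
 * event-specific fields.  The wrapper struct, the ev_extra field and the
 * helper name below are hypothetical; real callers use the event
 * structures and event codes defined in the kernel event headers.
 */
struct netpolicy_event_example {
	struct netpolicy_event_data ev_data;    /* must be the first member */
	uint32_t ev_extra;                      /* hypothetical event-specific payload */
};

static void __unused
netpolicy_post_msg_example(uint32_t ev_code)
{
	struct netpolicy_event_example ev;

	bzero(&ev, sizeof(ev));
	/* fill in ev.ev_data and ev.ev_extra for the given ev_code here */

	/* The length covers the whole event, which starts with ev_data. */
	netpolicy_post_msg(ev_code, &ev.ev_data, sizeof(ev));
}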