/* * Copyright (c) 2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define sotovsockpcb(so) ((struct vsockpcb *)(so)->so_pcb) #define VSOCK_PORT_RESERVED 1024 /* VSock Protocol Globals */ static struct vsock_transport * _Atomic the_vsock_transport = NULL; static ZONE_DEFINE(vsockpcb_zone, "vsockpcbzone", sizeof(struct vsockpcb), ZC_NONE); static LCK_GRP_DECLARE(vsock_lock_grp, "vsock"); static struct vsockpcbinfo vsockinfo; static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8; static uint32_t vsock_recvspace = VSOCK_MAX_PACKET_SIZE * 8; /* VSock PCB Helpers */ static uint32_t vsock_get_peer_space(struct vsockpcb *pcb) { return pcb->peer_buf_alloc - (pcb->tx_cnt - pcb->peer_fwd_cnt); } static struct vsockpcb * vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst) { struct vsockpcb *preferred = NULL; struct vsockpcb *match = NULL; struct vsockpcb *pcb = NULL; lck_rw_lock_shared(&vsockinfo.bound_lock); LIST_FOREACH(pcb, &vsockinfo.bound, bound) { // Source cid and port must match. Only destination port must match. (Allows for a changing CID during migration) socket_lock(pcb->so, 1); if ((pcb->so->so_state & SS_ISCONNECTED || pcb->so->so_state & SS_ISCONNECTING) && pcb->local_address.cid == src.cid && pcb->local_address.port == src.port && pcb->remote_address.port == dst.port) { preferred = pcb; break; } else if ((pcb->local_address.cid == src.cid || pcb->local_address.cid == VMADDR_CID_ANY) && pcb->local_address.port == src.port) { match = pcb; } socket_unlock(pcb->so, 1); } if (!preferred && match) { socket_lock(match->so, 1); preferred = match; } lck_rw_done(&vsockinfo.bound_lock); return preferred; } static errno_t vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port) { socket_lock_assert_owned(pcb->so); // Privileged ports. if (local_port != VMADDR_PORT_ANY && local_port < VSOCK_PORT_RESERVED && current_task() != kernel_task && proc_suser(current_proc()) != 0) { return EACCES; } bool taken = false; const bool check_remote = (remote_cid != VMADDR_CID_ANY && remote_port != VMADDR_PORT_ANY); struct vsockpcb *pcb_match = NULL; socket_unlock(pcb->so, 0); lck_rw_lock_exclusive(&vsockinfo.bound_lock); LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) { socket_lock(pcb_match->so, 1); if (pcb == pcb_match || (!check_remote && pcb_match->local_address.port == local_port) || (check_remote && pcb_match->local_address.port == local_port && pcb_match->remote_address.cid == remote_cid && pcb_match->remote_address.port == remote_port)) { socket_unlock(pcb_match->so, 1); taken = true; break; } socket_unlock(pcb_match->so, 1); } socket_lock(pcb->so, 0); if (!taken) { pcb->local_address = (struct vsock_address) { .cid = local_cid, .port = local_port }; pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port }; LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound); } lck_rw_done(&vsockinfo.bound_lock); return taken ? EADDRINUSE : 0; } static errno_t vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsock_address raddr) { if (!pcb) { return EINVAL; } socket_lock_assert_owned(pcb->so); // Certain CIDs are reserved. if (laddr.cid == VMADDR_CID_HYPERVISOR || laddr.cid == VMADDR_CID_RESERVED || laddr.cid == VMADDR_CID_HOST) { return EADDRNOTAVAIL; } // Remote address must be fully specified or not specified at all. if ((raddr.cid == VMADDR_CID_ANY) ^ (raddr.port == VMADDR_PORT_ANY)) { return EINVAL; } // Cannot bind if already bound. if (pcb->local_address.port != VMADDR_PORT_ANY) { return EINVAL; } uint32_t transport_cid; struct vsock_transport *transport = pcb->transport; errno_t error = transport->get_cid(transport->provider, &transport_cid); if (error) { return error; } // Local CID must be this transport's CID or any. if (laddr.cid != transport_cid && laddr.cid != VMADDR_CID_ANY) { return EINVAL; } if (laddr.port != VMADDR_PORT_ANY) { error = vsock_bind_address_if_free(pcb, laddr.cid, laddr.port, raddr.cid, raddr.port); } else { socket_unlock(pcb->so, 0); lck_mtx_lock(&vsockinfo.port_lock); socket_lock(pcb->so, 0); const uint32_t first = VSOCK_PORT_RESERVED; const uint32_t last = VMADDR_PORT_ANY - 1; uint32_t count = last - first + 1; uint32_t *last_port = &vsockinfo.last_port; if (pcb->so->so_flags & SOF_BINDRANDOMPORT) { uint32_t random = 0; read_frandom(&random, sizeof(random)); *last_port = first + (random % count); } do { if (count == 0) { lck_mtx_unlock(&vsockinfo.port_lock); return EADDRNOTAVAIL; } count--; ++*last_port; if (*last_port < first || *last_port > last) { *last_port = first; } error = vsock_bind_address_if_free(pcb, laddr.cid, *last_port, raddr.cid, raddr.port); } while (error); lck_mtx_unlock(&vsockinfo.port_lock); } return error; } static void vsock_unbind_pcb_locked(struct vsockpcb *pcb, bool is_locked) { if (!pcb) { return; } struct socket *so = pcb->so; socket_lock_assert_owned(so); // Bail if disconnect and already unbound. if (so->so_state & SS_ISDISCONNECTED) { assert(pcb->bound.le_next == NULL); assert(pcb->bound.le_prev == NULL); return; } if (!is_locked) { socket_unlock(so, 0); lck_rw_lock_exclusive(&vsockinfo.bound_lock); socket_lock(so, 0); // Case where some other thread also called unbind() on this socket while waiting to acquire its lock. if (!pcb->bound.le_prev) { soisdisconnected(so); lck_rw_done(&vsockinfo.bound_lock); return; } } soisdisconnected(so); LIST_REMOVE(pcb, bound); pcb->bound.le_next = NULL; pcb->bound.le_prev = NULL; if (!is_locked) { lck_rw_done(&vsockinfo.bound_lock); } } static void vsock_unbind_pcb(struct vsockpcb *pcb) { vsock_unbind_pcb_locked(pcb, false); } static struct sockaddr * vsock_new_sockaddr(struct vsock_address *address) { if (!address) { return NULL; } struct sockaddr_vm *addr; addr = (struct sockaddr_vm *)alloc_sockaddr(sizeof(*addr), Z_WAITOK | Z_NOFAIL); addr->svm_family = AF_VSOCK; addr->svm_port = address->port; addr->svm_cid = address->cid; return (struct sockaddr *)addr; } static errno_t vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbuf_t m) { if (!pcb) { if (m != NULL) { mbuf_freem_list(m); } return EINVAL; } socket_lock_assert_owned(pcb->so); errno_t error; struct vsock_address dst = pcb->remote_address; if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) { if (m != NULL) { mbuf_freem_list(m); } return EINVAL; } struct vsock_address src = pcb->local_address; if (src.cid == VMADDR_CID_ANY) { uint32_t transport_cid; struct vsock_transport *transport = pcb->transport; error = transport->get_cid(transport->provider, &transport_cid); if (error) { if (m != NULL) { mbuf_freem_list(m); } return error; } src.cid = transport_cid; } uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat; uint32_t fwd_cnt = pcb->fwd_cnt; if (src.cid == dst.cid) { pcb->last_buf_alloc = buf_alloc; pcb->last_fwd_cnt = fwd_cnt; socket_unlock(pcb->so, 0); error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m); socket_lock(pcb->so, 0); } else { struct vsock_transport *transport = pcb->transport; error = transport->put_message(transport->provider, src, dst, operation, buf_alloc, fwd_cnt, m); if (!error) { pcb->last_buf_alloc = buf_alloc; pcb->last_fwd_cnt = fwd_cnt; } } return error; } static errno_t vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst) { if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) { return EINVAL; } errno_t error = 0; struct vsock_transport *transport = NULL; if (src.cid == VMADDR_CID_ANY) { transport = os_atomic_load(&the_vsock_transport, relaxed); if (transport == NULL) { return ENODEV; } uint32_t transport_cid; error = transport->get_cid(transport->provider, &transport_cid); if (error) { return error; } src.cid = transport_cid; } if (src.cid == dst.cid) { // Reset both sockets. struct vsockpcb *pcb = vsock_get_matching_pcb(src, dst); if (pcb) { socket_lock_assert_owned(pcb->so); vsock_unbind_pcb(pcb); socket_unlock(pcb->so, 1); } } else { if (!transport) { transport = os_atomic_load(&the_vsock_transport, relaxed); if (transport == NULL) { return ENODEV; } } error = transport->put_message(transport->provider, src, dst, VSOCK_RESET, 0, 0, NULL); } return error; } static errno_t vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst) { if (pcb) { socket_lock_assert_owned(pcb->so); socket_unlock(pcb->so, 0); } errno_t error = vsock_pcb_reset_address(src, dst); if (pcb) { socket_lock(pcb->so, 0); } return error; } static errno_t vsock_pcb_connect(struct vsockpcb *pcb) { return vsock_pcb_send_message(pcb, VSOCK_REQUEST, NULL); } static errno_t vsock_pcb_respond(struct vsockpcb *pcb) { return vsock_pcb_send_message(pcb, VSOCK_RESPONSE, NULL); } static errno_t vsock_pcb_send(struct vsockpcb *pcb, mbuf_t m) { return vsock_pcb_send_message(pcb, VSOCK_PAYLOAD, m); } static errno_t vsock_pcb_shutdown_send(struct vsockpcb *pcb) { return vsock_pcb_send_message(pcb, VSOCK_SHUTDOWN_SEND, NULL); } static errno_t vsock_pcb_reset(struct vsockpcb *pcb) { return vsock_pcb_send_message(pcb, VSOCK_RESET, NULL); } static errno_t vsock_pcb_credit_update(struct vsockpcb *pcb) { return vsock_pcb_send_message(pcb, VSOCK_CREDIT_UPDATE, NULL); } static errno_t vsock_pcb_credit_request(struct vsockpcb *pcb) { return vsock_pcb_send_message(pcb, VSOCK_CREDIT_REQUEST, NULL); } static errno_t vsock_disconnect_pcb_common(struct vsockpcb *pcb, bool is_locked) { socket_lock_assert_owned(pcb->so); vsock_unbind_pcb_locked(pcb, is_locked); return vsock_pcb_reset(pcb); } static errno_t vsock_disconnect_pcb_locked(struct vsockpcb *pcb) { return vsock_disconnect_pcb_common(pcb, true); } static errno_t vsock_disconnect_pcb(struct vsockpcb *pcb) { return vsock_disconnect_pcb_common(pcb, false); } static errno_t vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr) { if (!pcb || !pcb->so || !addr) { return EINVAL; } // Validate address length. if (addr->svm_len < sizeof(struct sockaddr_vm)) { return EINVAL; } // Validate address family. if (addr->svm_family != AF_UNSPEC && addr->svm_family != AF_VSOCK) { return EAFNOSUPPORT; } // Only stream is supported currently. if (pcb->so->so_type != SOCK_STREAM) { return EAFNOSUPPORT; } return 0; } /* VSock Receive Handlers */ static errno_t vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_t m) { socket_lock_assert_owned(pcb->so); errno_t error = 0; switch (op) { case VSOCK_SHUTDOWN: socantsendmore(pcb->so); socantrcvmore(pcb->so); break; case VSOCK_SHUTDOWN_RECEIVE: socantsendmore(pcb->so); break; case VSOCK_SHUTDOWN_SEND: socantrcvmore(pcb->so); break; case VSOCK_PAYLOAD: // Add data to the receive queue then wakeup any reading threads. error = !sbappendstream(&pcb->so->so_rcv, m); if (!error) { sorwakeup(pcb->so); } break; case VSOCK_RESET: vsock_unbind_pcb(pcb); break; default: error = ENOTSUP; break; } return error; } static errno_t vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op) { socket_lock_assert_owned(pcb->so); errno_t error = 0; switch (op) { case VSOCK_RESPONSE: soisconnected(pcb->so); break; case VSOCK_RESET: pcb->so->so_error = EAGAIN; error = vsock_disconnect_pcb(pcb); break; default: vsock_disconnect_pcb(pcb); error = ENOTSUP; break; } return error; } static errno_t vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst) { socket_lock_assert_owned(pcb->so); struct sockaddr_vm addr; struct socket *so2 = NULL; struct vsockpcb *pcb2 = NULL; errno_t error = 0; switch (op) { case VSOCK_REQUEST: addr = (struct sockaddr_vm) { .svm_len = sizeof(addr), .svm_family = AF_VSOCK, .svm_reserved1 = 0, .svm_port = pcb->local_address.port, .svm_cid = pcb->local_address.cid }; so2 = sonewconn(pcb->so, 0, (struct sockaddr *)&addr); if (!so2) { // It is likely that the backlog is full. Deny this request. vsock_pcb_safe_reset_address(pcb, dst, src); error = ECONNREFUSED; break; } pcb2 = sotovsockpcb(so2); if (!pcb2) { error = EINVAL; goto done; } error = vsock_bind_address(pcb2, dst, src); if (error) { goto done; } error = vsock_pcb_respond(pcb2); if (error) { goto done; } soisconnected(so2); done: if (error) { if (pcb2) { vsock_unbind_pcb(pcb2); } else { soisdisconnected(so2); } socket_unlock(so2, 1); vsock_pcb_reset_address(dst, src); } else { socket_unlock(so2, 0); } socket_lock(pcb->so, 0); break; case VSOCK_RESET: error = vsock_pcb_safe_reset_address(pcb, dst, src); break; default: vsock_pcb_safe_reset_address(pcb, dst, src); error = ENOTSUP; break; } return error; } /* VSock Transport */ errno_t vsock_add_transport(struct vsock_transport *transport) { if (transport == NULL || transport->provider == NULL) { return EINVAL; } if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, NULL, transport, acq_rel)) { return EEXIST; } return 0; } errno_t vsock_remove_transport(struct vsock_transport *transport) { if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, transport, NULL, acq_rel)) { return ENODEV; } return 0; } errno_t vsock_reset_transport(struct vsock_transport *transport) { if (transport == NULL) { return EINVAL; } errno_t error = 0; struct vsockpcb *pcb = NULL; struct vsockpcb *tmp_pcb = NULL; lck_rw_lock_exclusive(&vsockinfo.bound_lock); LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) { // Disconnect this transport's sockets. Listen and bind sockets must stay alive. socket_lock(pcb->so, 1); if (pcb->transport == transport && pcb->so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) { errno_t dc_error = vsock_disconnect_pcb_locked(pcb); if (dc_error && !error) { error = dc_error; } } socket_unlock(pcb->so, 1); } lck_rw_done(&vsockinfo.bound_lock); return error; } errno_t vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m) { struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src); if (!pcb) { if (op != VSOCK_RESET) { vsock_pcb_reset_address(dst, src); } if (m != NULL) { mbuf_freem_list(m); } return EINVAL; } socket_lock_assert_owned(pcb->so); struct socket *so = pcb->so; errno_t error = 0; // Check if the peer's buffer has changed. Update our view of the peer's forwarded bytes. int buffers_changed = (pcb->peer_buf_alloc != buf_alloc) || (pcb->peer_fwd_cnt) != fwd_cnt; pcb->peer_buf_alloc = buf_alloc; pcb->peer_fwd_cnt = fwd_cnt; // Peer's buffer has enough space for the next packet. Notify any threads waiting for space. if (buffers_changed && vsock_get_peer_space(pcb) >= pcb->waiting_send_size) { sowwakeup(so); } switch (op) { case VSOCK_CREDIT_REQUEST: error = vsock_pcb_credit_update(pcb); break; case VSOCK_CREDIT_UPDATE: break; default: if (so->so_state & SS_ISCONNECTED) { error = vsock_put_message_connected(pcb, op, m); m = NULL; } else if (so->so_state & SS_ISCONNECTING) { error = vsock_put_message_connecting(pcb, op); } else if (so->so_options & SO_ACCEPTCONN) { error = vsock_put_message_listening(pcb, op, src, dst); } else { // Reset the connection for other states such as 'disconnecting'. error = vsock_disconnect_pcb(pcb); if (!error) { error = ENODEV; } } break; } socket_unlock(so, 1); if (m != NULL) { mbuf_freem_list(m); } return error; } /* VSock Sysctl */ static int vsock_pcblist SYSCTL_HANDLER_ARGS { #pragma unused(oidp,arg2) int error; // Only stream is supported. if ((intptr_t)arg1 != SOCK_STREAM) { return EINVAL; } // Get the generation count and the count of all vsock sockets. lck_rw_lock_shared(&vsockinfo.all_lock); uint64_t n = vsockinfo.all_pcb_count; vsock_gen_t gen_count = vsockinfo.vsock_gencnt; lck_rw_done(&vsockinfo.all_lock); const size_t xpcb_len = sizeof(struct xvsockpcb); struct xvsockpgen xvg; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == USER_ADDR_NULL) { req->oldidx = (size_t)(2 * sizeof(xvg) + (n + n / 8) * xpcb_len); return 0; } if (req->newptr != USER_ADDR_NULL) { return EPERM; } bzero(&xvg, sizeof(xvg)); xvg.xvg_len = sizeof(xvg); xvg.xvg_count = n; xvg.xvg_gen = gen_count; xvg.xvg_sogen = so_gencnt; error = SYSCTL_OUT(req, &xvg, sizeof(xvg)); if (error) { return error; } // Return if no sockets exist. if (n == 0) { return 0; } lck_rw_lock_shared(&vsockinfo.all_lock); n = 0; struct vsockpcb *pcb = NULL; TAILQ_FOREACH(pcb, &vsockinfo.all, all) { // Bail if there is not enough user buffer for this next socket. if (req->oldlen - req->oldidx - sizeof(xvg) < xpcb_len) { break; } // Populate the socket structure. socket_lock(pcb->so, 1); if (pcb->vsock_gencnt <= gen_count) { struct xvsockpcb xpcb; bzero(&xpcb, xpcb_len); xpcb.xv_len = xpcb_len; xpcb.xv_vsockpp = (uint64_t)VM_KERNEL_ADDRHASH(pcb); xpcb.xvp_local_cid = pcb->local_address.cid; xpcb.xvp_local_port = pcb->local_address.port; xpcb.xvp_remote_cid = pcb->remote_address.cid; xpcb.xvp_remote_port = pcb->remote_address.port; xpcb.xvp_rxcnt = pcb->fwd_cnt; xpcb.xvp_txcnt = pcb->tx_cnt; xpcb.xvp_peer_rxhiwat = pcb->peer_buf_alloc; xpcb.xvp_peer_rxcnt = pcb->peer_fwd_cnt; xpcb.xvp_last_pid = pcb->so->last_pid; xpcb.xvp_gencnt = pcb->vsock_gencnt; if (pcb->so) { sotoxsocket(pcb->so, &xpcb.xv_socket); } socket_unlock(pcb->so, 1); error = SYSCTL_OUT(req, &xpcb, xpcb_len); if (error != 0) { break; } n++; } else { socket_unlock(pcb->so, 1); } } // Update the generation count to match the sockets being returned. gen_count = vsockinfo.vsock_gencnt; lck_rw_done(&vsockinfo.all_lock); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ bzero(&xvg, sizeof(xvg)); xvg.xvg_len = sizeof(xvg); xvg.xvg_count = n; xvg.xvg_gen = gen_count; xvg.xvg_sogen = so_gencnt; error = SYSCTL_OUT(req, &xvg, sizeof(xvg)); } return error; } #ifdef SYSCTL_DECL SYSCTL_NODE(_net, OID_AUTO, vsock, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock"); SYSCTL_UINT(_net_vsock, OID_AUTO, sendspace, CTLFLAG_RW | CTLFLAG_LOCKED, &vsock_sendspace, 0, "Maximum outgoing vsock datagram size"); SYSCTL_UINT(_net_vsock, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &vsock_recvspace, 0, "Maximum incoming vsock datagram size"); SYSCTL_PROC(_net_vsock, OID_AUTO, pcblist, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_STREAM, 0, vsock_pcblist, "S,xvsockpcb", "List of active vsock sockets"); #endif /* VSock Protocol */ static int vsock_attach(struct socket *so, int proto, struct proc *p) { #pragma unused(proto, p) // Reserve send and receive buffers. errno_t error = soreserve(so, vsock_sendspace, vsock_recvspace); if (error) { return error; } // Attach should only be run once per socket. struct vsockpcb *pcb = sotovsockpcb(so); if (pcb) { return EINVAL; } // Get the transport for this socket. struct vsock_transport *transport = os_atomic_load(&the_vsock_transport, relaxed); if (transport == NULL) { return ENODEV; } // Initialize the vsock protocol control block. pcb = zalloc_flags(vsockpcb_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); pcb->so = so; pcb->transport = transport; pcb->local_address = (struct vsock_address) { .cid = VMADDR_CID_ANY, .port = VMADDR_PORT_ANY }; pcb->remote_address = (struct vsock_address) { .cid = VMADDR_CID_ANY, .port = VMADDR_PORT_ANY }; so->so_pcb = pcb; // Tell the transport that this socket has attached. error = transport->attach_socket(transport->provider); if (error) { zfree(vsockpcb_zone, pcb); so->so_pcb = NULL; return error; } // Add to the list of all vsock sockets. lck_rw_lock_exclusive(&vsockinfo.all_lock); TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all); vsockinfo.all_pcb_count++; pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt; lck_rw_done(&vsockinfo.all_lock); return 0; } static int vsock_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p) { #pragma unused(ifp) VERIFY(so != NULL || p == kernproc); if (cmd != IOCTL_VM_SOCKETS_GET_LOCAL_CID) { return EINVAL; } struct vsock_transport *transport; if (so) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } transport = pcb->transport; } else { transport = os_atomic_load(&the_vsock_transport, relaxed); } if (transport == NULL) { return ENODEV; } uint32_t transport_cid; errno_t error = transport->get_cid(transport->provider, &transport_cid); if (error) { return error; } memcpy(data, &transport_cid, sizeof(transport_cid)); return 0; } static int vsock_detach(struct socket *so) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } vsock_unbind_pcb(pcb); // Tell the transport that this socket has detached. struct vsock_transport *transport = pcb->transport; errno_t error = transport->detach_socket(transport->provider); if (error) { return error; } // Remove from the list of all vsock sockets. lck_rw_lock_exclusive(&vsockinfo.all_lock); TAILQ_REMOVE(&vsockinfo.all, pcb, all); pcb->all.tqe_next = NULL; pcb->all.tqe_prev = NULL; vsockinfo.all_pcb_count--; vsockinfo.vsock_gencnt++; lck_rw_done(&vsockinfo.all_lock); // Mark this socket for deallocation. so->so_flags |= SOF_PCBCLEARING; return 0; } static int vsock_abort(struct socket *so) { return vsock_detach(so); } static int vsock_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { #pragma unused(p) struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } struct sockaddr_vm *addr = (struct sockaddr_vm *)nam; errno_t error = vsock_sockaddr_vm_validate(pcb, addr); if (error) { return error; } struct vsock_address laddr = (struct vsock_address) { .cid = addr->svm_cid, .port = addr->svm_port, }; struct vsock_address raddr = (struct vsock_address) { .cid = VMADDR_CID_ANY, .port = VMADDR_PORT_ANY, }; error = vsock_bind_address(pcb, laddr, raddr); if (error) { return error; } return 0; } static int vsock_listen(struct socket *so, struct proc *p) { #pragma unused(p) struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } // Only stream is supported currently. if (so->so_type != SOCK_STREAM) { return EAFNOSUPPORT; } struct vsock_address *addr = &pcb->local_address; if (addr->port == VMADDR_CID_ANY) { return EFAULT; } struct vsock_transport *transport = pcb->transport; uint32_t transport_cid; errno_t error = transport->get_cid(transport->provider, &transport_cid); if (error) { return error; } // Can listen on the transport's cid or any. if (addr->cid != transport_cid && addr->cid != VMADDR_CID_ANY) { return EFAULT; } return 0; } static int vsock_accept(struct socket *so, struct sockaddr **nam) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } // Do not accept disconnected sockets. if (so->so_state & SS_ISDISCONNECTED) { return ECONNABORTED; } *nam = vsock_new_sockaddr(&pcb->remote_address); return 0; } static int vsock_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { #pragma unused(p) struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } struct sockaddr_vm *addr = (struct sockaddr_vm *)nam; errno_t error = vsock_sockaddr_vm_validate(pcb, addr); if (error) { return error; } uint32_t transport_cid; struct vsock_transport *transport = pcb->transport; error = transport->get_cid(transport->provider, &transport_cid); if (error) { return error; } // Only supporting connections to the host, hypervisor, or self for now. if (addr->svm_cid != VMADDR_CID_HOST && addr->svm_cid != VMADDR_CID_HYPERVISOR && addr->svm_cid != transport_cid) { return EFAULT; } soisconnecting(so); // Set the remote and local address. struct vsock_address remote_addr = (struct vsock_address) { .cid = addr->svm_cid, .port = addr->svm_port, }; struct vsock_address local_addr = (struct vsock_address) { .cid = transport_cid, .port = VMADDR_PORT_ANY, }; // Bind to the address. error = vsock_bind_address(pcb, local_addr, remote_addr); if (error) { goto cleanup; } // Attempt a connection using the socket's transport. error = vsock_pcb_connect(pcb); if (error) { goto cleanup; } if ((so->so_state & SS_ISCONNECTED) == 0) { // Don't wait for peer's response if non-blocking. if (so->so_state & SS_NBIO) { error = EINPROGRESS; goto done; } struct timespec ts = (struct timespec) { .tv_sec = so->so_snd.sb_timeo.tv_sec, .tv_nsec = so->so_snd.sb_timeo.tv_usec * 1000, }; lck_mtx_t *mutex_held; if (so->so_proto->pr_getlock != NULL) { mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); } else { mutex_held = so->so_proto->pr_domain->dom_mtx; } // Wait until we receive a response to the connect request. error = msleep((caddr_t)&so->so_timeo, mutex_held, PSOCK | PCATCH, "vsock_connect", &ts); if (error) { if (error == EAGAIN) { error = ETIMEDOUT; } goto cleanup; } } cleanup: if (so->so_error && !error) { error = so->so_error; so->so_error = 0; } if (!error) { error = !(so->so_state & SS_ISCONNECTED); } if (error) { vsock_unbind_pcb(pcb); } done: return error; } static int vsock_disconnect(struct socket *so) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } return vsock_disconnect_pcb(pcb); } static int vsock_sockaddr(struct socket *so, struct sockaddr **nam) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } *nam = vsock_new_sockaddr(&pcb->local_address); return 0; } static int vsock_peeraddr(struct socket *so, struct sockaddr **nam) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } *nam = vsock_new_sockaddr(&pcb->remote_address); return 0; } static int vsock_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, proc_t p) { #pragma unused(flags, nam, p) struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL || m == NULL) { return EINVAL; } if (control != NULL) { m_freem(control); return EOPNOTSUPP; } // Ensure this socket is connected. if ((so->so_state & SS_ISCONNECTED) == 0) { if (m != NULL) { mbuf_freem_list(m); } return EPERM; } errno_t error; // rdar://84098487 (SEED: Web: Virtio-socket sent data lost after 128KB) // For writes larger than the default `sosendmaxchain` of 65536, vsock_send() is called multiple times per write(). // Only the first call to vsock_send() is passed a valid mbuf packet, while subsequent calls are not marked as a packet // with a valid length. We should mark all mbufs as a packet and set the correct packet length so that the downstream // socket transport layer can correctly generate physical segments. if (!(mbuf_flags(m) & MBUF_PKTHDR)) { if (!(mbuf_flags(m) & M_EXT)) { struct mbuf *header = NULL; MGETHDR(header, M_WAITOK, MT_HEADER); if (header == NULL) { if (m != NULL) { mbuf_freem_list(m); } return ENOBUFS; } header->m_next = m; m = header; } else { mbuf_setflags(m, mbuf_flags(m) | MBUF_PKTHDR); } size_t len = 0; struct mbuf *next = m; while (next) { len += mbuf_len(next); next = mbuf_next(next); } mbuf_pkthdr_setlen(m, len); } const size_t len = mbuf_pkthdr_len(m); uint32_t free_space = vsock_get_peer_space(pcb); // Ensure the peer has enough space in their receive buffer. while (len > free_space) { // Record the number of free peer bytes necessary before we can send. if (len > pcb->waiting_send_size) { pcb->waiting_send_size = len; } // Send a credit request. error = vsock_pcb_credit_request(pcb); if (error) { if (m != NULL) { mbuf_freem_list(m); } return error; } // Check again in case free space was automatically updated in loopback case. free_space = vsock_get_peer_space(pcb); if (len <= free_space) { pcb->waiting_send_size = 0; break; } // Bail if this is a non-blocking socket. if (so->so_state & SS_NBIO) { if (m != NULL) { mbuf_freem_list(m); } return EWOULDBLOCK; } // Wait until our peer has enough free space in their receive buffer. error = sbwait(&so->so_snd); pcb->waiting_send_size = 0; if (error) { if (m != NULL) { mbuf_freem_list(m); } return error; } // Bail if an error occured or we can't send more. if (so->so_state & SS_CANTSENDMORE) { if (m != NULL) { mbuf_freem_list(m); } return EPIPE; } else if (so->so_error) { error = so->so_error; so->so_error = 0; if (m != NULL) { mbuf_freem_list(m); } return error; } free_space = vsock_get_peer_space(pcb); } // Send a payload over the transport. error = vsock_pcb_send(pcb, m); if (error) { return error; } pcb->tx_cnt += len; return 0; } static int vsock_shutdown(struct socket *so) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } socantsendmore(so); // Tell peer we will no longer send. errno_t error = vsock_pcb_shutdown_send(pcb); if (error) { return error; } return 0; } static int vsock_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct vsockpcb *pcb = sotovsockpcb(so); if (pcb == NULL) { return EINVAL; } user_ssize_t length = uio_resid(uio); int result = soreceive(so, psa, uio, mp0, controlp, flagsp); length -= uio_resid(uio); socket_lock(so, 1); pcb->fwd_cnt += length; const uint32_t threshold = VSOCK_MAX_PACKET_SIZE; // Send a credit update if is possible that the peer will no longer send. if ((pcb->fwd_cnt - pcb->last_fwd_cnt + threshold) >= pcb->last_buf_alloc) { errno_t error = vsock_pcb_credit_update(pcb); if (!result && error) { result = error; } } socket_unlock(so, 1); return result; } static struct pr_usrreqs vsock_usrreqs = { .pru_abort = vsock_abort, .pru_attach = vsock_attach, .pru_control = vsock_control, .pru_detach = vsock_detach, .pru_bind = vsock_bind, .pru_listen = vsock_listen, .pru_accept = vsock_accept, .pru_connect = vsock_connect, .pru_disconnect = vsock_disconnect, .pru_send = vsock_send, .pru_shutdown = vsock_shutdown, .pru_sockaddr = vsock_sockaddr, .pru_peeraddr = vsock_peeraddr, .pru_sosend = sosend, .pru_soreceive = vsock_soreceive, }; static void vsock_init(struct protosw *pp, struct domain *dp) { #pragma unused(dp) static int vsock_initialized = 0; VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized, 0, 1, acq_rel)) { return; } // Setup VSock protocol info struct. lck_rw_init(&vsockinfo.all_lock, &vsock_lock_grp, LCK_ATTR_NULL); lck_rw_init(&vsockinfo.bound_lock, &vsock_lock_grp, LCK_ATTR_NULL); lck_mtx_init(&vsockinfo.port_lock, &vsock_lock_grp, LCK_ATTR_NULL); TAILQ_INIT(&vsockinfo.all); LIST_INIT(&vsockinfo.bound); vsockinfo.last_port = VMADDR_PORT_ANY; } static int vsock_sofreelastref(struct socket *so, int dealloc) { socket_lock_assert_owned(so); struct vsockpcb *pcb = sotovsockpcb(so); if (pcb != NULL) { zfree(vsockpcb_zone, pcb); } so->so_pcb = NULL; sofreelastref(so, dealloc); return 0; } static int vsock_unlock(struct socket *so, int refcount, void *lr_saved) { lck_mtx_t *mutex_held = so->so_proto->pr_domain->dom_mtx; #ifdef MORE_LOCKING_DEBUG LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); #endif so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX; if (refcount) { if (so->so_usecount <= 0) { panic("%s: bad refcount=%d so=%p (%d, %d, %d) " "lrh=%s", __func__, so->so_usecount, so, SOCK_DOM(so), so->so_type, SOCK_PROTO(so), solockhistory_nr(so)); /* NOTREACHED */ } so->so_usecount--; if (so->so_usecount == 0) { vsock_sofreelastref(so, 1); } } lck_mtx_unlock(mutex_held); return 0; } static struct protosw vsocksw[] = { { .pr_type = SOCK_STREAM, .pr_protocol = 0, .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD, .pr_init = vsock_init, .pr_unlock = vsock_unlock, .pr_usrreqs = &vsock_usrreqs, } }; static const int vsock_proto_count = (sizeof(vsocksw) / sizeof(struct protosw)); /* VSock Domain */ static struct domain *vsock_domain = NULL; static void vsock_dinit(struct domain *dp) { // The VSock domain is initialized with a singleton pattern. VERIFY(!(dp->dom_flags & DOM_INITIALIZED)); VERIFY(vsock_domain == NULL); vsock_domain = dp; // Add protocols and initialize. for (int i = 0; i < vsock_proto_count; i++) { net_add_proto((struct protosw *)&vsocksw[i], dp, 1); } } struct domain vsockdomain_s = { .dom_family = PF_VSOCK, .dom_name = "vsock", .dom_init = vsock_dinit, .dom_maxrtkey = sizeof(struct sockaddr_vm), .dom_protohdrlen = sizeof(struct sockaddr_vm), };