/*
 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as a protocol in the PF_MULTIPATH
 * communication domain.  The structure mtcbinfo describes the MPTCP instance
 * of a Multipath protocol in that domain.  It is used to keep track of all
 * MPTCP PCB instances in the system, and is protected by the global lock
 * mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets.  Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure.  Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow.  This gets decremented prior to the subflow's destruction.
 *
 * To handle events (read, write, control) from the subflows, we do direct
 * upcalls into the specific function.
 *
 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
 * lock.  Incoming data on a subflow also ends up taking this single lock.  To
 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
 * of the MPTCP-socket.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector.  This process will take place once all
 * of the subflows have been destroyed.
 */
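/*
 * Editor's sketch (not part of the original file): how a userspace client
 * would open an MPTCP socket as described in the notes above.  Assumes the
 * Darwin definitions of sa_endpoints_t and connectx(2) from <sys/socket.h>;
 * error handling is minimal.  Guarded with #if 0 so it is never compiled here.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <strings.h>

static int
mptcp_client_connect_example(const struct sockaddr *dst, socklen_t dstlen)
{
	sa_endpoints_t eps;
	sae_connid_t cid;
	int fd;

	/* PF_MULTIPATH + IPPROTO_TCP yields a Multipath PCB, an MPTCP
	 * Session and an MPTCP PCB, all in one allocation. */
	fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
	if (fd < 0) {
		return -1;
	}

	bzero(&eps, sizeof(eps));
	eps.sae_dstaddr = dst;
	eps.sae_dstaddrlen = dstlen;

	/* The first subflow is created as part of connectx(2). */
	if (connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid) != 0) {
		return -1;
	}
	return fd;
}
#endif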
static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
static int mptcp_freeq(struct mptcb *mp_tp);

/*
 * Possible return values for subflow event handlers.  Note that success
 * values must be greater than or equal to MPTS_EVRET_OK.  Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE               = 1,    /* delete this subflow */
	MPTS_EVRET_OK                   = 2,    /* OK */
	MPTS_EVRET_CONNECT_PENDING      = 3,    /* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
} ev_ret_t;

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_do_sha256(mptcp_key_t *, char *);
static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);

static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
    NET_KT_DEFAULT);

struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);
SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");

static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;

static uint8_t mptcp_create_subflows_scheduled;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

uint32_t mptcp_cellicon_refcount = 0;

os_log_t mptcp_log_handle;

int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
{
	int i, index = -1;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (create && stats[i].ifindex == IFSCOPE_NONE) {
			if (index < 0) {
				index = i;
			}
			continue;
		}

		if (stats[i].ifindex == ifindex) {
			index = i;
			return index;
		}
	}

	if (index != -1) {
		stats[index].ifindex = ifindex;
	}

	return index;
}

static int
mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
{
	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
	int index;

	if (ifp == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
		return -1;
	}

	index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);

	if (index != -1) {
		if (stats[index].is_expensive == 0) {
			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
		}
	}

	return index;
}

void
mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
{
	int index;

	tcpstat.tcps_mp_switches++;
	mpte->mpte_subflow_switches++;

	index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	if (index != -1) {
		mpte->mpte_itfstats[index].switches++;
	}
}
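/*
 * Editor's note: mptcpstats_get_index_by_ifindex() above implements a small
 * fixed-size map.  With create == false it is a pure lookup that returns -1
 * on a miss; with create == true a miss claims the first free slot (one whose
 * ifindex is IFSCOPE_NONE) and assigns the requested ifindex to it.
 */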
/*
 * Flushes all recorded socket options from an MP socket.
 */
static void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Create an MPTCP session, called as a result of opening an MPTCP socket.
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mpp_mtp *mtp;
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	mtp = __container_of(mpp, struct mpp_mtp, mpp);
	mpte = &mtp->mpp_ses;
	mp_tp = &mtp->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
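/*
 * Editor's note: the __container_of() above relies on the single-allocation
 * layout described in the header comment.  In mptcp_var.h the block is
 * declared along these lines (shown here for orientation only):
 */
#if 0
struct mpp_mtp {
	struct mppcb    mpp;            /* Multipath PCB; the socket points here */
	struct mptses   mpp_ses;        /* MPTCP Session (mpte) */
	struct mptcb    mtcb;           /* MPTCP Protocol Control Block */
};
#endif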
struct sockaddr *
mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
{
	if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
		return SA(&mpte->mpte_sub_dst_v6);
	}

	if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
		return SA(&mpte->mpte_sub_dst_v4);
	}

	/*
	 * The interface has neither IPv4 nor IPv6 routes.  Give our best
	 * guess, meaning we prefer IPv6 over IPv4.
	 */
	if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
		return SA(&mpte->mpte_sub_dst_v6);
	}

	if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
		return SA(&mpte->mpte_sub_dst_v4);
	}

	/* We don't yet have a unicast IP */
	return NULL;
}

static void
mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
    uint64_t *cellbytes, uint64_t *allbytes)
{
	int64_t mycellbytes = 0;
	uint64_t myallbytes = 0;
	int i;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (mpte->mpte_itfstats[i].is_expensive) {
			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
		}

		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
	}

	if (initial_cell) {
		mycellbytes -= mpte->mpte_init_txbytes;
		mycellbytes -= mpte->mpte_init_rxbytes;
	}

	if (mycellbytes < 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
		*cellbytes = 0;
		*allbytes = 0;
	} else {
		*cellbytes = mycellbytes;
		*allbytes = myallbytes;
	}
}

static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
/*
 * Destroy an MPTCP session.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	mptcpstats_session_wrapup(mpte);
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data(mpte->mpte_itfinfo,
		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;

	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}

boolean_t
mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
{
	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
}

static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00},
	};
	const char *ptrv4 = (const char *)addrv4;
	char *ptr = (char *)addr;

	if (IN_ZERONET(ntohl(addrv4->s_addr)) ||            // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) ||           // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) ||          // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) ||            // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) ||          // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) {           // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) ||              // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u", len);
	}

	return 0;
}
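/*
 * Editor's note: a worked example of the RFC 6052 embedding performed by
 * mptcp_synthesize_nat64() above.  With the well-known /96 prefix
 * 64:ff9b::/96, the IPv4 address 192.0.2.33 (hex c0 00 02 21) is copied
 * into bytes 12..15 of the prefix, yielding 64:ff9b::c000:221.  For the
 * shorter prefix lengths, byte 8 (the "u" octet of RFC 6052) must stay
 * zero, which is why the copies above skip it and resume at ptr + 9.
 */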
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		if (err == 0) {
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}

static boolean_t
mptcp_subflow_disconnecting(struct mptsub *mpts)
{
	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
		return true;
	}

	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
		return true;
	}

	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
		return true;
	}

	return false;
}

/*
 * In Handover mode, only create cell subflow if
 * - Symptoms marked WiFi as weak:
 *   Here, if we are sending data, then we can check the RTO-state. That is a
 *   stronger signal of WiFi quality than the Symptoms indicator.
 *   If however we are not sending any data, the only thing we can do is guess
 *   and thus bring up Cell.
 *
 * - Symptoms marked WiFi as unknown:
 *   In this state we don't know what the situation is and thus remain
 *   conservative, only bringing up cell if there are retransmissions going on.
 */
static boolean_t
mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);

	if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
		/* WiFi is good - don't use cell */
		return false;
	}

	if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
		/*
		 * We are in unknown state, only use Cell if we have confirmed
		 * that WiFi is bad.
		 */
		if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
			return true;
		} else {
			return false;
		}
	}

	if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
		/*
		 * WiFi is confirmed to be bad from Symptoms-Framework.
		 * If we are sending data, check the RTOs.
		 * Otherwise, be pessimistic and use Cell.
		 */
		if (mptetoso(mpte)->so_snd.sb_cc != 0) {
			if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
				return true;
			} else {
				return false;
			}
		} else {
			return true;
		}
	}

	return false;
}
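/*
 * Editor's summary of the decision above (WiFi quality x send-queue state),
 * added for readability:
 *
 *   quality   so_snd empty   so_snd non-empty
 *   GOOD      no cell        no cell
 *   UNSURE    no cell        cell iff t_rxtshift >= 2 * mptcp_fail_thresh
 *   BAD       cell           cell iff t_rxtshift >= 2 * mptcp_fail_thresh
 */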
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			SOCKADDR_ZERO(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &SIN(dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = SIN(dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = SA(&nat64pre);
		}

		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
static void
mptcp_remove_cell_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
	}

	return;
}

static void
mptcp_remove_wifi_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
	}

	return;
}
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no such working non-cellular subflow and WiFi
	 * quality is not good, keep the cellular subflows; and if a working
	 * cellular subflow exists, remove the WiFi subflows instead.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working WiFi subflow, so let's not remove those on
	 * a cellular interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}

static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}

static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();
	struct mptsub *mpts;

	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
		/* WiFi is bad and we are below the target - don't remove any subflows */
		return;
	}

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			mptcp_remove_cell_subflows(mpte);
			break;
		}
	}
}
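/*
 * Editor's note: the target-based checks above compare mach_continuous_time()
 * against mpte_time_target using signed arithmetic,
 * (int64_t)(target - now) <= 0, rather than target <= now.  The signed
 * difference stays correct even across a (theoretical) unsigned wraparound,
 * the same idiom TCP's SEQ_GT() uses for 32-bit sequence numbers.
 */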
/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 */
void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	socket_lock_assert_owned(mptetoso(mpte));

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
		mptcp_pure_handover_subflows_remove(mpte);
	}

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		mptcp_handover_subflows_remove(mpte);
	}

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
		mptcp_targetbased_subflows_remove(mpte);
	}
}

static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}

static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS) ||
		    !(mpte->mpte_flags & MPTE_ITFINFO_INIT)) {
			socket_unlock(mp_so, 1);
			continue;
		}

		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
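/*
 * Editor's note: the so_usecount++ in mptcp_sched_create_subflows() below
 * pairs with the so_usecount-- in mptcp_create_subflows() above; the extra
 * reference keeps the MPTCP socket alive while the deferred timeout call is
 * pending.
 */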
/*
 * We need this because we are coming from an NECP-event. This event gets posted
 * while holding NECP-locks. The creation of the subflow however leads us back
 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
 * So, we would deadlock there as we already hold the NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress, without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}

/*
 * Allocate an MPTCP socket option structure.
 */
struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)
{
	return zalloc_flags(mptopt_zone, how | Z_ZERO);
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	socket_lock_assert_owned(mptetoso(mpte));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	socket_lock_assert_owned(mptetoso(mpte));
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing socket option.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	socket_lock_assert_owned(mptetoso(mpte));

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name) {
			break;
		}
	}
	return mpo;
}

/*
 * Allocate an MPTCP subflow structure.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released.  This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	free_sockaddr(mpts->mpts_src);
	zfree(mptsub_zone, mpts);
}

static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}

static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		return;
	}

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}

static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking, etc. now happens at the MPTCP-layer.
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts); /* for subflow socket */
}

static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}

/*
 * Create an MPTCP subflow socket.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	struct mppcb *mpp;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);
	mpp = mpsotomppcb(mp_so);

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		mptcp_subflow_free(mpts);
		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	}
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
	}
	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
	}

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpp->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		socket_unlock(mp_so, 0);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpp->necp_client_uuid, sotoinpcb(*so));
		socket_lock(mp_so, 0);

		if (error) {
			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

			goto out_err;
		}

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_tp->mpt_state, mp_tp->mpt_flags);

			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
	}

	if (mpp->inp_necp_attributes.inp_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);

		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain,
			    mpp->inp_necp_attributes.inp_domain, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_account != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);

		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account,
			    mpp->inp_necp_attributes.inp_account, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);

		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner,
			    mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);

		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain,
			    mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
		}
	}
	/* Needs to happen prior to the delegation! */
	(*so)->last_pid = mp_so->last_pid;

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid) {
			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
		if (!uuid_is_null(mpte->mpte_euuid)) {
			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof(smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
			goto out_err;
		}
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
			continue;
		}

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE)) {
			continue;
		}

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function.  We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return 0;

out_err:
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	return error;
}

/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 */
static void
mptcp_subflow_soclose(struct mptsub *mpts)
{
	struct socket *so = mpts->mpts_socket;

	if (mpts->mpts_flags & MPTSF_CLOSED) {
		return;
	}

	VERIFY(so != NULL);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	mpts->mpts_flags |= MPTSF_CLOSED;

	if (so->so_retaincnt == 0) {
		soclose_locked(so);

		return;
	} else {
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
	}

	return;
}

static void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;

	/*
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		tp->t_local_aid = addr_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0) {
		goto try_again;
	}
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
}

static void
mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	struct tcpcb *tp = NULL;
	int found = 0;

	tp = sototcpcb(so);
	if (tp == NULL) {
		return;
	}

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
			found = 1;
			break;
		}
	}
	if (found) {
		LIST_REMOVE(sauth_entry, msae_next);
		zfree(mpt_subauth_zone, sauth_entry);
	}
}

/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__,
	    (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
	    dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
/*
 * Editor's note on the DSS mapping handled below: a mapping covers {dsn,
 * rseq, dlen} of subflow payload, with dfin == 1 meaning the last covered
 * byte is the DATA_FIN (which carries no payload).  An mbuf extending past
 * dlen - dfin - off therefore belongs partly to the next mapping and must
 * be split.
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	if (m_pktlen(m) == 0) {
		return 0;
	}

	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		m->m_next = new;
		sballoc(&so->so_rcv, new);

		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;

	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}

	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}

/*
 * Update the pid, upid, uuid of the subflow so, based on parent so
 */
static void
mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
{
	if (so->last_pid != mp_so->last_pid ||
	    so->last_upid != mp_so->last_upid) {
		so->last_upid = mp_so->last_upid;
		so->last_pid = mp_so->last_pid;
		uuid_copy(so->last_uuid, mp_so->last_uuid);
	}
	so_update_policy(so);
}

/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;
	struct tcpcb *tp = sototcpcb(so);

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}
We should kill the subflow */ soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); error = EIO; *mp0 = NULL; goto release; } } else { orig_dlen = dlen = m->m_pkthdr.mp_rlen; dsn = m->m_pkthdr.mp_dsn; sseq = m->m_pkthdr.mp_rseq; csum = m->m_pkthdr.mp_csum; if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) { dfin = 1; dlen--; } } /* Now, see if we need to remove previous packets */ if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) { /* Ok, there is data in there that we don't need - let's throw it away! */ int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc); sbdrop(&so->so_rcv, totrim); m = so->so_rcv.sb_mb; } /* * Check if the full mapping is now present */ if ((int)so->so_rcv.sb_cc < dlen) { if (*mp0 == NULL) { error = EWOULDBLOCK; } goto release; } /* Now, get the full mapping */ off = 0; while (dlen > 0) { if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) { error_out = 1; error = EIO; dlen = 0; *mp0 = NULL; break; } dlen -= m->m_len; off += m->m_len; sbfree(&so->so_rcv, m); if (mp != NULL) { *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } ASSERT(dlen == 0 || m); if (dlen != 0 && m == NULL) { /* "try" to gracefully recover on customer builds */ error_out = 1; error = EIO; dlen = 0; *mp0 = NULL; SB_EMPTY_FIXUP(&so->so_rcv); soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); break; } } VERIFY(dlen == 0); if (m != NULL) { so->so_rcv.sb_lastrecord = m; } else { SB_EMPTY_FIXUP(&so->so_rcv); } if (error_out) { goto release; } if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) { error = EIO; *mp0 = NULL; goto release; } SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2"); } DTRACE_MPTCP3(subflow__receive, struct socket *, so, struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd); if (flagsp != NULL) { *flagsp |= flags; } release: sbunlock(&so->so_rcv, TRUE); return error; } /* * MPTCP subflow socket send routine, derived from sosend(). */ static int mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags) { struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte); boolean_t en_tracing = FALSE, proc_held = FALSE; struct proc *p = current_proc(); int en_tracing_val; int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */ int error; VERIFY(control == NULL); VERIFY(addr == NULL); VERIFY(uio == NULL); VERIFY(flags == 0); VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0); VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX); VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP); /* * trace if tracing is enabled, for network (vs. unix) sockets, and * non-loopback traffic */ if (ENTR_SHOULDTRACE && (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) { struct inpcb *inp = sotoinpcb(so); if (inp->inp_last_outifp != NULL && !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { en_tracing = TRUE; en_tracing_val = top->m_pkthdr.len; KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START, (unsigned long)VM_KERNEL_ADDRPERM(so), ((so->so_state & SS_NBIO) ?
kEnTrFlagNonBlocking : 0), (int64_t)en_tracing_val); } } mptcp_update_last_owner(so, mp_so); if (mp_so->last_pid != proc_pid(p)) { p = proc_find(mp_so->last_pid); if (p == PROC_NULL) { p = current_proc(); } else { proc_held = TRUE; } } #if NECP inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0); #endif /* NECP */ error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked); if (error) { goto out; } error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p); top = NULL; out: if (top != NULL) { m_freem(top); } if (proc_held) { proc_rele(p); } soclearfastopen(so); if (en_tracing) { KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END, (unsigned long)VM_KERNEL_ADDRPERM(so), ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0), (int64_t)en_tracing_val); } return error; } /* * Subflow socket write upcall. * * Called when the associated subflow socket posted a write event. */ static void mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf) { #pragma unused(so, waitf) struct mptsub *mpts = arg; struct mptses *mpte = mpts->mpts_mpte; VERIFY(mpte != NULL); if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) { if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) { mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP; } return; } mptcp_output(mpte); } /* * Subflow socket control event upcall. */ static void mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events) { #pragma unused(so) struct mptsub *mpts = arg; struct mptses *mpte = mpts->mpts_mpte; socket_lock_assert_owned(mptetoso(mpte)); if ((mpts->mpts_evctl & events) == events) { return; } mpts->mpts_evctl |= events; if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) { mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP; return; } mptcp_subflow_workloop(mpte); } /* * Establish an initial MPTCP connection (if first subflow and not yet * connected), or add a subflow to an existing MPTCP connection.
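 *
 * A hypothetical call (not taken from this file), with the MPTCP
 * socket lock held and `dst' a fully-initialized sockaddr:
 *
 *	sae_connid_t cid;
 *	error = mptcp_subflow_add(mpte, NULL, SA(&dst), IFSCOPE_NONE, &cid);
 *
 * On success a subflow socket is created under the hood and *pcid
 * carries the new subflow's connection ID.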
*/ int mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid) { struct socket *mp_so, *so = NULL; struct mptcb *mp_tp; struct mptsub *mpts = NULL; int af, error = 0; mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; socket_lock_assert_owned(mp_so); if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) { /* If the remote end sends Data FIN, refuse subflow adds */ os_log_error(mptcp_log_handle, "%s - %lx: state %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state); error = ENOTCONN; goto out_err; } if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) { error = EOVERFLOW; goto out_err; } mpts = mptcp_subflow_alloc(); if (mpts == NULL) { os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); error = ENOMEM; goto out_err; } if (src) { if (src->sa_family != AF_INET && src->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto out_err; } if (src->sa_family == AF_INET && src->sa_len != sizeof(struct sockaddr_in)) { error = EINVAL; goto out_err; } if (src->sa_family == AF_INET6 && src->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto out_err; } mpts->mpts_src = SA(alloc_sockaddr(src->sa_len, Z_WAITOK | Z_NOFAIL)); SOCKADDR_COPY(src, mpts->mpts_src, src->sa_len); } if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto out_err; } if (dst->sa_family == AF_INET && dst->sa_len != sizeof(mpts->__mpts_dst_v4)) { error = EINVAL; goto out_err; } if (dst->sa_family == AF_INET6 && dst->sa_len != sizeof(mpts->__mpts_dst_v6)) { error = EINVAL; goto out_err; } SOCKADDR_COPY(dst, &mpts->mpts_dst, dst->sa_len); af = mpts->mpts_dst.sa_family; ifnet_head_lock_shared(); if ((ifscope > (unsigned)if_index)) { ifnet_head_done(); error = ENXIO; goto out_err; } ifnet_head_done(); mpts->mpts_ifscope = ifscope; /* create the subflow socket */ if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) { /* * Returning (error) and not cleaning up, because up to here * all we did is creating mpts. * * And the contract is that the call to mptcp_subflow_socreate, * moves ownership of mpts to mptcp_subflow_socreate. */ return error; } /* * We may be called from within the kernel. Still need to account this * one to the real app. */ mptcp_update_last_owner(mpts->mpts_socket, mp_so); /* * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and * -1 (SAE_CONNID_ALL). 
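 *
 * Example: if mpte_connid_last wraps around to 0 (SAE_CONNID_ANY),
 * the check below advances it to 1, so a newly added subflow always
 * receives a valid connid.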
mpte->mpte_connid_last++; if (mpte->mpte_connid_last == SAE_CONNID_ALL || mpte->mpte_connid_last == SAE_CONNID_ANY) { mpte->mpte_connid_last++; } mpts->mpts_connid = mpte->mpte_connid_last; mpts->mpts_rel_seq = 1; /* Allocate a unique address id per subflow */ mpte->mpte_addrid_last++; if (mpte->mpte_addrid_last == 0) { mpte->mpte_addrid_last++; } /* register for subflow socket read/write events */ sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1); /* Register for subflow socket control events */ sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts, SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE | SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO | SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR); /* sanity check */ VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING))); /* * Indicate to the TCP subflow whether or not it should establish * the initial MPTCP connection, or join an existing one. Fill * in the connection request structure with additional info needed * by the underlying TCP (to be used in the TCP options, etc.) */ if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) { mpts->mpts_flags |= MPTSF_INITIAL_SUB; if (mp_tp->mpt_state == MPTCPS_CLOSED) { mptcp_init_local_parms(mpte, dst); } soisconnecting(mp_so); /* If fastopen is requested, set state in mpts */ if (so->so_flags1 & SOF1_PRECONNECT_DATA) { mpts->mpts_flags |= MPTSF_TFO_REQD; } } else { if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) { mpts->mpts_flags |= MPTSF_CONNECT_PENDING; } } mpts->mpts_flags |= MPTSF_CONNECTING; /* connect right away if first attempt, or if join can be done now */ if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) { error = mptcp_subflow_soconnectx(mpte, mpts); } if (error) { goto out_err_close; } if (pcid) { *pcid = mpts->mpts_connid; } return 0; out_err_close: mptcp_subflow_abort(mpts, error); return error; out_err: if (mpts) { mptcp_subflow_free(mpts); } return error; } void mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts) { int index = mptcpstats_get_index(stats, mpts); if (index != -1) { struct inpcb *inp = sotoinpcb(mpts->mpts_socket); stats[index].mpis_txbytes += inp->inp_stat->txbytes; stats[index].mpis_rxbytes += inp->inp_stat->rxbytes; stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes; stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes; stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes; stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes; stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes; stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes; } } /* * Delete/remove a subflow from an MPTCP session. The underlying subflow socket * will no longer be accessible after a subflow is deleted, thus this * should occur only after the subflow socket has been disconnected.
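 *
 * Note on reference accounting in the body below: the MPTCP socket's
 * so_usecount, bumped when the subflow was created, is dropped here,
 * and the two mptcp_subflow_remref() calls release the references
 * held by the subflow list and by the subflow socket respectively.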
*/ void mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts) { struct socket *mp_so = mptetoso(mpte); struct socket *so = mpts->mpts_socket; struct tcpcb *tp = sototcpcb(so); socket_lock_assert_owned(mp_so); VERIFY(mpts->mpts_mpte == mpte); VERIFY(mpte->mpte_numflows != 0); VERIFY(mp_so->so_usecount > 0); mptcpstats_update(mpte->mpte_itfstats, mpts); mptcp_unset_cellicon(mpte, mpts, 1); mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes; mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes; TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry); mpte->mpte_numflows--; if (mpte->mpte_active_sub == mpts) { mpte->mpte_active_sub = NULL; } /* * Drop references held by this subflow socket; there * will be no further upcalls made from this point. */ sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0); sock_catchevents_locked(so, NULL, NULL, 0); mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so); mp_so->so_usecount--; /* for subflow socket */ mpts->mpts_mpte = NULL; mpts->mpts_socket = NULL; mptcp_subflow_remref(mpts); /* for MPTCP subflow list */ mptcp_subflow_remref(mpts); /* for subflow socket */ so->so_flags &= ~SOF_MP_SUBFLOW; tp->t_mptcb = NULL; tp->t_mpsub = NULL; } void mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts) { struct socket *so = mpts->mpts_socket; struct mptcb *mp_tp = mpte->mpte_mptcb; int send_dfin = 0; if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) { send_dfin = 1; } if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) && (so->so_state & SS_ISCONNECTED)) { if (send_dfin) { mptcp_send_dfin(so); } soshutdownlock(so, SHUT_WR); } } static void mptcp_subflow_abort(struct mptsub *mpts, int error) { struct socket *so = mpts->mpts_socket; struct tcpcb *tp = sototcpcb(so); if (mpts->mpts_flags & MPTSF_DISCONNECTED) { return; } if (tp->t_state != TCPS_CLOSED) { tcp_drop(tp, error); } mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED); } /* * Disconnect a subflow socket. */ void mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts) { struct socket *so, *mp_so; struct mptcb *mp_tp; int send_dfin = 0; so = mpts->mpts_socket; mp_tp = mpte->mpte_mptcb; mp_so = mptetoso(mpte); socket_lock_assert_owned(mp_so); if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) { return; } mptcp_unset_cellicon(mpte, mpts, 1); mpts->mpts_flags |= MPTSF_DISCONNECTING; if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) { send_dfin = 1; } if (mp_so->so_flags & SOF_DEFUNCT) { errno_t ret; ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE); if (ret == 0) { ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); if (ret != 0) { os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret); } } else { os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret); } } if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) && (so->so_state & SS_ISCONNECTED)) { if (send_dfin) { mptcp_send_dfin(so); } (void) soshutdownlock(so, SHUT_RD); (void) soshutdownlock(so, SHUT_WR); (void) sodisconnectlocked(so); } /* * Generate a disconnect event for this subflow socket, in case * the lower layer doesn't do it; this is needed because the * subflow socket deletion relies on it. */ mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED); } /* * Subflow socket input. 
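 *
 * Drains the subflow's receive buffer via sock_receive_internal()
 * and hands the resulting mbuf chain to mptcp_input() for data-level
 * reassembly; an ENODATA error is deliberately propagated to the
 * MPTCP socket to expose broken middleboxes.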
*/ static void mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) { struct socket *mp_so = mptetoso(mpte); struct mbuf *m = NULL; struct socket *so; int error, wakeup = 0; VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT)); mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT; DTRACE_MPTCP2(subflow__input, struct mptses *, mpte, struct mptsub *, mpts); if (!(mpts->mpts_flags & MPTSF_CONNECTED)) { goto out; } so = mpts->mpts_socket; error = sock_receive_internal(so, NULL, &m, 0, NULL); if (error != 0 && error != EWOULDBLOCK) { os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error); if (error == ENODATA) { /* * Don't ignore ENODATA so as to discover * nasty middleboxes. */ mp_so->so_error = ENODATA; wakeup = 1; goto out; } } /* In fallback mode, accept data only on the active subflow; drop it on the others */ if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) && !(mpts->mpts_flags & MPTSF_ACTIVE)) { m_freem(m); goto out; } if (m != NULL) { if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) { mptcp_set_cellicon(mpte, mpts); mpte->mpte_used_cell = 1; } else { /* * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't * explicitly set the cellicon, then we unset it again. */ if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) { mptcp_unset_cellicon(mpte, NULL, 1); } mpte->mpte_used_wifi = 1; } mptcp_input(mpte, m); } out: if (wakeup) { mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP; } mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT); } void mptcp_handle_input(struct socket *so) { struct mptsub *mpts, *tmpts; struct mptses *mpte; if (!(so->so_flags & SOF_MP_SUBFLOW)) { return; } mpts = sototcpcb(so)->t_mpsub; mpte = mpts->mpts_mpte; socket_lock_assert_owned(mptetoso(mpte)); if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) { if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) { mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP; } return; } mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE; TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { if (mpts->mpts_socket->so_usecount == 0) { /* Will be removed soon by tcp_garbage_collect */ continue; } mptcp_subflow_addref(mpts); mpts->mpts_socket->so_usecount++; mptcp_subflow_input(mpte, mpts); mptcp_subflow_remref(mpts); /* ours */ VERIFY(mpts->mpts_socket->so_usecount != 0); mpts->mpts_socket->so_usecount--; } mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE); } static boolean_t mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so) { struct mbuf *so_m = so->so_snd.sb_mb; uint64_t dsn = m->m_pkthdr.mp_dsn; while (so_m) { VERIFY(so_m->m_flags & M_PKTHDR); VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP); /* Part of the segment is covered, don't reinject here */ if (so_m->m_pkthdr.mp_dsn <= dsn && so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) { return TRUE; } so_m = so_m->m_next; } return FALSE; } /* * Subflow socket output. * * Called for sending data from MPTCP to the underlying subflow socket.
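 *
 * In outline (a reading of the code below, not a normative spec):
 * pick the data source (reinject queue or MPTCP send buffer), drop
 * anything already acknowledged at the data level, copy at most
 * min(send window, cwnd space, 64KB) bytes with m_copym_mode(),
 * stamp every copied mbuf with one common DSS mapping
 * (mp_dsn/mp_rseq/mp_rlen), and push the chain down through the
 * subflow's pru_sosend.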
*/ int mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags) { struct mptcb *mp_tp = mpte->mpte_mptcb; struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL; struct socket *mp_so, *so; struct tcpcb *tp; uint64_t mpt_dsn = 0, off = 0; int sb_cc = 0, error = 0, wakeup = 0; uint16_t dss_csum; uint16_t tot_sent = 0; boolean_t reinjected = FALSE; mp_so = mptetoso(mpte); so = mpts->mpts_socket; tp = sototcpcb(so); socket_lock_assert_owned(mp_so); VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT)); mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT; VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so))); VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) || (mpts->mpts_flags & MPTSF_MP_DEGRADED) || (mpts->mpts_flags & MPTSF_TFO_REQD)); VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0); DTRACE_MPTCP2(subflow__output, struct mptses *, mpte, struct mptsub *, mpts); /* Remove Addr Option is not sent reliably as per I-D */ if (mpte->mpte_flags & MPTE_SND_REM_ADDR) { tp->t_rem_aid = mpte->mpte_lost_aid; tp->t_mpflags |= TMPF_SND_REM_ADDR; mpte->mpte_flags &= ~MPTE_SND_REM_ADDR; } /* * The mbuf chains containing the metadata (as well as pointing to * the user data sitting at the MPTCP output queue) would then be * sent down to the subflow socket. * * Some notes on data sequencing: * * a. Each mbuf must be a M_PKTHDR. * b. MPTCP metadata is stored in the mptcp_pktinfo structure * in the mbuf pkthdr structure. * c. Each mbuf containing the MPTCP metadata must have its * pkt_flags marked with the PKTF_MPTCP flag. */ if (mpte->mpte_reinjectq) { sb_mb = mpte->mpte_reinjectq; } else { sb_mb = mp_so->so_snd.sb_mb; } if (sb_mb == NULL) { os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1); /* Fix it to prevent looping */ if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) { mp_tp->mpt_sndnxt = mp_tp->mpt_snduna; } goto out; } VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP); if (sb_mb->m_pkthdr.mp_rlen == 0 && !(so->so_state & SS_ISCONNECTED) && (so->so_flags1 & SOF1_PRECONNECT_DATA)) { tp->t_mpflags |= TMPF_TFO_REQUEST; /* Opting to call pru_send as no mbuf at subflow level */ error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL, NULL, current_proc()); goto done_sending; } mpt_dsn = sb_mb->m_pkthdr.mp_dsn; /* First, drop acknowledged data */ if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) { os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier " "dsn %u suna %u reinject? 
%u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq); if (mpte->mpte_reinjectq) { mptcp_clean_reinjectq(mpte); } else { uint64_t len = 0; len = mp_tp->mpt_snduna - mpt_dsn; sbdrop(&mp_so->so_snd, (int)len); wakeup = 1; } } /* Check again because of above sbdrop */ if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) { os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto out; } /* * In degraded mode, we don't receive data acks, so force free * mbufs less than snd_nxt */ if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) && (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) && mp_so->so_snd.sb_mb) { mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn; if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) { uint64_t len = 0; len = mp_tp->mpt_snduna - mpt_dsn; sbdrop(&mp_so->so_snd, (int)len); wakeup = 1; os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna); } } if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) && !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) { mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC; so->so_flags1 |= SOF1_POST_FALLBACK_SYNC; } /* * Adjust the top level notion of next byte used for retransmissions * and sending FINs. */ if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) { mp_tp->mpt_sndnxt = mp_tp->mpt_snduna; } /* Now determine the offset from which to start transmitting data */ if (mpte->mpte_reinjectq) { sb_mb = mpte->mpte_reinjectq; } else { dont_reinject: sb_mb = mp_so->so_snd.sb_mb; } if (sb_mb == NULL) { os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto out; } if (sb_mb == mpte->mpte_reinjectq) { sb_cc = sb_mb->m_pkthdr.mp_rlen; off = 0; if (mptcp_search_seq_in_sub(sb_mb, so)) { if (mptcp_can_send_more(mp_tp, TRUE)) { goto dont_reinject; } error = ECANCELED; goto out; } reinjected = TRUE; } else if (flags & MPTCP_SUBOUT_PROBING) { sb_cc = sb_mb->m_pkthdr.mp_rlen; off = 0; } else { sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd); /* * With TFO, there might be no data at all, thus still go into this * code-path here. */ if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) || MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) { off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna; sb_cc -= off; } else { os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax); goto out; } } sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so)); if (sb_cc <= 0) { os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax, mptcp_subflow_cwnd_space(so)); } sb_cc = min(sb_cc, UINT16_MAX); /* * Create a DSN mapping for the data we are about to send. It all * has the same mapping. 
*/ if (reinjected) { mpt_dsn = sb_mb->m_pkthdr.mp_dsn; } else { mpt_dsn = mp_tp->mpt_snduna + off; } mpt_mbuf = sb_mb; while (mpt_mbuf && reinjected == FALSE && (mpt_mbuf->m_pkthdr.mp_rlen == 0 || mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) { off -= mpt_mbuf->m_pkthdr.mp_rlen; mpt_mbuf = mpt_mbuf->m_next; } VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)); head = tail = NULL; while (tot_sent < sb_cc) { int32_t mlen; mlen = mpt_mbuf->m_len; mlen -= off; mlen = MIN(mlen, sb_cc - tot_sent); if (mlen < 0) { os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen, (uint32_t)off, sb_cc, tot_sent); goto out; } if (mlen == 0) { goto next; } m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR); if (m == NULL) { os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); error = ENOBUFS; break; } /* Create a DSN mapping for the data (m_copym does it) */ VERIFY(m->m_flags & M_PKTHDR); VERIFY(m->m_next == NULL); m->m_pkthdr.pkt_flags |= PKTF_MPTCP; m->m_pkthdr.pkt_flags &= ~PKTF_MPSO; m->m_pkthdr.mp_dsn = mpt_dsn; m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq; m->m_pkthdr.len = mlen; if (head == NULL) { head = tail = m; } else { tail->m_next = m; tail = m; } tot_sent += mlen; off = 0; next: mpt_mbuf = mpt_mbuf->m_next; } if (reinjected) { if (sb_cc < sb_mb->m_pkthdr.mp_rlen) { struct mbuf *n = sb_mb; while (n) { n->m_pkthdr.mp_dsn += sb_cc; n->m_pkthdr.mp_rlen -= sb_cc; n = n->m_next; } m_adj(sb_mb, sb_cc); } else { mpte->mpte_reinjectq = sb_mb->m_nextpkt; m_freem(sb_mb); } } if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) { dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq, tot_sent); } /* Now, let's update rel-seq and the data-level length */ mpts->mpts_rel_seq += tot_sent; m = head; while (m) { if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) { m->m_pkthdr.mp_csum = dss_csum; } m->m_pkthdr.mp_rlen = tot_sent; m = m->m_next; } if (head != NULL) { if ((mpts->mpts_flags & MPTSF_TFO_REQD) && (tp->t_tfo_stats == 0)) { tp->t_mpflags |= TMPF_TFO_REQUEST; } error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0); head = NULL; } done_sending: if (error == 0 || (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) { uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent; if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) { tcpstat.tcps_mp_num_probes++; if ((uint32_t)tot_sent < mpts->mpts_maxseg) { mpts->mpts_probecnt += 1; } else { mpts->mpts_probecnt += tot_sent / mpts->mpts_maxseg; } } if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) { if (MPTCP_DATASEQ_HIGH32(new_sndnxt) > MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) { mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN; } mp_tp->mpt_sndnxt = new_sndnxt; } mptcp_cancel_timer(mp_tp, MPTT_REXMT); /* Must be here as mptcp_can_send_more() checks for this */ soclearfastopen(mp_so); if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) { mptcp_set_cellicon(mpte, mpts); mpte->mpte_used_cell = 1; } else { /* * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't * explicitly set the cellicon, then we unset it again. */ if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) { mptcp_unset_cellicon(mpte, NULL, 1); } mpte->mpte_used_wifi = 1; } /* * Don't propagate EWOULDBLOCK - it's already taken care of * in mptcp_usr_send for TFO. 
*/ error = 0; } else { /* We need to revert our change to mpts_rel_seq */ mpts->mpts_rel_seq -= tot_sent; os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat); } out: if (head != NULL) { m_freem(head); } if (wakeup) { mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP; } mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT); return error; } static void mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m) { struct mbuf *n, *prev = NULL; n = mpte->mpte_reinjectq; /* First, look for an mbuf n, whose data-sequence-number is greater * than or equal to m's sequence number. */ while (n) { if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) { break; } prev = n; n = n->m_nextpkt; } if (n) { /* m is already fully covered by the next mbuf in the queue */ if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn && n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) { os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen, m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen); goto dont_queue; } /* m is covering the next mbuf entirely, thus we remove this guy */ if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) { struct mbuf *tmp = n->m_nextpkt; os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen, (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen); m->m_nextpkt = NULL; if (prev == NULL) { mpte->mpte_reinjectq = tmp; } else { prev->m_nextpkt = tmp; } m_freem(n); n = tmp; } } if (prev) { /* m is already fully covered by the previous mbuf in the queue */ if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) { os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen); goto dont_queue; } } if (prev == NULL) { mpte->mpte_reinjectq = m; } else { prev->m_nextpkt = m; } m->m_nextpkt = n; return; dont_queue: m_freem(m); return; } static struct mbuf * mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn) { struct socket *mp_so = mptetoso(mpte); struct mbuf *m; m = mp_so->so_snd.sb_mb; while (m) { /* If this segment covers what we are looking for, return it.
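 * That is, the segment covers dsn exactly when
 * mp_dsn <= dsn < mp_dsn + mp_rlen, evaluated with the
 * MPTCP_SEQ_* macros so the test stays correct across 64-bit
 * sequence-number wraparound.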
*/ if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) && MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) { break; } /* Segment is no longer in the queue */ if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) { return NULL; } m = m->m_next; } return m; } static struct mbuf * mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len) { struct mbuf *top = NULL, *tail = NULL; uint64_t dsn; uint32_t dlen, rseq; dsn = m->m_pkthdr.mp_dsn; dlen = m->m_pkthdr.mp_rlen; rseq = m->m_pkthdr.mp_rseq; while (len > 0) { struct mbuf *n; VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)); n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR); if (n == NULL) { os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto err; } VERIFY(n->m_flags & M_PKTHDR); VERIFY(n->m_next == NULL); VERIFY(n->m_pkthdr.mp_dsn == dsn); VERIFY(n->m_pkthdr.mp_rlen == dlen); VERIFY(n->m_pkthdr.mp_rseq == rseq); VERIFY(n->m_len == m->m_len); n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP); if (top == NULL) { top = n; } if (tail != NULL) { tail->m_next = n; } tail = n; len -= m->m_len; m = m->m_next; } return top; err: if (top) { m_freem(top); } return NULL; } static void mptcp_reinject_mbufs(struct socket *so) { struct tcpcb *tp = sototcpcb(so); struct mptsub *mpts = tp->t_mpsub; struct mptcb *mp_tp = tptomptp(tp); struct mptses *mpte = mp_tp->mpt_mpte; struct sockbuf *sb = &so->so_snd; struct mbuf *m; m = sb->sb_mb; while (m) { struct mbuf *n = m->m_next, *orig = m; bool set_reinject_flag = false; VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)); if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) { goto next; } /* Has it all already been acknowledged at the data-level? */ if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) { goto next; } /* Part of this has already been acknowledged - look up the * segment in the MPTCP socket. */ if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) { m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn); if (m == NULL) { goto next; } } /* Copy the mbuf with headers (aka, DSN-numbers) */ m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen); if (m == NULL) { break; } VERIFY(m->m_nextpkt == NULL); /* Now, add to the reinject-queue, eliminating overlapping * segments */ mptcp_add_reinjectq(mpte, m); set_reinject_flag = true; orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ; next: /* mp_rlen can cover multiple mbufs, so advance to the end of it.
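 * Every trailing mbuf that shares the original's mp_dsn belongs to
 * the same mapping and, when the mapping was actually queued, is
 * tagged PKTF_MPTCP_REINJ so the next pass over the send buffer
 * skips it.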
*/ while (n) { VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP)); if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) { break; } if (set_reinject_flag) { n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ; } n = n->m_next; } m = n; } } void mptcp_clean_reinjectq(struct mptses *mpte) { struct mptcb *mp_tp = mpte->mpte_mptcb; socket_lock_assert_owned(mptetoso(mpte)); while (mpte->mpte_reinjectq) { struct mbuf *m = mpte->mpte_reinjectq; if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) || MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) { break; } mpte->mpte_reinjectq = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } } static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { struct socket *mp_so, *so; struct mptcb *mp_tp; mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; /* * We got an event for this subflow that might need to be propagated, * based on the state of the MPTCP connection. */ if (mp_tp->mpt_state < MPTCPS_ESTABLISHED || (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) || ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) { mp_so->so_error = so->so_error; *p_mpsofilt_hint |= event; } return MPTS_EVRET_OK; } /* * Handle SO_FILT_HINT_NOSRCADDR subflow socket event. */ static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { struct socket *mp_so; struct tcpcb *tp; mp_so = mptetoso(mpte); tp = intotcpcb(sotoinpcb(mpts->mpts_socket)); /* * This overwrites any previous mpte_lost_aid to avoid storing * too much state when the typical case has only two subflows. */ mpte->mpte_flags |= MPTE_SND_REM_ADDR; mpte->mpte_lost_aid = tp->t_local_aid; /* * The subflow connection has lost its source address. */ mptcp_subflow_abort(mpts, EADDRNOTAVAIL); if (mp_so->so_flags & SOF_NOADDRAVAIL) { mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event); } return MPTS_EVRET_DELETE; } static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event, p_mpsofilt_hint) struct socket *so, *mp_so; so = mpts->mpts_socket; if (so->so_error != ENODATA) { return MPTS_EVRET_OK; } mp_so = mptetoso(mpte); mp_so->so_error = ENODATA; sorwakeup(mp_so); sowwakeup(mp_so); return MPTS_EVRET_OK; } /* * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that * indicates that the remote side sent a Data FIN */ static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event, mpts) struct mptcb *mp_tp = mpte->mpte_mptcb; /* * We got a Data FIN for the MPTCP connection. * The FIN may arrive with data. The data is handed up to the * mptcp socket and the user is notified so that it may close * the socket if needed. 
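 *
 * Only the CLOSE_WAIT transition is surfaced as
 * SO_FILT_HINT_CANTRCVMORE below; the Data FIN sequence number
 * itself has already been consumed by the MPTCP input path by the
 * time this event is posted.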
*/ if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) { *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE; } return MPTS_EVRET_OK; /* keep the subflow socket around */ } /* * Handle SO_FILT_HINT_MPFAILOVER subflow socket event */ static ev_ret_t mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event, p_mpsofilt_hint) struct mptsub *mpts_alt = NULL; struct socket *alt_so = NULL; struct socket *mp_so; int altpath_exists = 0; mp_so = mptetoso(mpte); os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); mptcp_reinject_mbufs(mpts->mpts_socket); mpts_alt = mptcp_get_subflow(mpte, NULL); /* If there is no alternate eligible subflow, ignore the failover hint. */ if (mpts_alt == NULL || mpts_alt == mpts) { os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); goto done; } altpath_exists = 1; alt_so = mpts_alt->mpts_socket; if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) { /* All data acknowledged and no RTT spike */ if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) { mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER; } else { /* no alternate path available */ altpath_exists = 0; } } if (altpath_exists) { mpts_alt->mpts_flags |= MPTSF_ACTIVE; mpte->mpte_active_sub = mpts_alt; mpts->mpts_flags |= MPTSF_FAILINGOVER; mpts->mpts_flags &= ~MPTSF_ACTIVE; os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid); mptcpstats_inc_switch(mpte, mpts); sowwakeup(alt_so); } else { done: mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER; } return MPTS_EVRET_OK; } /* * Handle SO_FILT_HINT_IFDENIED subflow socket event. */ static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { /* * The subflow connection cannot use the outgoing interface, let's * close this subflow. 
*/ mptcp_subflow_abort(mpts, EPERM); mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event); return MPTS_EVRET_DELETE; } /* * https://tools.ietf.org/html/rfc6052#section-2 * https://tools.ietf.org/html/rfc6147#section-5.2 */ static boolean_t mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr, const struct ipv6_prefix *prefix, struct in_addr *addrv4) { char buf[MAX_IPv4_STR_LEN]; char *ptrv4 = (char *)addrv4; const char *ptr = (const char *)addr; if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) { return false; } switch (prefix->prefix_len) { case NAT64_PREFIX_LEN_96: memcpy(ptrv4, ptr + 12, 4); break; case NAT64_PREFIX_LEN_64: memcpy(ptrv4, ptr + 9, 4); break; case NAT64_PREFIX_LEN_56: memcpy(ptrv4, ptr + 7, 1); memcpy(ptrv4 + 1, ptr + 9, 3); break; case NAT64_PREFIX_LEN_48: memcpy(ptrv4, ptr + 6, 2); memcpy(ptrv4 + 2, ptr + 9, 2); break; case NAT64_PREFIX_LEN_40: memcpy(ptrv4, ptr + 5, 3); memcpy(ptrv4 + 3, ptr + 9, 1); break; case NAT64_PREFIX_LEN_32: memcpy(ptrv4, ptr + 4, 4); break; default: panic("NAT64-prefix len is wrong: %u", prefix->prefix_len); } os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf))); return true; } static void mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts) { struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; struct socket *so = mpts->mpts_socket; struct ifnet *ifp; int j; /* Subflow IPs will be steered directly by the server - no need to * desynthesize. */ if (mpte->mpte_flags & MPTE_UNICAST_IP) { return; } ifp = sotoinpcb(so)->inp_last_outifp; if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) { return; } for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) { int success; if (nat64prefixes[j].prefix_len == 0) { continue; } success = mptcp_desynthesize_ipv6_addr(mpte, &mpte->__mpte_dst_v6.sin6_addr, &nat64prefixes[j], &mpte->mpte_sub_dst_v4.sin_addr); if (success) { mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4); mpte->mpte_sub_dst_v4.sin_family = AF_INET; mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port; /* * We connected to a NAT64'ed address. Let's remove it * from the potential IPs to use. Whenever we are back on * that network and need to connect, we can synthesize again. * * Otherwise, on different IPv6 networks we will attempt * to connect to that NAT64 address... */ memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6)); break; } } } static void mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts) { struct inpcb *inp; if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) { return; } inp = sotoinpcb(mpts->mpts_socket); if (inp == NULL) { return; } /* Should we try the alternate port? 
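 * mpte_alternate_port is typically seeded from the
 * net.inet.mptcp.alternate_port sysctl (mptcp_alternate_port); if
 * this subflow was not already using it, we retry the same
 * destination on that port below, before writing the interface off
 * as not MPTCP-capable.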
*/ if (mpte->mpte_alternate_port && inp->inp_fport != mpte->mpte_alternate_port) { union sockaddr_in_4_6 dst; struct sockaddr_in *dst_in = SIN(&dst); SOCKADDR_COPY(&mpts->mpts_dst, &dst, mpts->mpts_dst.sa_len); dst_in->sin_port = mpte->mpte_alternate_port; mptcp_subflow_add(mpte, NULL, SA(&dst), mpts->mpts_ifscope, NULL); } else { /* Else, we tried all we could, mark this interface as non-MPTCP */ unsigned int i; if (inp->inp_last_outifp == NULL) { return; } for (i = 0; i < mpte->mpte_itfinfo_size; i++) { struct mpt_itf_info *info = &mpte->mpte_itfinfo[i]; if (inp->inp_last_outifp->if_index == info->ifindex) { info->no_mptcp_support = 1; break; } } } } /* If TFO data is successfully acked, it must be dropped from the mptcp so */ static void mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts) { struct socket *mp_so = mptetoso(mpte); struct socket *so = mpts->mpts_socket; struct tcpcb *tp = intotcpcb(sotoinpcb(so)); struct mptcb *mp_tp = mpte->mpte_mptcb; /* If data was sent with SYN, rewind state */ if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) { u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna; unsigned int tcp_droplen = tp->snd_una - tp->iss - 1; VERIFY(mp_droplen <= (UINT_MAX)); VERIFY(mp_droplen >= tcp_droplen); mpts->mpts_flags &= ~MPTSF_TFO_REQD; mpts->mpts_iss += tcp_droplen; tp->t_mpflags &= ~TMPF_TFO_REQUEST; if (mp_droplen > tcp_droplen) { /* handle partial TCP ack */ mp_so->so_flags1 |= SOF1_TFO_REWIND; mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen); mp_droplen = tcp_droplen; } else { /* all data on SYN was acked */ mpts->mpts_rel_seq = 1; mp_tp->mpt_sndnxt = mp_tp->mpt_snduna; } mp_tp->mpt_sndmax -= tcp_droplen; if (mp_droplen != 0) { VERIFY(mp_so->so_snd.sb_mb != NULL); sbdrop(&mp_so->so_snd, (int)mp_droplen); } } } /* * Handle SO_FILT_HINT_CONNECTED subflow socket event. */ static ev_ret_t mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event, p_mpsofilt_hint) struct socket *mp_so, *so; struct inpcb *inp; struct tcpcb *tp; struct mptcb *mp_tp; int af; boolean_t mpok = FALSE; mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; tp = sototcpcb(so); af = mpts->mpts_dst.sa_family; if (mpts->mpts_flags & MPTSF_CONNECTED) { return MPTS_EVRET_OK; } if ((mpts->mpts_flags & MPTSF_DISCONNECTED) || (mpts->mpts_flags & MPTSF_DISCONNECTING)) { return MPTS_EVRET_OK; } /* * The subflow connection has been connected. Find out whether it * is connected as regular TCP or as an MPTCP subflow. The idea is: * * a. If MPTCP connection is not yet established, then this must be * the first subflow connection. If MPTCP failed to negotiate, * fall back to regular TCP by degrading this subflow. * * b. If MPTCP connection has been established, then this must be * one of the subsequent subflow connections. If MPTCP failed * to negotiate, disconnect the connection. * * Right now, we simply unblock any waiters at the MPTCP socket layer * if the MPTCP connection has not been established. */ if (so->so_state & SS_ISDISCONNECTED) { /* * With MPTCP joins, a connection is connected at the subflow * level, but the 4th ACK from the server elevates the MPTCP * subflow to connected state. So there is a small window * where the subflow could get disconnected before the * connected event is processed.
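 *
 * Returning MPTS_EVRET_OK in that window leaves the subflow alone;
 * the eventual DISCONNECTED event, handled separately, performs the
 * actual cleanup.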
*/ return MPTS_EVRET_OK; } if (mpts->mpts_flags & MPTSF_TFO_REQD) { mptcp_drop_tfo_data(mpte, mpts); } mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD); mpts->mpts_flags |= MPTSF_CONNECTED; if (tp->t_mpflags & TMPF_MPTCP_TRUE) { mpts->mpts_flags |= MPTSF_MP_CAPABLE; } tp->t_mpflags &= ~TMPF_TFO_REQUEST; /* get/verify the outbound interface */ inp = sotoinpcb(so); mpts->mpts_maxseg = tp->t_maxseg; mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE); if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { mp_tp->mpt_state = MPTCPS_ESTABLISHED; mpte->mpte_associd = mpts->mpts_connid; DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, uint32_t, 0 /* event */); if (SOCK_DOM(so) == AF_INET) { in_getsockaddr_s(so, &mpte->__mpte_src_v4); } else { in6_getsockaddr_s(so, &mpte->__mpte_src_v6); } mpts->mpts_flags |= MPTSF_ACTIVE; /* case (a) above */ if (!mpok) { tcpstat.tcps_mpcap_fallback++; tp->t_mpflags |= TMPF_INFIN_SENT; mptcp_notify_mpfail(so); } else { if (IFNET_IS_CELLULAR(inp->inp_last_outifp) && mptcp_subflows_need_backup_flag(mpte)) { tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); } else { mpts->mpts_flags |= MPTSF_PREFERRED; } mpts->mpts_flags |= MPTSF_MPCAP_CTRSET; mpte->mpte_nummpcapflows++; if (SOCK_DOM(so) == AF_INET6) { mptcp_handle_ipv6_connection(mpte, mpts); } mptcp_check_subflows_and_add(mpte); if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) { mpte->mpte_initial_cell = 1; } mpte->mpte_handshake_success = 1; } mp_tp->mpt_sndwnd = tp->snd_wnd; mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt; mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna; soisconnected(mp_so); } else if (mpok) { /* * case (b) above * In case of additional flows, the MPTCP socket is not * MPTSF_MP_CAPABLE until an ACK is received from server * for 3-way handshake. TCP would have guaranteed that this * is an MPTCP subflow. */ if (IFNET_IS_CELLULAR(inp->inp_last_outifp) && !(tp->t_mpflags & TMPF_BACKUP_PATH) && mptcp_subflows_need_backup_flag(mpte)) { tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); mpts->mpts_flags &= ~MPTSF_PREFERRED; } else { mpts->mpts_flags |= MPTSF_PREFERRED; } mpts->mpts_flags |= MPTSF_MPCAP_CTRSET; mpte->mpte_nummpcapflows++; mpts->mpts_rel_seq = 1; mptcp_check_subflows_and_remove(mpte); } else { mptcp_try_alternate_port(mpte, mpts); tcpstat.tcps_join_fallback++; if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) { tcpstat.tcps_mptcp_cell_proxy++; } else { tcpstat.tcps_mptcp_wifi_proxy++; } soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); return MPTS_EVRET_OK; } /* This call, just to "book" an entry in the stats-table for this ifindex */ mptcpstats_get_index(mpte->mpte_itfstats, mpts); mptcp_output(mpte); return MPTS_EVRET_OK; /* keep the subflow socket around */ } /* * Handle SO_FILT_HINT_DISCONNECTED subflow socket event. */ static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event, p_mpsofilt_hint) struct socket *mp_so, *so; struct mptcb *mp_tp; mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; if (mpts->mpts_flags & MPTSF_DISCONNECTED) { return MPTS_EVRET_DELETE; } mpts->mpts_flags |= MPTSF_DISCONNECTED; /* The subflow connection has been disconnected. 
*/ if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) { mpte->mpte_nummpcapflows--; if (mpte->mpte_active_sub == mpts) { mpte->mpte_active_sub = NULL; } mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET; } else { if (so->so_flags & SOF_MP_SEC_SUBFLOW && !(mpts->mpts_flags & MPTSF_CONNECTED)) { mptcp_try_alternate_port(mpte, mpts); } } if (mp_tp->mpt_state < MPTCPS_ESTABLISHED || ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) { mptcp_drop(mpte, mp_tp, so->so_error); } /* * Clear flags that are used by getconninfo to return state. * Retain others, such as MPTSF_DELETEOK, for internal purposes. */ mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING | MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED | MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE); return MPTS_EVRET_DELETE; } /* * Handle SO_FILT_HINT_MPSTATUS subflow socket event */ static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event, p_mpsofilt_hint) ev_ret_t ret = MPTS_EVRET_OK; struct socket *mp_so, *so; struct mptcb *mp_tp; mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) { mpts->mpts_flags |= MPTSF_MP_CAPABLE; } else { mpts->mpts_flags &= ~MPTSF_MP_CAPABLE; } if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) { if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { goto done; } mpts->mpts_flags |= MPTSF_MP_DEGRADED; } else { mpts->mpts_flags &= ~MPTSF_MP_DEGRADED; } if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) { mpts->mpts_flags |= MPTSF_MP_READY; } else { mpts->mpts_flags &= ~MPTSF_MP_READY; } if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP; mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY; tcp_cache_update_mptcp_version(tp, FALSE); } if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { ret = MPTS_EVRET_DISCONNECT_FALLBACK; m_freem_list(mpte->mpte_reinjectq); mpte->mpte_reinjectq = NULL; } else if (mpts->mpts_flags & MPTSF_MP_READY) { mp_tp->mpt_flags |= MPTCPF_JOIN_READY; ret = MPTS_EVRET_CONNECT_PENDING; } done: return ret; } /* * Handle SO_FILT_HINT_MUSTRST subflow socket event */ static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event) struct socket *mp_so, *so; struct mptcb *mp_tp; boolean_t is_fastclose; mp_so = mptetoso(mpte); mp_tp = mpte->mpte_mptcb; so = mpts->mpts_socket; /* We got an invalid option or a fast close */ struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = NULL; tp = intotcpcb(inp); so->so_error = ECONNABORTED; is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV); tp->t_mpflags |= TMPF_RESET; if (tp->t_state != TCPS_CLOSED) { struct mbuf *m; struct tcptemp *t_template = tcp_maketemplate(tp, &m); if (t_template) { struct tcp_respond_args tra; bzero(&tra, sizeof(tra)); if (inp->inp_flags & INP_BOUND_IF) { tra.ifscope = inp->inp_boundifp->if_index; } else { tra.ifscope = IFSCOPE_NONE; } tra.awdl_unrestricted = 1; tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una, TH_RST, &tra); (void) m_free(m); } } if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) { struct mptsub *iter, *tmp; *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET; mp_so->so_error = ECONNRESET; TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) { if (iter == mpts) { continue; }
mptcp_subflow_abort(iter, ECONNABORTED); } /* * mptcp_drop is being called after processing the events, to fully * close the MPTCP connection */ mptcp_drop(mpte, mp_tp, mp_so->so_error); } mptcp_subflow_abort(mpts, ECONNABORTED); if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) { mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST; } return MPTS_EVRET_DELETE; } static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event) bool found_active = false; mpts->mpts_flags |= MPTSF_READ_STALL; TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { struct tcpcb *tp = sototcpcb(mpts->mpts_socket); if (!TCPS_HAVEESTABLISHED(tp->t_state) || TCPS_HAVERCVDFIN2(tp->t_state)) { continue; } if (!(mpts->mpts_flags & MPTSF_READ_STALL)) { found_active = true; break; } } if (!found_active) { *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO; } return MPTS_EVRET_OK; } static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event) { #pragma unused(event) bool found_active = false; mpts->mpts_flags |= MPTSF_WRITE_STALL; TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { struct tcpcb *tp = sototcpcb(mpts->mpts_socket); if (!TCPS_HAVEESTABLISHED(tp->t_state) || tp->t_state > TCPS_CLOSE_WAIT) { continue; } if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) { found_active = true; break; } } if (!found_active) { *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO; } return MPTS_EVRET_OK; } /* * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked, * caller must ensure that the option can be issued on subflow sockets, via * MPOF_SUBFLOW_OK flag. */ int mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo) { struct socket *mp_so, *so; struct sockopt sopt; int error; VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK); mp_so = mptetoso(mpte); so = mpts->mpts_socket; socket_lock_assert_owned(mp_so); if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED && mpo->mpo_level == SOL_SOCKET && mpo->mpo_name == SO_MARK_CELLFALLBACK) { struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope]; /* * When we open a new subflow, mark it as cell fallback, if * this subflow goes over cell. * * (except for first-party apps) */ if (mpte->mpte_flags & MPTE_FIRSTPARTY) { return 0; } if (sotoinpcb(so)->inp_last_outifp && !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) { return 0; } /* * This here is an OR, because if the app is not binding to the * interface, then it definitely is not a cell-fallback * connection. */ if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL || !IFNET_IS_CELLULAR(ifp)) { return 0; } } mpo->mpo_flags &= ~MPOF_INTERIM; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = mpo->mpo_level; sopt.sopt_name = mpo->mpo_name; sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval); sopt.sopt_valsize = sizeof(int); sopt.sopt_p = kernproc; error = sosetoptlock(so, &sopt, 0); if (error) { os_log_error(mptcp_log_handle, "%s - %lx: sopt %s " "val %d set error %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), mpo->mpo_intval, error); } return error; } /* * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked, * caller must ensure that the option can be issued on subflow sockets, via * MPOF_SUBFLOW_OK flag. 
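 *
 * Illustrative (hypothetical) usage, assuming an mptopt built for
 * SOL_SOCKET/SO_KEEPALIVE with MPOF_SUBFLOW_OK set; the subflow's
 * current value lands in mpo->mpo_intval:
 *
 *	error = mptcp_subflow_sogetopt(mpte, mpts->mpts_socket, mpo);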
*/ int mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so, struct mptopt *mpo) { struct socket *mp_so; struct sockopt sopt; int error; VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK); mp_so = mptetoso(mpte); socket_lock_assert_owned(mp_so); bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = mpo->mpo_level; sopt.sopt_name = mpo->mpo_name; sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval); sopt.sopt_valsize = sizeof(int); sopt.sopt_p = kernproc; error = sogetoptlock(so, &sopt, 0); /* already locked */ if (error) { os_log_error(mptcp_log_handle, "%s - %lx: sopt %s get error %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error); } return error; } /* * MPTCP garbage collector. * * This routine is called by the MP domain's on-demand periodic callout, * which is triggered when an MPTCP socket is closed. The callout will * repeat as long as this routine returns a non-zero value. */ static uint32_t mptcp_gc(struct mppcbinfo *mppi) { struct mppcb *mpp, *tmpp; uint32_t active = 0; LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED); TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) { struct socket *mp_so; struct mptses *mpte; struct mptcb *mp_tp; mp_so = mpp->mpp_socket; mpte = mptompte(mpp); mp_tp = mpte->mpte_mptcb; if (!mpp_try_lock(mpp)) { active++; continue; } VERIFY(mpp->mpp_flags & MPP_ATTACHED); /* check again under the lock */ if (mp_so->so_usecount > 0) { boolean_t wakeup = FALSE; struct mptsub *mpts, *tmpts; if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) { if (mp_tp->mpt_gc_ticks > 0) { mp_tp->mpt_gc_ticks--; } if (mp_tp->mpt_gc_ticks == 0) { wakeup = TRUE; } } if (wakeup) { TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { mptcp_subflow_eupcall1(mpts->mpts_socket, mpts, SO_FILT_HINT_DISCONNECTED); } } socket_unlock(mp_so, 0); active++; continue; } if (mpp->mpp_state != MPPCB_STATE_DEAD) { panic("%s - %lx: skipped state " "[u=%d,r=%d,s=%d]\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->so_usecount, mp_so->so_retaincnt, mpp->mpp_state); } if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) { mptcp_close(mpte, mp_tp); } mptcp_session_destroy(mpte); DTRACE_MPTCP4(dispose, struct socket *, mp_so, struct sockbuf *, &mp_so->so_rcv, struct sockbuf *, &mp_so->so_snd, struct mppcb *, mpp); mptcp_pcbdispose(mpp); sodealloc(mp_so); } return active; } /* * Drop an MPTCP connection, reporting the specified error. */ struct mptses * mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno) { struct socket *mp_so = mptetoso(mpte); VERIFY(mpte->mpte_mptcb == mp_tp); socket_lock_assert_owned(mp_so); DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, uint32_t, 0 /* event */); if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) { errno = mp_tp->mpt_softerror; } mp_so->so_error = errno; return mptcp_close(mpte, mp_tp); } /* * Close an MPTCP control block.
*/ struct mptses * mptcp_close(struct mptses *mpte, struct mptcb *mp_tp) { struct mptsub *mpts = NULL, *tmpts = NULL; struct socket *mp_so = mptetoso(mpte); socket_lock_assert_owned(mp_so); VERIFY(mpte->mpte_mptcb == mp_tp); mp_tp->mpt_state = MPTCPS_TERMINATE; mptcp_freeq(mp_tp); soisdisconnected(mp_so); /* Clean up all subflows */ TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { mptcp_subflow_disconnect(mpte, mpts); } return NULL; } void mptcp_notify_close(struct socket *so) { soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED)); } typedef struct mptcp_subflow_event_entry { uint32_t sofilt_hint_mask; ev_ret_t (*sofilt_hint_ev_hdlr)( struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint, uint32_t event); } mptsub_ev_entry_t; /* * XXX The order of the event handlers below is really * really important. Think twice before changing it. */ static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = { { .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR, .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE, .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER, .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_CONNRESET, .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_MUSTRST, .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE, .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT, .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR, .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_IFDENIED, .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_CONNECTED, .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS, .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED, .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO, .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev, }, { .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO, .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev, }, }; /* * Subflow socket control events. * * Called for handling events related to the underlying subflow socket. */ static ev_ret_t mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts, uint32_t *p_mpsofilt_hint) { ev_ret_t ret = MPTS_EVRET_OK; int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) / sizeof(mpsub_ev_entry_tbl[0]); /* bail if there's nothing to process */ if (!mpts->mpts_evctl) { return ret; } if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST | SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED | SO_FILT_HINT_DISCONNECTED)) { mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER; } DTRACE_MPTCP3(subflow__events, struct mptses *, mpte, struct mptsub *, mpts, uint32_t, mpts->mpts_evctl); /* * Process all the socket filter hints and reset the hint * once it is handled */ for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) { /* * Always execute the DISCONNECTED event, because it will wakeup * the app. 
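 *
 * Concretely: once a handler returns a value below MPTS_EVRET_OK,
 * the remaining hints stay pending for the next round, with
 * SO_FILT_HINT_DISCONNECTED as the sole exception so that a blocked
 * application is still woken up.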
*/ if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) && (ret >= MPTS_EVRET_OK || mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) { mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask; ev_ret_t error = mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask); ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); } } return ret; } /* * MPTCP workloop. */ void mptcp_subflow_workloop(struct mptses *mpte) { boolean_t connect_pending = FALSE, disconnect_fallback = FALSE; uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED; struct mptsub *mpts, *tmpts; struct socket *mp_so; mp_so = mptetoso(mpte); socket_lock_assert_owned(mp_so); if (mpte->mpte_flags & MPTE_IN_WORKLOOP) { mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH; return; } mpte->mpte_flags |= MPTE_IN_WORKLOOP; relaunch: mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH; TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { ev_ret_t ret; if (mpts->mpts_socket->so_usecount == 0) { /* Will be removed soon by tcp_garbage_collect */ continue; } mptcp_subflow_addref(mpts); mpts->mpts_socket->so_usecount++; ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask); /* * If MPTCP socket is closed, disconnect all subflows. * This will generate a disconnect event which will * be handled during the next iteration, causing a * non-zero error to be returned above. */ if (mp_so->so_flags & SOF_PCBCLEARING) { mptcp_subflow_disconnect(mpte, mpts); } switch (ret) { case MPTS_EVRET_OK: /* nothing to do */ break; case MPTS_EVRET_DELETE: mptcp_subflow_soclose(mpts); break; case MPTS_EVRET_CONNECT_PENDING: connect_pending = TRUE; break; case MPTS_EVRET_DISCONNECT_FALLBACK: disconnect_fallback = TRUE; break; default: break; } mptcp_subflow_remref(mpts); /* ours */ VERIFY(mpts->mpts_socket->so_usecount != 0); mpts->mpts_socket->so_usecount--; } if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) { VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED); if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) { mp_so->so_state |= SS_CANTRCVMORE; sorwakeup(mp_so); } soevent(mp_so, mpsofilt_hint_mask); } if (!connect_pending && !disconnect_fallback) { goto exit; } TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { if (disconnect_fallback) { struct socket *so = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { continue; } mpts->mpts_flags |= MPTSF_MP_DEGRADED; if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) { continue; } so = mpts->mpts_socket; /* * The MPTCP connection has degraded to a fallback * mode, so there is no point in keeping this subflow * regardless of its MPTCP-readiness state, unless it * is the primary one which we use for fallback. This * assumes that the subflow used for fallback is the * ACTIVE one. */ inp = sotoinpcb(so); tp = intotcpcb(inp); tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE); tp->t_mpflags |= TMPF_TCP_FALLBACK; soevent(so, SO_FILT_HINT_MUSTRST); } else if (connect_pending) { /* * The MPTCP connection has progressed to a state * where it supports full multipath semantics; allow * additional joins to be attempted for all subflows * that are in the PENDING state. 
*/ if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) { int error = mptcp_subflow_soconnectx(mpte, mpts); if (error) { mptcp_subflow_abort(mpts, error); } } } } exit: if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) { goto relaunch; } mpte->mpte_flags &= ~MPTE_IN_WORKLOOP; } /* * Protocol pr_lock callback. */ int mptcp_lock(struct socket *mp_so, int refcount, void *lr) { struct mppcb *mpp = mpsotomppcb(mp_so); void *lr_saved; if (lr == NULL) { lr_saved = __builtin_return_address(0); } else { lr_saved = lr; } if (mpp == NULL) { panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__, mp_so, lr_saved, solockhistory_nr(mp_so)); /* NOTREACHED */ } mpp_lock(mpp); if (mp_so->so_usecount < 0) { panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__, mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount, solockhistory_nr(mp_so)); /* NOTREACHED */ } if (refcount != 0) { mp_so->so_usecount++; mpp->mpp_inside++; } mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved; mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX; return 0; } /* * Protocol pr_unlock callback. */ int mptcp_unlock(struct socket *mp_so, int refcount, void *lr) { struct mppcb *mpp = mpsotomppcb(mp_so); void *lr_saved; if (lr == NULL) { lr_saved = __builtin_return_address(0); } else { lr_saved = lr; } if (mpp == NULL) { panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__, mp_so, mp_so->so_usecount, lr_saved, solockhistory_nr(mp_so)); /* NOTREACHED */ } socket_lock_assert_owned(mp_so); if (refcount != 0) { mp_so->so_usecount--; mpp->mpp_inside--; } if (mp_so->so_usecount < 0) { panic("%s: so=%p usecount=%x lrh= %s", __func__, mp_so, mp_so->so_usecount, solockhistory_nr(mp_so)); /* NOTREACHED */ } if (mpp->mpp_inside < 0) { panic("%s: mpp=%p inside=%x lrh= %s", __func__, mpp, mpp->mpp_inside, solockhistory_nr(mp_so)); /* NOTREACHED */ } mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved; mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX; mpp_unlock(mpp); return 0; } /* * Protocol pr_getlock callback. 
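 *
 * An illustrative (hypothetical) use by a caller that only needs the
 * underlying mutex:
 *
 *	lck_mtx_t *mtx = mptcp_getlock(mp_so, 0);
 *	LCK_MTX_ASSERT(mtx, LCK_MTX_ASSERT_OWNED);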
*/ lck_mtx_t * mptcp_getlock(struct socket *mp_so, int flags) { struct mppcb *mpp = mpsotomppcb(mp_so); if (mpp == NULL) { panic("%s: so=%p NULL so_pcb %s", __func__, mp_so, solockhistory_nr(mp_so)); /* NOTREACHED */ } if (mp_so->so_usecount < 0) { panic("%s: so=%p usecount=%x lrh= %s", __func__, mp_so, mp_so->so_usecount, solockhistory_nr(mp_so)); /* NOTREACHED */ } return mpp_getlock(mpp, flags); } void mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand, u_int32_t *rrand) { struct mptcp_subf_auth_entry *sauth_entry; LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) { if (sauth_entry->msae_laddr_id == addr_id) { if (lrand) { *lrand = sauth_entry->msae_laddr_rand; } if (rrand) { *rrand = sauth_entry->msae_raddr_rand; } break; } } } void mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp, mptcp_addr_id raddr_id, u_int32_t raddr_rand) { struct mptcp_subf_auth_entry *sauth_entry; LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) { if (sauth_entry->msae_laddr_id == laddr_id) { if ((sauth_entry->msae_raddr_id != 0) && (sauth_entry->msae_raddr_id != raddr_id)) { os_log_error(mptcp_log_handle, "%s - %lx: mismatched" " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), raddr_id, sauth_entry->msae_raddr_id); return; } sauth_entry->msae_raddr_id = raddr_id; if ((sauth_entry->msae_raddr_rand != 0) && (sauth_entry->msae_raddr_rand != raddr_rand)) { os_log_error(mptcp_log_handle, "%s - %lx: " "dup SYN_ACK %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), raddr_rand, sauth_entry->msae_raddr_rand); return; } sauth_entry->msae_raddr_rand = raddr_rand; return; } } } /* * SHA-256 support for MPTCP */ static void mptcp_do_sha256(mptcp_key_t *key, char *sha_digest) { const unsigned char *sha2_base; int sha2_size; sha2_base = (const unsigned char *) key; sha2_size = sizeof(mptcp_key_t); SHA256_CTX sha_ctx; SHA256_Init(&sha_ctx); SHA256_Update(&sha_ctx, sha2_base, sha2_size); SHA256_Final(sha_digest, &sha_ctx); } void mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2, u_char *msg, uint16_t msg_len, u_char *digest) { SHA256_CTX sha_ctx; mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */ mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */ int i; bzero(digest, SHA256_DIGEST_LENGTH); /* Set up the Key for HMAC */ key_ipad[0] = key1; key_ipad[1] = key2; key_opad[0] = key1; key_opad[1] = key2; /* Key fits in a single 512-bit block, so no need to hash it first */ /* Compute SHA256(Key XOR opad, SHA256(Key XOR ipad, data)) */ for (i = 0; i < 8; i++) { key_ipad[i] ^= 0x3636363636363636; key_opad[i] ^= 0x5c5c5c5c5c5c5c5c; } /* Perform inner SHA256 */ SHA256_Init(&sha_ctx); SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad)); SHA256_Update(&sha_ctx, msg, msg_len); SHA256_Final(digest, &sha_ctx); /* Perform outer SHA256 */ SHA256_Init(&sha_ctx); SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad)); SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH); SHA256_Final(digest, &sha_ctx); } /* * SHA1 support for MPTCP */ static void mptcp_do_sha1(mptcp_key_t *key, char *sha_digest) { SHA1_CTX sha1ctxt; const unsigned char *sha1_base; int sha1_size; sha1_base = (const unsigned char *) key; sha1_size = sizeof(mptcp_key_t); SHA1Init(&sha1ctxt); SHA1Update(&sha1ctxt, sha1_base, sha1_size); SHA1Final(sha_digest, &sha1ctxt); } void mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2, u_int32_t rand1, u_int32_t rand2, u_char *digest) { SHA1_CTX sha1ctxt;
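	/*
	 * Sketch of what follows (RFC 2104 HMAC): with K = key1 || key2,
	 * zero-padded to the 64-byte SHA-1 block, and M = rand1 || rand2,
	 * compute digest = SHA1((K ^ opad) || SHA1((K ^ ipad) || M)).
	 */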
mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */ mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */ u_int32_t data[2]; int i; bzero(digest, SHA1_RESULTLEN); /* Set up the Key for HMAC */ key_ipad[0] = key1; key_ipad[1] = key2; key_opad[0] = key1; key_opad[1] = key2; /* Set up the message for HMAC */ data[0] = rand1; data[1] = rand2; /* Key fits in a single 512-bit block, so no need to hash it first */ /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */ for (i = 0; i < 8; i++) { key_ipad[i] ^= 0x3636363636363636; key_opad[i] ^= 0x5c5c5c5c5c5c5c5c; } /* Perform inner SHA1 */ SHA1Init(&sha1ctxt); SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad)); SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data)); SHA1Final(digest, &sha1ctxt); /* Perform outer SHA1 */ SHA1Init(&sha1ctxt); SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad)); SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN); SHA1Final(digest, &sha1ctxt); } /* * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A)) * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B)) */ void mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len) { uint32_t lrand, rrand; lrand = rrand = 0; mptcp_get_rands(aid, mp_tp, &lrand, &rrand); u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0}; if (mp_tp->mpt_version == MPTCP_VERSION_0) { mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest); } else { uint32_t data[2]; data[0] = lrand; data[1] = rrand; mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest); } bcopy(full_digest, digest, digest_len); } /* * Authentication data generation */ static void mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token, int token_len) { VERIFY(token_len == sizeof(u_int32_t)); VERIFY(sha_digest_len == SHA1_RESULTLEN || sha_digest_len == SHA256_DIGEST_LENGTH); /* Most significant 32 bits of the SHA1/SHA256 hash */ bcopy(sha_digest, token, sizeof(u_int32_t)); return; } static void mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn, int idsn_len, uint8_t mp_version) { VERIFY(idsn_len == sizeof(u_int64_t)); VERIFY(sha_digest_len == SHA1_RESULTLEN || sha_digest_len == SHA256_DIGEST_LENGTH); VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1); /* * Least significant 64 bits of the hash */ if (mp_version == MPTCP_VERSION_0) { idsn[7] = sha_digest[12]; idsn[6] = sha_digest[13]; idsn[5] = sha_digest[14]; idsn[4] = sha_digest[15]; idsn[3] = sha_digest[16]; idsn[2] = sha_digest[17]; idsn[1] = sha_digest[18]; idsn[0] = sha_digest[19]; } else { idsn[7] = sha_digest[24]; idsn[6] = sha_digest[25]; idsn[5] = sha_digest[26]; idsn[4] = sha_digest[27]; idsn[3] = sha_digest[28]; idsn[2] = sha_digest[29]; idsn[1] = sha_digest[30]; idsn[0] = sha_digest[31]; } return; } static void mptcp_conn_properties(struct mptcb *mp_tp) { /* Set DSS checksum flag */ if (mptcp_dss_csum) { mp_tp->mpt_flags |= MPTCPF_CHECKSUM; } /* Set up receive window */ mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp); /* Set up gc ticks */ mp_tp->mpt_gc_ticks = MPT_GC_TICKS; } static void mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst) { struct mptcb *mp_tp = mpte->mpte_mptcb; char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)]; uint16_t digest_len; if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) { mp_tp->mpt_version = MPTCP_VERSION_0; } else if (mpte->mpte_flags & MPTE_FORCE_V1 &&
mptcp_enable_v1) { mp_tp->mpt_version = MPTCP_VERSION_1; } else { mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst); } VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 || mp_tp->mpt_version == MPTCP_VERSION_1); read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey)); if (mp_tp->mpt_version == MPTCP_VERSION_0) { digest_len = SHA1_RESULTLEN; mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest); } else { digest_len = SHA256_DIGEST_LENGTH; mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest); } mptcp_generate_token(key_digest, digest_len, (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken)); mptcp_generate_idsn(key_digest, digest_len, (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version); /* The subflow SYN is also the first MPTCP byte */ mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1; mp_tp->mpt_sndnxt = mp_tp->mpt_snduna; mptcp_conn_properties(mp_tp); } int mptcp_init_remote_parms(struct mptcb *mp_tp) { /* Set up local and remote tokens and initial DSNs */ char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)]; uint16_t digest_len; if (mp_tp->mpt_version == MPTCP_VERSION_0) { digest_len = SHA1_RESULTLEN; mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest); } else if (mp_tp->mpt_version == MPTCP_VERSION_1) { digest_len = SHA256_DIGEST_LENGTH; mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest); } else { return -1; } mptcp_generate_token(remote_digest, digest_len, (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken)); mptcp_generate_idsn(remote_digest, digest_len, (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version); mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1; mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd; return 0; } static void mptcp_send_dfin(struct socket *so) { struct tcpcb *tp = NULL; struct inpcb *inp = NULL; inp = sotoinpcb(so); if (!inp) { return; } tp = intotcpcb(inp); if (!tp) { return; } if (!(tp->t_mpflags & TMPF_RESET)) { tp->t_mpflags |= TMPF_SEND_DFIN; } } /* * Data Sequence Mapping routines */ void mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m) { struct mptcb *mp_tp; if (m == NULL) { return; } mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb; while (m) { VERIFY(m->m_flags & M_PKTHDR); m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO); m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax; VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX); m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m); mp_tp->mpt_sndmax += m_pktlen(m); m = m->m_next; } } void mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len) { struct mptcb *mp_tp = tptomptp(sototcpcb(so)); uint64_t data_ack; uint64_t dsn; VERIFY(len >= 0); if (!m || len == 0) { return; } while (m && len > 0) { VERIFY(m->m_flags & M_PKTHDR); VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP); data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen; dsn = m->m_pkthdr.mp_dsn; len -= m->m_len; m = m->m_next; } if (m && len == 0) { /* * If there is one more mbuf in the chain, it automatically means * that up to m->mp_dsn has been ack'ed. * * This means we actually correct data_ack back down (compared * to what we set inside the loop - dsn + data_len). Because in * the loop we are "optimistic" and assume that the full mapping * will be acked. If that's not the case and we get out of the * loop with m != NULL, it means only up to m->mp_dsn has been * really acked.
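 *
 * A worked example with hypothetical numbers: a 1000-byte mapping at
 * DSN 100 whose first mbuf carries 600 bytes. Dropping those 600 bytes
 * leaves m at the next mbuf with len == 0, so data_ack is corrected
 * from the optimistic 1100 back down to 700, that mbuf's mp_dsn.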
*/ data_ack = m->m_pkthdr.mp_dsn; } if (len < 0) { /* * If len is negative, meaning we acked in the middle of an mbuf, * only up to this mbuf's data-sequence number has been acked * at the MPTCP level. */ data_ack = dsn; } /* We can have data in the subflow's send-queue that is being acked, * while the DATA_ACK has already advanced. Thus, we should check whether * or not the DATA_ACK is actually new here. */ if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) && MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) { mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack); } } void mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len) { int rewinding = 0; /* TFO makes things complicated. */ if (so->so_flags1 & SOF1_TFO_REWIND) { rewinding = 1; so->so_flags1 &= ~SOF1_TFO_REWIND; } while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) { u_int32_t sub_len; VERIFY(m->m_flags & M_PKTHDR); VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP); sub_len = m->m_pkthdr.mp_rlen; if (sub_len < len) { m->m_pkthdr.mp_dsn += sub_len; if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) { m->m_pkthdr.mp_rseq += sub_len; } m->m_pkthdr.mp_rlen = 0; len -= sub_len; } else { /* sub_len >= len */ if (rewinding == 0) { m->m_pkthdr.mp_dsn += len; } if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) { if (rewinding == 0) { m->m_pkthdr.mp_rseq += len; } } m->m_pkthdr.mp_rlen -= len; break; } m = m->m_next; } if (so->so_flags & SOF_MP_SUBFLOW && !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) && !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) { /* * Received an ack without receiving a DATA_ACK. * Need to fall back to regular TCP (or destroy this subflow). */ sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT; mptcp_notify_mpfail(so); } } /* Obtain the DSN mapping stored in the mbuf */ void mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum) { u_int64_t dsn64; mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum); *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64); } void mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum) { struct mbuf *m = so->so_snd.sb_mb; VERIFY(off >= 0); if (m == NULL && (so->so_flags & SOF_DEFUNCT)) { *dsn = 0; *relseq = 0; *data_len = 0; *dss_csum = 0; return; } /* * In the subflow socket, the DSN sequencing can be discontiguous, * but the subflow sequence mapping is contiguous. Use the subflow * sequence property to find the right mbuf and corresponding dsn * mapping. */ while (m) { VERIFY(m->m_flags & M_PKTHDR); VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP); if (off >= m->m_len) { off -= m->m_len; m = m->m_next; } else { break; } } VERIFY(off >= 0); VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX); *dsn = m->m_pkthdr.mp_dsn; *relseq = m->m_pkthdr.mp_rseq; *data_len = m->m_pkthdr.mp_rlen; *dss_csum = m->m_pkthdr.mp_csum; } void mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum) { uint64_t dsn; uint32_t relseq; mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum); } /* * Note that this is called only from tcp_input() via mptcp_input_preproc(). * tcp_input() may trim data after the dsn mapping is inserted into the mbuf. * When it trims data, tcp_input calls m_adj(), which does not remove the * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
* The dsn map insertion cannot be delayed after trim, because data can be in * the reassembly queue for a while and the DSN option info in tp will be * overwritten for every new packet received. * The dsn map will be adjusted just prior to appending to subflow sockbuf * with mptcp_adj_rmap() */ void mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) { VERIFY(m->m_flags & M_PKTHDR); VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)); if (tp->t_mpflags & TMPF_EMBED_DSN) { m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn; m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq; m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len; m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum; if (tp->t_rcv_map.mpt_dfin) { m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN; } m->m_pkthdr.pkt_flags |= PKTF_MPTCP; tp->t_mpflags &= ~TMPF_EMBED_DSN; tp->t_mpflags |= TMPF_MPTCP_ACKNOW; } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) { if (th->th_flags & TH_FIN) { m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN; } } } /* * Following routines help with failure detection and failover of data * transfer from one subflow to another. */ void mptcp_act_on_txfail(struct socket *so) { struct tcpcb *tp = NULL; struct inpcb *inp = sotoinpcb(so); if (inp == NULL) { return; } tp = intotcpcb(inp); if (tp == NULL) { return; } if (so->so_flags & SOF_MP_TRYFAILOVER) { return; } so->so_flags |= SOF_MP_TRYFAILOVER; soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER)); } /* * Support for MP_FAIL option */ int mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq) { struct mbuf *m = so->so_snd.sb_mb; uint16_t datalen; uint64_t dsn; int off = 0; if (m == NULL) { return -1; } while (m != NULL) { VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP); VERIFY(m->m_flags & M_PKTHDR); dsn = m->m_pkthdr.mp_dsn; datalen = m->m_pkthdr.mp_rlen; if (MPTCP_SEQ_LEQ(dsn, dsn_fail) && (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) { off = (int)(dsn_fail - dsn); *tcp_seq = m->m_pkthdr.mp_rseq + off; return 0; } m = m->m_next; } /* * If there was no mbuf data and a fallback to TCP occurred, there's * not much else to do. */ os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail); return -1; } /* * Support for sending contiguous MPTCP bytes in subflow * Also for preventing sending data with ACK in 3-way handshake */ int32_t mptcp_adj_sendlen(struct socket *so, int32_t off) { struct tcpcb *tp = sototcpcb(so); struct mptsub *mpts = tp->t_mpsub; uint64_t mdss_dsn; uint32_t mdss_subflow_seq; int mdss_subflow_off; uint16_t mdss_data_len; uint16_t dss_csum; if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) { return 0; } mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq, &mdss_data_len, &dss_csum); /* * We need to compute how much of the mapping still remains. * So, we compute the offset in the send-buffer of the dss-sub-seq. */ mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una; /* * When TFO is used, we are sending the mpts->mpts_iss although the relative * seq has been set to 1 (while it should be 0). 
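 * The decrement below compensates for that one-byte shift, so the
 * remaining-mapping arithmetic stays correct for TFO connections.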
*/ if (tp->t_mpflags & TMPF_TFO_REQUEST) { mdss_subflow_off--; } VERIFY(off >= mdss_subflow_off); return mdss_data_len - (off - mdss_subflow_off); } static uint32_t mptcp_get_maxseg(struct mptses *mpte) { struct mptsub *mpts; uint32_t maxseg = 0; TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { struct tcpcb *tp = sototcpcb(mpts->mpts_socket); if (!TCPS_HAVEESTABLISHED(tp->t_state) || TCPS_HAVERCVDFIN2(tp->t_state)) { continue; } if (tp->t_maxseg > maxseg) { maxseg = tp->t_maxseg; } } return maxseg; } static uint8_t mptcp_get_rcvscale(struct mptses *mpte) { struct mptsub *mpts; uint8_t rcvscale = UINT8_MAX; TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { struct tcpcb *tp = sototcpcb(mpts->mpts_socket); if (!TCPS_HAVEESTABLISHED(tp->t_state) || TCPS_HAVERCVDFIN2(tp->t_state)) { continue; } if (tp->rcv_scale < rcvscale) { rcvscale = tp->rcv_scale; } } return rcvscale; } /* Similar to tcp_sbrcv_reserve */ static void mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv, u_int32_t newsize, u_int32_t idealsize) { uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte); if (rcvscale == UINT8_MAX) { return; } /* newsize should not exceed max */ newsize = min(newsize, tcp_autorcvbuf_max); /* The receive window scale negotiated at the * beginning of the connection will also set a * limit on the socket buffer size */ newsize = min(newsize, TCP_MAXWIN << rcvscale); /* Set new socket buffer size */ if (newsize > sbrcv->sb_hiwat && (sbreserve(sbrcv, newsize) == 1)) { sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize, (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max); /* Again check the limit set by the advertised * window scale */ sbrcv->sb_idealsize = min(sbrcv->sb_idealsize, TCP_MAXWIN << rcvscale); } } void mptcp_sbrcv_grow(struct mptcb *mp_tp) { struct mptses *mpte = mp_tp->mpt_mpte; struct socket *mp_so = mpte->mpte_mppcb->mpp_socket; struct sockbuf *sbrcv = &mp_so->so_rcv; uint32_t hiwat_sum = 0; uint32_t ideal_sum = 0; struct mptsub *mpts; /* * Do not grow the receive socket buffer if * - auto resizing is disabled, globally or on this socket * - the high water mark already reached the maximum * - the stream is in background and receive side is being * throttled * - if there are segments in reassembly queue indicating loss, * there is no need to increase the recv window during recovery * as more data is not going to be sent. A duplicate ack sent * during recovery should not change the receive window */ if (tcp_do_autorcvbuf == 0 || (sbrcv->sb_flags & SB_AUTOSIZE) == 0 || sbrcv->sb_hiwat >= tcp_autorcvbuf_max || (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) || !LIST_EMPTY(&mp_tp->mpt_segq)) { /* Cannot resize the socket buffer, just return */ return; } /* * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2) * * But, for this we first need accurate receiver-RTT estimations, which * we currently don't have. * * Let's use a dummy algorithm for now, just taking the sum of all * subflow's receive-buffers. It's too low, but that's all we can get * for now. */ TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat; ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize; } mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum); } /* * Determine if we can grow the receive socket buffer to avoid sending * a zero window update to the peer. We allow even socket buffers that * have fixed size (set by the application) to grow if the resource * constraints are met. They will also be trimmed after the application * reads data.
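 *
 * Illustrative (hypothetical) numbers: with a subflow t_maxseg of 1448,
 * the increment used below is 1448 << 4 = 23168 bytes, so the buffer is
 * grown once its free space falls under ~23 KB, subject to
 * tcp_autorcvbuf_max and the sb_idealsize bound.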
* * Similar to tcp_sbrcv_grow_rwin */ static void mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb) { struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket; u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4; u_int32_t rcvbuf = sb->sb_hiwat; if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) { return; } if (tcp_do_autorcvbuf == 1 && /* Diff to tcp_sbrcv_grow_rwin */ (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 && (rcvbuf - sb->sb_cc) < rcvbufinc && rcvbuf < tcp_autorcvbuf_max && (sb->sb_idealsize > 0 && sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) { sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max)); } } /* Similar to tcp_sbspace */ int32_t mptcp_sbspace(struct mptcb *mp_tp) { struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv; uint32_t rcvbuf; int32_t space; int32_t pending = 0; socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); mptcp_sbrcv_grow_rwin(mp_tp, sb); /* hiwat might have changed */ rcvbuf = sb->sb_hiwat; space = ((int32_t) imin((rcvbuf - sb->sb_cc), (sb->sb_mbmax - sb->sb_mbcnt))); if (space < 0) { space = 0; } #if CONTENT_FILTER /* Compensate for data being processed by content filters */ pending = cfil_sock_data_space(sb); #endif /* CONTENT_FILTER */ if (pending > space) { space = 0; } else { space -= pending; } return space; } /* * Support Fallback to Regular TCP */ void mptcp_notify_mpready(struct socket *so) { struct tcpcb *tp = NULL; if (so == NULL) { return; } tp = intotcpcb(sotoinpcb(so)); if (tp == NULL) { return; } DTRACE_MPTCP4(multipath__ready, struct socket *, so, struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd, struct tcpcb *, tp); if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) { return; } if (tp->t_mpflags & TMPF_MPTCP_READY) { return; } tp->t_mpflags &= ~TMPF_TCP_FALLBACK; tp->t_mpflags |= TMPF_MPTCP_READY; soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS)); } void mptcp_notify_mpfail(struct socket *so) { struct tcpcb *tp = NULL; if (so == NULL) { return; } tp = intotcpcb(sotoinpcb(so)); if (tp == NULL) { return; } DTRACE_MPTCP4(multipath__failed, struct socket *, so, struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd, struct tcpcb *, tp); if (tp->t_mpflags & TMPF_TCP_FALLBACK) { return; } tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE); tp->t_mpflags |= TMPF_TCP_FALLBACK; soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS)); } /* * Keepalive helper function */ boolean_t mptcp_ok_to_keepalive(struct mptcb *mp_tp) { boolean_t ret = 1; socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) { ret = 0; } return ret; } /* * MPTCP t_maxseg adjustment function */ int mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc) { int mss_lower = 0; struct mptcb *mp_tp = tptomptp(tp); #define MPTCP_COMPUTE_LEN { \ mss_lower = sizeof (struct mptcp_dss_ack_opt); \ if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \ mss_lower += 2; \ else \ /* adjust to 32-bit boundary + EOL */ \ mss_lower += 2; \ } if (mp_tp == NULL) { return 0; } socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte)); /* * For the first subflow and subsequent subflows, adjust mss for * most common MPTCP option size, for case where tcp_mss is called * during option processing and MTU discovery. 
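 *
 * Note that both branches of MPTCP_COMPUTE_LEN above add the same two
 * bytes on top of the DSS+ACK option: the DSS checksum when
 * MPTCPF_CHECKSUM is set, padding to a 32-bit boundary plus EOL
 * otherwise.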
*/ if (!mtudisc) { if (tp->t_mpflags & TMPF_MPTCP_TRUE && !(tp->t_mpflags & TMPF_JOINED_FLOW)) { MPTCP_COMPUTE_LEN; } if (tp->t_mpflags & TMPF_PREESTABLISHED && tp->t_mpflags & TMPF_SENT_JOIN) { MPTCP_COMPUTE_LEN; } } else { if (tp->t_mpflags & TMPF_MPTCP_TRUE) { MPTCP_COMPUTE_LEN; } } return mss_lower; } static void fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts) { struct inpcb *inp; tcp_getconninfo(so, &flow->flow_ci); inp = sotoinpcb(so); if ((inp->inp_vflag & INP_IPV6) != 0) { flow->flow_src.ss_family = AF_INET6; flow->flow_dst.ss_family = AF_INET6; flow->flow_src.ss_len = sizeof(struct sockaddr_in6); flow->flow_dst.ss_len = sizeof(struct sockaddr_in6); SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport; SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport; SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr; SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr; } else if ((inp->inp_vflag & INP_IPV4) != 0) { flow->flow_src.ss_family = AF_INET; flow->flow_dst.ss_family = AF_INET; flow->flow_src.ss_len = sizeof(struct sockaddr_in); flow->flow_dst.ss_len = sizeof(struct sockaddr_in); SIN(&flow->flow_src)->sin_port = inp->inp_lport; SIN(&flow->flow_dst)->sin_port = inp->inp_fport; SIN(&flow->flow_src)->sin_addr = inp->inp_laddr; SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr; } flow->flow_len = sizeof(*flow); flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci); flow->flow_flags = mpts->mpts_flags; flow->flow_cid = mpts->mpts_connid; flow->flow_relseq = mpts->mpts_rel_seq; flow->flow_soerror = mpts->mpts_socket->so_error; flow->flow_probecnt = mpts->mpts_probecnt; } static int mptcp_pcblist SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) int error = 0, f; size_t len; struct mppcb *mpp; struct mptses *mpte; struct mptcb *mp_tp; struct mptsub *mpts; struct socket *so; conninfo_mptcp_t mptcpci; mptcp_flow_t *flows = NULL; if (req->newptr != USER_ADDR_NULL) { return EPERM; } lck_mtx_lock(&mtcbinfo.mppi_lock); if (req->oldptr == USER_ADDR_NULL) { size_t n = mtcbinfo.mppi_count; lck_mtx_unlock(&mtcbinfo.mppi_lock); req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) + 4 * (n + n / 8) * sizeof(mptcp_flow_t); return 0; } TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { flows = NULL; socket_lock(mpp->mpp_socket, 1); VERIFY(mpp->mpp_flags & MPP_ATTACHED); mpte = mptompte(mpp); socket_lock_assert_owned(mptetoso(mpte)); mp_tp = mpte->mpte_mptcb; bzero(&mptcpci, sizeof(mptcpci)); mptcpci.mptcpci_state = mp_tp->mpt_state; mptcpci.mptcpci_flags = mp_tp->mpt_flags; mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken; mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken; mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat; mptcpci.mptcpci_snduna = mp_tp->mpt_snduna; mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt; mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax; mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn; mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd; mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt; mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt; mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn; mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd; mptcpci.mptcpci_nflows = mpte->mpte_numflows; mptcpci.mptcpci_mpte_flags = mpte->mpte_flags; mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last; mptcpci.mptcpci_flow_offset = offsetof(conninfo_mptcp_t, mptcpci_flows); len = sizeof(*flows) * mpte->mpte_numflows; if (mpte->mpte_numflows != 0) { flows = kalloc_data(len, Z_WAITOK | Z_ZERO); if (flows == NULL) { socket_unlock(mpp->mpp_socket, 1); break; } mptcpci.mptcpci_len = sizeof(mptcpci) + 
sizeof(*flows) * (mptcpci.mptcpci_nflows - 1); error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci) - sizeof(mptcp_flow_t)); } else { mptcpci.mptcpci_len = sizeof(mptcpci); error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci)); } if (error) { socket_unlock(mpp->mpp_socket, 1); kfree_data(flows, len); break; } f = 0; TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { so = mpts->mpts_socket; fill_mptcp_subflow(so, &flows[f], mpts); f++; } socket_unlock(mpp->mpp_socket, 1); if (flows) { error = SYSCTL_OUT(req, flows, len); kfree_data(flows, len); if (error) { break; } } } lck_mtx_unlock(&mtcbinfo.mppi_lock); return error; } SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t", "List of active MPTCP connections"); /* * Set notsent lowat mark on the MPTCB */ int mptcp_set_notsent_lowat(struct mptses *mpte, int optval) { struct mptcb *mp_tp = NULL; int error = 0; if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) { mp_tp = mpte->mpte_mptcb; } if (mp_tp) { mp_tp->mpt_notsent_lowat = optval; } else { error = EINVAL; } return error; } u_int32_t mptcp_get_notsent_lowat(struct mptses *mpte) { struct mptcb *mp_tp = NULL; if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) { mp_tp = mpte->mpte_mptcb; } if (mp_tp) { return mp_tp->mpt_notsent_lowat; } else { return 0; } } int mptcp_notsent_lowat_check(struct socket *so) { struct mptses *mpte; struct mppcb *mpp; struct mptcb *mp_tp; struct mptsub *mpts; int notsent = 0; mpp = mpsotomppcb(so); if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { return 0; } mpte = mptompte(mpp); socket_lock_assert_owned(mptetoso(mpte)); mp_tp = mpte->mpte_mptcb; notsent = so->so_snd.sb_cc; if ((notsent == 0) || ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <= mp_tp->mpt_notsent_lowat)) { return 1; } /* When Nagle's algorithm is not disabled, it is better * to wake up the client even before there is at least one * maxseg of data to write.
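 *
 * Sketch of the check below: on the active subflow, when Nagle is in
 * effect (TF_NODELAY clear) and the unsent backlog is non-empty but at
 * most one t_maxseg, the socket still reports as writable.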
*/ TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { int retval = 0; if (mpts->mpts_flags & MPTSF_ACTIVE) { struct socket *subf_so = mpts->mpts_socket; struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so)); notsent = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una); if ((tp->t_flags & TF_NODELAY) == 0 && notsent > 0 && (notsent <= (int)tp->t_maxseg)) { retval = 1; } return retval; } } return 0; } static errno_t mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, void **unitinfo) { #pragma unused(kctlref, sac, unitinfo) if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) { os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__); } mptcp_kern_skt_unit = sac->sc_unit; return 0; } static void mptcp_allow_uuid(uuid_t uuid, int32_t rssi) { struct mppcb *mpp; /* Iterate over all MPTCP connections */ lck_mtx_lock(&mtcbinfo.mppi_lock); TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { struct socket *mp_so = mpp->mpp_socket; struct mptses *mpte = mpp->mpp_pcbe; socket_lock(mp_so, 1); if (mp_so->so_flags & SOF_DELEGATED && uuid_compare(uuid, mp_so->e_uuid)) { goto next; } else if (!(mp_so->so_flags & SOF_DELEGATED) && uuid_compare(uuid, mp_so->last_uuid)) { goto next; } os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi); mpte->mpte_flags |= MPTE_ACCESS_GRANTED; if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) { mpte->mpte_flags |= MPTE_CELL_PROHIBITED; } mptcp_check_subflows_and_add(mpte); mptcp_remove_subflows(mpte); mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED); next: socket_unlock(mp_so, 1); } lck_mtx_unlock(&mtcbinfo.mppi_lock); } static void mptcp_wifi_status_changed(void) { struct mppcb *mpp; /* Iterate over all MPTCP connections */ lck_mtx_lock(&mtcbinfo.mppi_lock); TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) { struct socket *mp_so = mpp->mpp_socket; struct mptses *mpte = mpp->mpp_pcbe; socket_lock(mp_so, 1); /* Only handover- and urgency-mode are purely driven by Symptoms' Wi-Fi status */ if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER && mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER && mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) { goto next; } mptcp_check_subflows_and_add(mpte); mptcp_check_subflows_and_remove(mpte); next: socket_unlock(mp_so, 1); } lck_mtx_unlock(&mtcbinfo.mppi_lock); } struct mptcp_uuid_search_info { uuid_t target_uuid; proc_t found_proc; boolean_t is_proc_found; }; static int mptcp_find_proc_filter(proc_t p, void *arg) { struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg; int found; if (info->is_proc_found) { return 0; } /* * uuid_compare returns 0 if the uuids match, but the proc-filter * expects != 0 for a matching filter.
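 * The `== 0` comparison below inverts that, turning "uuids equal" into
 * the non-zero value the filter expects.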
*/ found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0; if (found) { info->is_proc_found = true; } return found; } static int mptcp_find_proc_callout(proc_t p, void * arg) { struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg; if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) { info->found_proc = p; return PROC_CLAIMED_DONE; } return PROC_RETURNED; } static proc_t mptcp_find_proc(const uuid_t uuid) { struct mptcp_uuid_search_info info; uuid_copy(info.target_uuid, uuid); info.found_proc = PROC_NULL; info.is_proc_found = false; proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info, mptcp_find_proc_filter, &info); return info.found_proc; } void mptcp_ask_symptoms(struct mptses *mpte) { struct mptcp_symptoms_ask_uuid ask; struct socket *mp_so; struct proc *p = PROC_NULL; int pid, prio, err; if (mptcp_kern_skt_unit == 0) { os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); return; } mp_so = mptetoso(mpte); if (mp_so->so_flags & SOF_DELEGATED) { if (mpte->mpte_epid != 0) { p = proc_find(mpte->mpte_epid); if (p != PROC_NULL) { /* We found a pid, check its UUID */ if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) { /* It's not the same - we need to look for the real proc */ proc_rele(p); p = PROC_NULL; } } } if (p == PROC_NULL) { p = mptcp_find_proc(mp_so->e_uuid); if (p == PROC_NULL) { uuid_string_t uuid_string; uuid_unparse(mp_so->e_uuid, uuid_string); os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string); return; } mpte->mpte_epid = proc_pid(p); } pid = mpte->mpte_epid; uuid_copy(ask.uuid, mp_so->e_uuid); } else { pid = mp_so->last_pid; p = proc_find(pid); if (p == PROC_NULL) { os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid); return; } uuid_copy(ask.uuid, mp_so->last_uuid); } ask.cmd = MPTCP_SYMPTOMS_ASK_UUID; prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE); if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION || prio == TASK_DARWINBG_APPLICATION) { ask.priority = MPTCP_SYMPTOMS_BACKGROUND; } else if (prio == TASK_FOREGROUND_APPLICATION) { ask.priority = MPTCP_SYMPTOMS_FOREGROUND; } else { ask.priority = MPTCP_SYMPTOMS_UNKNOWN; } err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit, &ask, sizeof(ask), CTL_DATA_EOR); os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err); proc_rele(p); } static errno_t mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo) { #pragma unused(kctlref, kcunit, unitinfo) OSDecrementAtomic(&mptcp_kern_skt_inuse); return 0; } static errno_t mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, int flags) { #pragma unused(kctlref, unitinfo, flags) symptoms_advisory_t *sa = NULL; if (kcunit != mptcp_kern_skt_unit) { os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n", __func__, kcunit, mptcp_kern_skt_unit); } if (mbuf_pkthdr_len(m) < sizeof(*sa)) { mbuf_freem(m); return EINVAL; } if (mbuf_len(m) < sizeof(*sa)) { os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n", __func__, mbuf_len(m), sizeof(*sa)); mbuf_freem(m); return EINVAL; } sa = mbuf_data(m); if (sa->sa_nwk_status 
!= SYMPTOMS_ADVISORY_USEAPP) { os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status, sa->sa_cell_status, mptcp_advisory.sa_cell_status); if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) { mptcp_advisory.sa_wifi_status = sa->sa_wifi_status; mptcp_wifi_status_changed(); } } else { struct mptcp_symptoms_answer answer; errno_t err; /* We temporarily allow different sizes for ease of submission */ if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) && mbuf_len(m) != sizeof(answer)) { os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n", __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa), sizeof(answer)); mbuf_freem(m); return EINVAL; } memset(&answer, 0, sizeof(answer)); err = mbuf_copydata(m, 0, mbuf_len(m), &answer); if (err) { os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err); mbuf_freem(m); return err; } mptcp_allow_uuid(answer.uuid, answer.rssi); } mbuf_freem(m); return 0; } void mptcp_control_register(void) { /* Set up the advisory control socket */ struct kern_ctl_reg mptcp_kern_ctl; bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl)); strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(mptcp_kern_ctl.ctl_name)); mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect; mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect; mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send; mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED; (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref); } mptcp_wifi_quality_t mptcp_wifi_quality_for_session(struct mptses *mpte) { if (mpte->mpte_flags & MPTE_FIRSTPARTY) { if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER && mptcp_advisory.sa_wifi_status) { return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD; } /* * If it's a first-party app and we don't have any info * about the Wi-Fi state, let's be pessimistic. */ return MPTCP_WIFI_QUALITY_UNSURE; } else { if (symptoms_is_wifi_lossy()) { return MPTCP_WIFI_QUALITY_BAD; } /* * If we are target-based (meaning we allow ourselves to be more * lax about when Wi-Fi is considered bad), we only *know* about * the state once we have received the allowance from Symptoms * (MPTE_ACCESS_GRANTED). * * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then * be set. * * In any other case (while in target-mode), consider Wi-Fi bad * and we are going to ask Symptoms for allowance anyway. */ if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) { if (mpte->mpte_flags & MPTE_ACCESS_GRANTED && mpte->mpte_flags & MPTE_CELL_PROHIBITED) { return MPTCP_WIFI_QUALITY_GOOD; } return MPTCP_WIFI_QUALITY_BAD; } return MPTCP_WIFI_QUALITY_GOOD; } } boolean_t symptoms_is_wifi_lossy(void) { return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ?
false : true; } int mptcp_freeq(struct mptcb *mp_tp) { struct tseg_qent *q; int rv = 0; int count = 0; while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); zfree(tcp_reass_zone, q); count++; rv = 1; } mp_tp->mpt_reassqlen = 0; if (count > 0) { OSAddAtomic(-count, &mptcp_reass_total_qlen); } return rv; } static int mptcp_post_event(u_int32_t event_code, int value) { struct kev_mptcp_data event_data; struct kev_msg ev_msg; memset(&ev_msg, 0, sizeof(ev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS; ev_msg.event_code = event_code; event_data.value = value; ev_msg.dv[0].data_ptr = &event_data; ev_msg.dv[0].data_length = sizeof(event_data); return kev_post_msg(&ev_msg); } static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts) { struct tcpcb *tp = sototcpcb(mpts->mpts_socket); int error; /* First-party apps (Siri) don't flip the cellicon */ if (mpte->mpte_flags & MPTE_FIRSTPARTY) { return; } /* Subflow is disappearing - don't set it on this one */ if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) { return; } /* Fallen-back connections do not trigger the cellicon */ if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { return; } /* Remember the last time we set the cellicon. Needed for debouncing */ mpte->mpte_last_cellicon_set = tcp_now; tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE); tcp_sched_timers(tp); if (mpts->mpts_flags & MPTSF_CELLICON_SET && mpte->mpte_cellicon_increments != 0) { if (mptcp_cellicon_refcount == 0) { os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments); /* Continue, so that the icon gets set... */ } else { /* * In this case, the cellicon is already set. No need to bump it * even higher */ return; } } /* When tearing down this subflow, we need to decrement the * reference counter */ mpts->mpts_flags |= MPTSF_CELLICON_SET; /* Track it in this counter, so that when a session gets destroyed we * can decrement the reference counter by whatever is left */ mpte->mpte_cellicon_increments++; if (OSIncrementAtomic(&mptcp_cellicon_refcount)) { /* If cellicon is already set, get out of here! */ return; } error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1); if (error) { os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error); } else { os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); } } void mptcp_clear_cellicon(void) { int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0); if (error) { os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n", __func__, error); } else { os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n", __func__); } } /* * Returns true if the icon has been flipped to WiFi. */ static boolean_t __mptcp_unset_cellicon(uint32_t val) { VERIFY(val < INT32_MAX); if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) { return false; } mptcp_clear_cellicon(); return true; } void mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val) { /* First-party apps (Siri) don't flip the cellicon */ if (mpte->mpte_flags & MPTE_FIRSTPARTY) { return; } if (mpte->mpte_cellicon_increments == 0) { /* This flow never used cell - get out of here!
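	 * (mpte_cellicon_increments counts how often this session bumped the
	 * global mptcp_cellicon_refcount, so zero means there is nothing to
	 * undo.)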
*/ return; } if (mptcp_cellicon_refcount == 0) { os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments); return; } if (mpts) { if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) { return; } mpts->mpts_flags &= ~MPTSF_CELLICON_SET; } if (mpte->mpte_cellicon_increments < val) { os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val); val = mpte->mpte_cellicon_increments; } mpte->mpte_cellicon_increments -= val; if (__mptcp_unset_cellicon(val) == false) { return; } /* All flows are gone - our counter should be at zero too! */ if (mpte->mpte_cellicon_increments != 0) { os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments); } } void mptcp_reset_rexmit_state(struct tcpcb *tp) { struct mptsub *mpts; struct inpcb *inp; struct socket *so; inp = tp->t_inpcb; if (inp == NULL) { return; } so = inp->inp_socket; if (so == NULL) { return; } if (!(so->so_flags & SOF_MP_SUBFLOW)) { return; } mpts = tp->t_mpsub; mpts->mpts_flags &= ~MPTSF_WRITE_STALL; so->so_flags &= ~SOF_MP_TRYFAILOVER; } void mptcp_reset_keepalive(struct tcpcb *tp) { struct mptsub *mpts = tp->t_mpsub; mpts->mpts_flags &= ~MPTSF_READ_STALL; } static struct mppcb * mtcp_alloc(void) { return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp; } static void mtcp_free(struct mppcb *mpp) { struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp); kfree_type(struct mpp_mtp, mtp); } /* * Protocol pr_init callback. */ void mptcp_init(struct protosw *pp, struct domain *dp) { #pragma unused(dp) static int mptcp_initialized = 0; struct protosw *prp; struct ip6protosw *prp6; VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); /* do this only once */ if (mptcp_initialized) { return; } mptcp_initialized = 1; mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK; /* * Since PF_MULTIPATH gets initialized after PF_INET/INET6, * we must be able to find IPPROTO_TCP entries for both. */ prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM); VERIFY(prp != NULL); bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp)); bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs, sizeof(mptcp_subflow_usrreqs)); mptcp_subflow_protosw.pr_entry.tqe_next = NULL; mptcp_subflow_protosw.pr_entry.tqe_prev = NULL; mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs; mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive; mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend; mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp; /* * Socket filters shouldn't attach/detach to/from this protosw * since pr_protosw is to be used instead, which points to the * real protocol; if they do, it is a bug and we should panic. 
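 * The 0xdeadbeefdeadbeef poison stored below turns any such attach or
 * detach into an immediate fault instead of silent corruption.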
*/ mptcp_subflow_protosw.pr_filter_head.tqh_first = (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef; mptcp_subflow_protosw.pr_filter_head.tqh_last = (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef; prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6, IPPROTO_TCP, SOCK_STREAM); VERIFY(prp6 != NULL); bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6)); bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6, sizeof(mptcp_subflow_usrreqs6)); mptcp_subflow_protosw6.pr_entry.tqe_next = NULL; mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL; mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6; mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive; mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend; mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp; /* * Socket filters shouldn't attach/detach to/from this protosw * since pr_protosw is to be used instead, which points to the * real protocol; if they do, it is a bug and we should panic. */ mptcp_subflow_protosw6.pr_filter_head.tqh_first = (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef; mptcp_subflow_protosw6.pr_filter_head.tqh_last = (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef; bzero(&mtcbinfo, sizeof(mtcbinfo)); TAILQ_INIT(&mtcbinfo.mppi_pcbs); mtcbinfo.mppi_alloc = mtcp_alloc; mtcbinfo.mppi_free = mtcp_free; mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL); lck_attr_setdefault(&mtcbinfo.mppi_lock_attr); lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp, &mtcbinfo.mppi_lock_attr); mtcbinfo.mppi_gc = mptcp_gc; mtcbinfo.mppi_timer = mptcp_timer; /* attach to MP domain for garbage collection to take place */ mp_pcbinfo_attach(&mtcbinfo); mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp"); }