/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include /* XXX All of these should really be derived from syscall_sw.h */ #if defined (__x86_64__) #define SYSCALL_CLASS_SHIFT 24 #define SYSCALL_CLASS_MASK (0xFF << SYSCALL_CLASS_SHIFT) #define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK) #define I386_SYSCALL_NUMBER_MASK (0xFFFF) #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "systrace.h" #include #include #include #include #include #if defined (__x86_64__) #define SYSTRACE_ARTIFICIAL_FRAMES 2 #define MACHTRACE_ARTIFICIAL_FRAMES 3 #elif defined(__arm64__) #define SYSTRACE_ARTIFICIAL_FRAMES 2 #define MACHTRACE_ARTIFICIAL_FRAMES 3 #else #error Unknown Architecture #endif #define SYSTRACE_NARGS (int)(sizeof(((uthread_t)NULL)->uu_arg) / sizeof(((uthread_t)NULL)->uu_arg[0])) #define MACHTRACE_NARGS (int)(sizeof(struct mach_call_args) / sizeof(syscall_arg_t)) #include #define sy_callc sy_call /* Map Solaris slot name to Darwin's */ #define NSYSCALL nsysent /* and is less than 500 or so */ extern const char *syscallnames[]; #include #define casptr dtrace_casptr #define membar_enter dtrace_membar_producer #define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */ #define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */ static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock, &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */ systrace_sysent_t *systrace_sysent = NULL; void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int); static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *); void systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) { #pragma unused(id,arg0,arg1,arg2,arg3,arg4) } int32_t dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) { unsigned short code; /* The system call number */ systrace_sysent_t *sy; dtrace_id_t id; int32_t rval; syscall_arg_t *ip = (syscall_arg_t *)uap; uint64_t uargs[SYSTRACE_NARGS] = {0}; #if defined (__x86_64__) { pal_register_cache_state(current_thread(), VALID); x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); if (is_saved_state64(tagged_regs)) { x86_saved_state64_t *regs = saved_state64(tagged_regs); code = regs->rax & SYSCALL_NUMBER_MASK; /* * Check for indirect system call... system call number * passed as 'arg0' */ if (code == 0) { code = regs->rdi; } } else { code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK; if (code == 0) { vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof(int)); code = fuword(params); } } } #elif defined(__arm64__) { /* * On arm64, syscall numbers depend on a flavor (indirect or not) * ... and for u32 can be in either r0 or r12 * ... and for u64 can be in either x0 or x16 */ /* see bsd/dev/arm/systemcalls.c:arm_get_syscall_number */ arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread()); if (is_saved_state32(arm_regs)) { /* Check for indirect system call */ if (saved_state32(arm_regs)->r[12] != 0) { code = saved_state32(arm_regs)->r[12]; } else { code = saved_state32(arm_regs)->r[0]; } } else { /* Check for indirect system call */ if (saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM] != 0) { code = saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM]; } else { code = saved_state64(arm_regs)->x[0]; } } } #else #error Unknown Architecture #endif // Bounds "check" the value of code a la unix_syscall sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code]; systrace_args(code, ip, uargs); if ((id = sy->stsy_entry) != DTRACE_IDNONE) { uthread_t uthread = current_uthread(); if (uthread) { uthread->t_dtrace_syscall_args = uargs; } static_assert(SYSTRACE_NARGS >= 5, "not enough system call arguments"); (*systrace_probe)(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]); if (uthread) { uthread->t_dtrace_syscall_args = NULL; } } #if 0 /* XXX */ /* * APPLE NOTE: Not implemented. * We want to explicitly allow DTrace consumers to stop a process * before it actually executes the meat of the syscall. */ p = ttoproc(curthread); mutex_enter(&p->p_lock); if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) { curthread->t_dtrace_stop = 0; stop(PR_REQUESTED, 0); } mutex_exit(&p->p_lock); #endif rval = (*sy->stsy_underlying)(pp, uap, rv); if ((id = sy->stsy_return) != DTRACE_IDNONE) { uint64_t munged_rv0, munged_rv1; uthread_t uthread = current_uthread(); if (uthread) { uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */ } /* * "Decode" rv for use in the call to dtrace_probe() */ if (rval == ERESTART) { munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */ munged_rv1 = -1LL; } else if (rval != EJUSTRETURN) { if (rval) { munged_rv0 = -1LL; /* Mimic what libc will do. */ munged_rv1 = -1LL; } else { switch (sy->stsy_return_type) { case _SYSCALL_RET_INT_T: munged_rv0 = rv[0]; munged_rv1 = rv[1]; break; case _SYSCALL_RET_UINT_T: munged_rv0 = ((u_int)rv[0]); munged_rv1 = ((u_int)rv[1]); break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_UINT64_T: munged_rv0 = *(u_int64_t *)rv; munged_rv1 = 0LL; break; case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: munged_rv0 = *(user_addr_t *)rv; munged_rv1 = 0LL; break; case _SYSCALL_RET_NONE: munged_rv0 = 0LL; munged_rv1 = 0LL; break; default: munged_rv0 = 0LL; munged_rv1 = 0LL; break; } } } else { munged_rv0 = 0LL; munged_rv1 = 0LL; } /* * says: * * "This is a bit of an historical artifact. At first, the syscall provider just * had its return value in arg0, and the fbt and pid providers had their return * values in arg1 (so that we could use arg0 for the offset of the return site). * * We inevitably started writing scripts where we wanted to see the return * values from probes in all three providers, and we made this script easier * to write by replicating the syscall return values in arg1 to match fbt and * pid. We debated briefly about removing the return value from arg0, but * decided that it would be less confusing to have the same data in two places * than to have some non-helpful, non-intuitive value in arg0. * * This change was made 4/23/2003 according to the DTrace project's putback log." */ (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0); } return rval; } void dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv) { systrace_sysent_t *sy; dtrace_id_t id; // Bounds "check" the value of code a la unix_syscall_return sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code]; if ((id = sy->stsy_return) != DTRACE_IDNONE) { uint64_t munged_rv0, munged_rv1; uthread_t uthread = current_uthread(); if (uthread) { uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */ } /* * "Decode" rv for use in the call to dtrace_probe() */ if (rval == ERESTART) { munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */ munged_rv1 = -1LL; } else if (rval != EJUSTRETURN) { if (rval) { munged_rv0 = -1LL; /* Mimic what libc will do. */ munged_rv1 = -1LL; } else { switch (sy->stsy_return_type) { case _SYSCALL_RET_INT_T: munged_rv0 = rv[0]; munged_rv1 = rv[1]; break; case _SYSCALL_RET_UINT_T: munged_rv0 = ((u_int)rv[0]); munged_rv1 = ((u_int)rv[1]); break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_UINT64_T: munged_rv0 = *(u_int64_t *)rv; munged_rv1 = 0LL; break; case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: munged_rv0 = *(user_addr_t *)rv; munged_rv1 = 0LL; break; case _SYSCALL_RET_NONE: munged_rv0 = 0LL; munged_rv1 = 0LL; break; default: munged_rv0 = 0LL; munged_rv1 = 0LL; break; } } } else { munged_rv0 = 0LL; munged_rv1 = 0LL; } (*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0); } } #define SYSTRACE_SHIFT 16 #define SYSTRACE_ISENTRY(x) ((int)(x) >> SYSTRACE_SHIFT) #define SYSTRACE_SYSNUM(x) ((int)(x) & ((1 << SYSTRACE_SHIFT) - 1)) #define SYSTRACE_ENTRY(id) ((1 << SYSTRACE_SHIFT) | (id)) #define SYSTRACE_RETURN(id) (id) #if ((1 << SYSTRACE_SHIFT) <= NSYSCALL) #error 1 << SYSTRACE_SHIFT must exceed number of system calls #endif static dtrace_provider_id_t systrace_id; /* * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol. * See balanced undef below. */ #define systrace_init _systrace_init static void systrace_init(const struct sysent *actual, systrace_sysent_t **interposed) { systrace_sysent_t *ssysent = *interposed; /* Avoid sysent shadow warning * from bsd/sys/sysent.h */ unsigned int i; if (ssysent == NULL) { *interposed = ssysent = kmem_zalloc(sizeof(systrace_sysent_t) * NSYSCALL, KM_SLEEP); } for (i = 0; i < NSYSCALL; i++) { /* Use of volatile protects the if statement below from being optimized away */ const volatile struct sysent *a = &actual[i]; systrace_sysent_t *s = &ssysent[i]; if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) { continue; } if (a->sy_callc == dtrace_systrace_syscall) { continue; } s->stsy_underlying = a->sy_callc; s->stsy_return_type = a->sy_return_type; } } /*ARGSUSED*/ static void systrace_provide(void *arg, const dtrace_probedesc_t *desc) { #pragma unused(arg) /* __APPLE__ */ unsigned int i; if (desc != NULL) { return; } systrace_init(sysent, &systrace_sysent); for (i = 0; i < NSYSCALL; i++) { if (systrace_sysent[i].stsy_underlying == NULL) { continue; } if (dtrace_probe_lookup(systrace_id, NULL, syscallnames[i], "entry") != 0) { continue; } (void) dtrace_probe_create(systrace_id, NULL, syscallnames[i], "entry", SYSTRACE_ARTIFICIAL_FRAMES, (void *)((uintptr_t)SYSTRACE_ENTRY(i))); (void) dtrace_probe_create(systrace_id, NULL, syscallnames[i], "return", SYSTRACE_ARTIFICIAL_FRAMES, (void *)((uintptr_t)SYSTRACE_RETURN(i))); systrace_sysent[i].stsy_entry = DTRACE_IDNONE; systrace_sysent[i].stsy_return = DTRACE_IDNONE; } } #undef systrace_init /*ARGSUSED*/ static void systrace_destroy(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) /* __APPLE__ */ int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); #pragma unused(sysnum) /* __APPLE__ */ /* * There's nothing to do here but assert that we have actually been * disabled. */ if (SYSTRACE_ISENTRY((uintptr_t)parg)) { ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE); } else { ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); } } /*ARGSUSED*/ static int systrace_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE || systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE); if (SYSTRACE_ISENTRY((uintptr_t)parg)) { systrace_sysent[sysnum].stsy_entry = id; } else { systrace_sysent[sysnum].stsy_return = id; } if (enabled) { ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); return 0; } lck_mtx_lock(&dtrace_systrace_lock); if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) { /* It is not possible to write to sysent[] directly because it is const. */ vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall); ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); return 0; } /*ARGSUSED*/ static void systrace_disable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) /* __APPLE__ */ int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE || systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); if (disable) { /* * Usage of volatile protects the if statement below from being optimized away. * * Compilers are clever and know that const array values can't change in time * and the if below is always false. That is because it can't see that DTrace * injects dtrace_systrace_syscall dynamically and violates constness of the * array. */ volatile const struct sysent *syscallent = &sysent[sysnum]; lck_mtx_lock(&dtrace_systrace_lock); if (syscallent->sy_callc == dtrace_systrace_syscall) { ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&syscallent->sy_callc, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); } if (SYSTRACE_ISENTRY((uintptr_t)parg)) { systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE; } else { systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE; } } static dtrace_pattr_t systrace_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, }; static dtrace_pops_t systrace_pops = { .dtps_provide = systrace_provide, .dtps_provide_module = NULL, .dtps_enable = systrace_enable, .dtps_disable = systrace_disable, .dtps_suspend = NULL, .dtps_resume = NULL, .dtps_getargdesc = systrace_getargdesc, .dtps_getargval = systrace_getargval, .dtps_usermode = NULL, .dtps_destroy = systrace_destroy }; static int systrace_attach(dev_info_t *devi) { systrace_probe = (void*)&dtrace_probe; membar_enter(); if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE || dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL, &systrace_pops, NULL, &systrace_id) != 0) { systrace_probe = systrace_stub; ddi_remove_minor_node(devi, NULL); return DDI_FAILURE; } return DDI_SUCCESS; } /* * APPLE NOTE: systrace_detach not implemented */ #if !defined(__APPLE__) static int systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) { switch (cmd) { case DDI_DETACH: break; case DDI_SUSPEND: return DDI_SUCCESS; default: return DDI_FAILURE; } if (dtrace_unregister(systrace_id) != 0) { return DDI_FAILURE; } ddi_remove_minor_node(devi, NULL); systrace_probe = systrace_stub; return DDI_SUCCESS; } #endif /* __APPLE__ */ typedef kern_return_t (*mach_call_t)(void *); /* APPLE NOTE: From #include which may be changed for 64 bit! */ #if CONFIG_REQUIRES_U32_MUNGING typedef void mach_munge_t(void *); #elif __arm__ && (__BIGGEST_ALIGNMENT__ > 4) typedef int mach_munge_t(const void *, void *); #endif typedef struct { unsigned char mach_trap_arg_count; /* Number of trap arguments (Arch independant) */ unsigned char mach_trap_u32_words; /* number of 32-bit words to copyin for U32 */ unsigned char mach_trap_returns_port; unsigned char __mach_trap_padding; kern_return_t (*mach_trap_function)(void *); #if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4)) mach_munge_t *mach_trap_arg_munge32; /* system call argument munger routine for 32-bit */ #endif #if MACH_ASSERT const char *mach_trap_name; #endif /* MACH_ASSERT */ } mach_trap_t; #define MACH_TRAP_TABLE_COUNT 128 extern const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT]; extern const int mach_trap_count; extern const char * const mach_syscall_name_table[MACH_TRAP_TABLE_COUNT]; /* XXX From osfmk/i386/bsd_i386.c */ struct mach_call_args { syscall_arg_t arg1; syscall_arg_t arg2; syscall_arg_t arg3; syscall_arg_t arg4; syscall_arg_t arg5; syscall_arg_t arg6; syscall_arg_t arg7; syscall_arg_t arg8; syscall_arg_t arg9; }; #undef NSYSCALL #define NSYSCALL mach_trap_count #if ((1 << SYSTRACE_SHIFT) <= NSYSCALL) #error 1 << SYSTRACE_SHIFT must exceed number of Mach traps #endif typedef struct machtrace_sysent { dtrace_id_t stsy_entry; dtrace_id_t stsy_return; kern_return_t (*stsy_underlying)(void *); int32_t stsy_return_type; } machtrace_sysent_t; static machtrace_sysent_t *machtrace_sysent = NULL; void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int); static dtrace_provider_id_t machtrace_id; static kern_return_t dtrace_machtrace_syscall(struct mach_call_args *args) { int code; /* The mach call number */ machtrace_sysent_t *sy; dtrace_id_t id; kern_return_t rval; #if 0 /* XXX */ proc_t *p; #endif syscall_arg_t *ip = (syscall_arg_t *)args; mach_call_t mach_call; #if defined (__x86_64__) { pal_register_cache_state(current_thread(), VALID); x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); if (is_saved_state64(tagged_regs)) { code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK; } else { code = -saved_state32(tagged_regs)->eax; } } #elif defined(__arm64__) { /* From arm/thread_status.h:get_saved_state_svc_number */ arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread()); if (is_saved_state32(arm_regs)) { code = (int)saved_state32(arm_regs)->r[12]; } else { code = (int)saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM]; } /* From bsd/arm64.c:mach_syscall */ ASSERT(code < 0); /* Otherwise it would be a Unix syscall */ code = -code; } #else #error Unknown Architecture #endif sy = &machtrace_sysent[code]; if ((id = sy->stsy_entry) != DTRACE_IDNONE) { uthread_t uthread = current_uthread(); if (uthread) { uthread->t_dtrace_syscall_args = (void *)ip; } (*machtrace_probe)(id, *ip, *(ip + 1), *(ip + 2), *(ip + 3), *(ip + 4)); if (uthread) { uthread->t_dtrace_syscall_args = (void *)0; } } #if 0 /* XXX */ /* * APPLE NOTE: Not implemented. * We want to explicitly allow DTrace consumers to stop a process * before it actually executes the meat of the syscall. */ p = ttoproc(curthread); mutex_enter(&p->p_lock); if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) { curthread->t_dtrace_stop = 0; stop(PR_REQUESTED, 0); } mutex_exit(&p->p_lock); #endif mach_call = (mach_call_t)(*sy->stsy_underlying); rval = mach_call(args); if ((id = sy->stsy_return) != DTRACE_IDNONE) { (*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0); } return rval; } static void machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed) { machtrace_sysent_t *msysent = *interposed; int i; if (msysent == NULL) { *interposed = msysent = kmem_zalloc(sizeof(machtrace_sysent_t) * NSYSCALL, KM_SLEEP); } for (i = 0; i < NSYSCALL; i++) { const volatile mach_trap_t *a = &actual[i]; machtrace_sysent_t *s = &msysent[i]; if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) { continue; } if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) { continue; } s->stsy_underlying = a->mach_trap_function; } } /*ARGSUSED*/ static void machtrace_provide(void *arg, const dtrace_probedesc_t *desc) { #pragma unused(arg) /* __APPLE__ */ int i; if (desc != NULL) { return; } machtrace_init(mach_trap_table, &machtrace_sysent); for (i = 0; i < NSYSCALL; i++) { if (machtrace_sysent[i].stsy_underlying == NULL) { continue; } if (dtrace_probe_lookup(machtrace_id, NULL, mach_syscall_name_table[i], "entry") != 0) { continue; } (void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i], "entry", MACHTRACE_ARTIFICIAL_FRAMES, (void *)((uintptr_t)SYSTRACE_ENTRY(i))); (void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i], "return", MACHTRACE_ARTIFICIAL_FRAMES, (void *)((uintptr_t)SYSTRACE_RETURN(i))); machtrace_sysent[i].stsy_entry = DTRACE_IDNONE; machtrace_sysent[i].stsy_return = DTRACE_IDNONE; } } /*ARGSUSED*/ static void machtrace_destroy(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) /* __APPLE__ */ int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); #pragma unused(sysnum) /* __APPLE__ */ /* * There's nothing to do here but assert that we have actually been * disabled. */ if (SYSTRACE_ISENTRY((uintptr_t)parg)) { ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE); } else { ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); } } /*ARGSUSED*/ static int machtrace_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE || machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE); if (SYSTRACE_ISENTRY((uintptr_t)parg)) { machtrace_sysent[sysnum].stsy_entry = id; } else { machtrace_sysent[sysnum].stsy_return = id; } if (enabled) { ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall); return 0; } lck_mtx_lock(&dtrace_systrace_lock); if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) { /* It is not possible to write to mach_trap_table[] directly because it is const. */ vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall); ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); return 0; } /*ARGSUSED*/ static void machtrace_disable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) /* __APPLE__ */ int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE || machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); if (disable) { /* * Usage of volatile protects the if statement below from being optimized away. * * Compilers are clever and know that const array values can't change in time * and the if below is always false. That is because it can't see that DTrace * injects dtrace_machtrace_syscall dynamically and violates constness of the * array. */ volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum]; lck_mtx_lock(&dtrace_systrace_lock); if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) { ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&machtrap->mach_trap_function, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); } if (SYSTRACE_ISENTRY((uintptr_t)parg)) { machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE; } else { machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE; } } static dtrace_pattr_t machtrace_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, }; static dtrace_pops_t machtrace_pops = { .dtps_provide = machtrace_provide, .dtps_provide_module = NULL, .dtps_enable = machtrace_enable, .dtps_disable = machtrace_disable, .dtps_suspend = NULL, .dtps_resume = NULL, .dtps_getargdesc = NULL, .dtps_getargval = machtrace_getarg, .dtps_usermode = NULL, .dtps_destroy = machtrace_destroy }; static int machtrace_attach(dev_info_t *devi) { machtrace_probe = dtrace_probe; membar_enter(); if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE || dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL, &machtrace_pops, NULL, &machtrace_id) != 0) { machtrace_probe = (void*)&systrace_stub; ddi_remove_minor_node(devi, NULL); return DDI_FAILURE; } return DDI_SUCCESS; } d_open_t _systrace_open; int _systrace_open(dev_t dev, int flags, int devtype, struct proc *p) { #pragma unused(dev,flags,devtype,p) return 0; } #define SYSTRACE_MAJOR -24 /* let the kernel pick the device number */ static struct cdevsw systrace_cdevsw = { .d_open = _systrace_open, .d_close = eno_opcl, .d_read = eno_rdwrt, .d_write = eno_rdwrt, .d_ioctl = eno_ioctl, .d_stop = eno_stop, .d_reset = eno_reset, .d_select = eno_select, .d_mmap = eno_mmap, .d_strategy = eno_strat, .d_reserved_1 = eno_getc, .d_reserved_2 = eno_putc, }; void systrace_init( void ); void systrace_init( void ) { if (dtrace_sdt_probes_restricted()) { return; } int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw); if (majdevno < 0) { printf("systrace_init: failed to allocate a major number!\n"); return; } systrace_attach((dev_info_t*)(uintptr_t)majdevno); machtrace_attach((dev_info_t*)(uintptr_t)majdevno); } #undef SYSTRACE_MAJOR static uint64_t systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { #pragma unused(arg,id,parg,aframes) /* __APPLE__ */ uint64_t val = 0; uint64_t *uargs = NULL; uthread_t uthread = current_uthread(); if (uthread) { uargs = uthread->t_dtrace_syscall_args; } if (!uargs) { return 0; } if (argno < 0 || argno >= SYSTRACE_NARGS) { return 0; } DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); val = uargs[argno]; DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); return val; } static void systrace_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) { #pragma unused(arg, id) int sysnum = SYSTRACE_SYSNUM(parg); uthread_t uthread = current_uthread(); uint64_t *uargs = NULL; if (!uthread) { desc->dtargd_ndx = DTRACE_ARGNONE; return; } uargs = uthread->t_dtrace_syscall_args; if (SYSTRACE_ISENTRY((uintptr_t)parg)) { systrace_entry_setargdesc(sysnum, desc->dtargd_ndx, desc->dtargd_native, sizeof(desc->dtargd_native)); } else { systrace_return_setargdesc(sysnum, desc->dtargd_ndx, desc->dtargd_native, sizeof(desc->dtargd_native)); } if (desc->dtargd_native[0] == '\0') { desc->dtargd_ndx = DTRACE_ARGNONE; } } static uint64_t machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { #pragma unused(arg,id,parg,aframes) /* __APPLE__ */ uint64_t val = 0; syscall_arg_t *stack = (syscall_arg_t *)NULL; uthread_t uthread = current_uthread(); if (uthread) { stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args; } if (!stack) { return 0; } if (argno < 0 || argno >= MACHTRACE_NARGS) { return 0; } DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */ val = (uint64_t)*(stack + argno); DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); return val; }