/* gems-kernel/source/THIRDPARTY/xnu/bsd/dev/arm64/cpu_in_cksum.s */

/*
* Copyright (c) 2012-2018 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__),
* with __arm64__ tagged ARM64_TODO. This revision is optimized based on the
* 64-bit code in netinet/cpu_in_cksum.c.
*
* cclee - CoreOS - Vector & Numerics. 06/20/2012.
*/
#ifdef KERNEL
#define CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */
/*
* XXX: adi@apple.com:
*
* Ugly, but we have little choice, since relying on genassym and <assym.s>
* is not possible unless this code lives in osfmk. Note also that this
* routine expects an "mbuf-like" argument; the mbuf need not be authentic,
* since only 3 of its fields are used (see the offsets below).
*/
#if defined(__LP64__)
#define M_NEXT 0
#define M_DATA 16 // 8-byte pointer, aligned to an 8-byte boundary
#define M_LEN 24
#else
#define M_NEXT 0
#define M_DATA 8
#define M_LEN 12
#endif
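/*
* For reference, a minimal sketch (not the real sys/mbuf.h definition) of
* the "mbuf-like" layout the offsets above imply. In an authentic mbuf the
* slot between m_next and m_data is m_nextpkt, which this routine never
* touches:
*
* struct mbuf_like {
*     struct mbuf_like *m_next;    // M_NEXT: next mbuf in the chain
*     struct mbuf_like *m_nextpkt; // skipped; padding as far as this
*                                  // routine is concerned
*     uint8_t *m_data;             // M_DATA: start of the data
*     int32_t m_len;               // M_LEN:  valid bytes at m_data
* };
*/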
.globl _os_cpu_in_cksum_mbuf
.text
.align 4
_os_cpu_in_cksum_mbuf:
/*
* 64-bit version.
*
* This function returns the partial 16-bit checksum accumulated in
* a 32-bit variable (without 1's complement); the caller is responsible
* for folding the 32-bit sum into 16 bits and performing the 1's
* complement if applicable.
*/
/*
* uint32_t
* os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
* {
* int mlen;
* uint64_t sum, partial;
* unsigned int final_acc;
* uint8_t *data;
* boolean_t needs_swap, started_on_odd;
*
* VERIFY(len >= 0);
* VERIFY(off >= 0);
*
* needs_swap = FALSE;
* started_on_odd = FALSE;
* sum = initial_sum;
*/
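/*
* A sketch of how a caller might finish the job per the contract above
* (illustrative only; the folds are harmless if the value already fits
* in 16 bits):
*
* uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, initial_sum);
* sum = (sum >> 16) + (sum & 0xffff); // fold 32 bits into 17
* sum = (sum >> 16) + (sum & 0xffff); // absorb the possible carry
* uint16_t cksum = (uint16_t)(~sum & 0xffff); // 1's complement, if needed
*/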
#define m x0
#define len x1
#define off x2
#define sum x3
#define needs_swap x4
#define started_on_odd x5
#define mlen x6
#define Wmlen w6
#define t x7
#define data x8
#if defined(__LP64__)
#define ptr_m x0
#define ptr_data x8
#else
#define ptr_m w0
#define ptr_data w8
#endif
mov needs_swap, #0 // needs_swap = FALSE;
mov started_on_odd, #0 // started_on_odd = FALSE;
mov w3, w3 // zero-extend initial_sum (clear the upper 32 bits of x3)
/*
* for (;;) {
* if (PREDICT_FALSE(m == NULL)) {
* CKSUM_ERR("%s: out of data\n", __func__);
* return (-1);
* }
* mlen = m->m_len;
* if (mlen > off) {
* mlen -= off;
* data = mtod(m, uint8_t *) + off;
* goto post_initial_offset;
* }
* off -= mlen;
* if (len == 0)
* break;
* m = m->m_next;
* }
*/
0:
cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
cmp mlen, off
b.le 1f
ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
sub mlen, mlen, off // mlen -= off;
add data, data, off // data = mtod(m, uint8_t *) + off;
b L_post_initial_offset
1:
sub off, off, mlen
cbnz len, 2f
mov x0, x3 // len == 0: nothing left to checksum, return sum
ret lr
2:
ldr ptr_m, [m, #M_NEXT]
b 0b
L_loop: // for (; len > 0; m = m->m_next) {
/*
* if (PREDICT_FALSE(m == NULL)) {
* CKSUM_ERR("%s: out of data\n", __func__);
* return (-1);
* }
* mlen = m->m_len;
* data = mtod(m, uint8_t *);
*/
cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
L_post_initial_offset:
/*
* if (mlen == 0) continue;
* if (mlen > len) mlen = len;
* len -= mlen;
*/
cbz mlen, L_continue
cmp mlen, len
csel mlen, mlen, len, le // mlen = min(mlen, len)
sub len, len, mlen
/*
* partial = 0;
* if ((uintptr_t)data & 1) {
* started_on_odd = !started_on_odd;
* partial = *data << 8;
* ++data;
* --mlen;
* }
* needs_swap = started_on_odd;
*/
tst data, #1
mov x7, #0 // partial = 0
mov x10, #0 // x10 = 0, zero operand for adc below
b.eq 1f
ldrb w7, [data], #1 // w7 = *data++
eor started_on_odd, started_on_odd, #1 // started_on_odd = !started_on_odd
sub mlen, mlen, #1 // --mlen
lsl w7, w7, #8 // partial = *data << 8
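/*
* Why the deferred swap works: in a 1's complement sum, swapping the two
* bytes of every 16-bit addend merely swaps the bytes of the result,
* because the end-around carry commutes with the swap (cf. RFC 1071).
* started_on_odd tracks byte parity across the whole chain; when a chunk
* lands on odd parity it is summed as if it were even-aligned, with the
* first byte pre-shifted into the high half of its word here and the
* partial byte-rotated once at L0_bytes to compensate.
*/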
1:
/*
* if ((uintptr_t)data & 2) {
* if (mlen < 2)
* goto trailing_bytes;
* partial += *(uint16_t *)(void *)data;
* data += 2;
* mlen -= 2;
* }
*/
tst data, #2
mov needs_swap, started_on_odd
b.eq 1f
cmp mlen, #2
b.lt L_trailing_bytes
ldrh w9, [data], #2
sub mlen, mlen, #2
add w7, w7, w9
1:
/*
* if ((uintptr_t)data & 4) {
* if (mlen < 4)
* goto L2_bytes;
* partial += *(uint32_t *)(void *)data;
* data += 4;
* mlen -= 4;
* }
*/
// align data on an 8-byte boundary if applicable
tst data, #4
b.eq 1f
cmp mlen, #4
b.lt L2_bytes
ldr w9, [data], #4
sub mlen, mlen, #4
adds w7, w7, w9
adc x7, x7, x10 // assumes x10 still is #0 as set above
1:
/*
* while (mlen >= 64) {
* __builtin_prefetch(data + 32);
* __builtin_prefetch(data + 64);
* partial += *(uint32_t *)(void *)data;
* partial += *(uint32_t *)(void *)(data + 4);
* partial += *(uint32_t *)(void *)(data + 8);
* partial += *(uint32_t *)(void *)(data + 12);
* partial += *(uint32_t *)(void *)(data + 16);
* partial += *(uint32_t *)(void *)(data + 20);
* partial += *(uint32_t *)(void *)(data + 24);
* partial += *(uint32_t *)(void *)(data + 28);
* partial += *(uint32_t *)(void *)(data + 32);
* partial += *(uint32_t *)(void *)(data + 36);
* partial += *(uint32_t *)(void *)(data + 40);
* partial += *(uint32_t *)(void *)(data + 44);
* partial += *(uint32_t *)(void *)(data + 48);
* partial += *(uint32_t *)(void *)(data + 52);
* partial += *(uint32_t *)(void *)(data + 56);
* partial += *(uint32_t *)(void *)(data + 60);
* data += 64;
* mlen -= 64;
* // if (PREDICT_FALSE(partial & (3ULL << 62))) {
* // if (needs_swap)
* // partial = (partial << 8) +
* // (partial >> 56);
* // sum += (partial >> 32);
* // sum += (partial & 0xffffffff);
* // partial = 0;
* // }
* }
*/
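/*
* The intermediate fold above is carried over from the C version but left
* commented out: the NEON loop below accumulates into 64-bit lanes, and
* with mlen a 32-bit length each lane absorbs at most on the order of 2^25
* pairwise 33-bit sums, which cannot overflow 64 bits, so a single fold at
* the end suffices.
*/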
// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
subs mlen, mlen, #64
b.lt L32_bytes
// save used vector registers
sub sp, sp, #8*16
mov x11, sp
st1.4s {v0, v1, v2, v3}, [x11], #4*16
st1.4s {v4, v5, v6, v7}, [x11], #4*16
// set up 8 64-bit accumulators in v0-v3; partial seeds v3 (lane 0)
fmov s3, w7
eor.16b v0, v0, v0
eor.16b v1, v1, v1
eor.16b v2, v2, v2
// load the 1st 64 bytes (16 32-bit words)
ld1.4s {v4,v5,v6,v7},[data],#64
// branch to finish off if mlen<64
subs mlen, mlen, #64
b.lt L64_finishup
/*
* loop for loading and accumulating 16 32-bit words into
* 8 8-byte accumulators per iteration.
*/
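/*
* uadalp.2d vN, vM is "unsigned add and accumulate long pairwise": each
* adjacent pair of 32-bit lanes in vM is summed into a 64-bit value and
* added to the corresponding 64-bit lane of vN, so v0-v3 serve as eight
* independent 64-bit accumulators while the next 64 bytes stream in.
*/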
L64_loop:
subs mlen, mlen, #64 // mlen -= 64
uadalp.2d v0, v4
ld1.4s {v4},[data], #16
uadalp.2d v1, v5
ld1.4s {v5},[data], #16
uadalp.2d v2, v6
ld1.4s {v6},[data], #16
uadalp.2d v3, v7
ld1.4s {v7},[data], #16
b.ge L64_loop
L64_finishup:
uadalp.2d v0, v4
uadalp.2d v1, v5
uadalp.2d v2, v6
uadalp.2d v3, v7
add.2d v0, v0, v1 // lanewise: v0 += v1
add.2d v2, v2, v3 // lanewise: v2 += v3
addp.2d d0, v0 // d0 = v0[0] + v0[1]
addp.2d d2, v2 // d2 = v2[0] + v2[1]
add.2d v0, v0, v2 // v0[0] = d0 + d2
fmov x7, d0 // partial in x7 now
// restore used vector registers
ld1.4s {v0, v1, v2, v3}, [sp], #4*16
ld1.4s {v4, v5, v6, v7}, [sp], #4*16
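/*
* Scalar mop-up for the sub-64-byte remainder: each adds/adcs chain below
* is the 64-bit analogue of the 1's complement end-around carry; roughly,
* in C:
*
* partial += x;
* if (partial < x) // unsigned wrap-around: carry back in
*     partial++;
*/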
L32_bytes:
tst mlen, #32
b.eq L16_bytes
ldp x9, x10, [data], #16
ldp x11, x12, [data], #16
adds x7, x7, x9
mov x9, #0 // x9 = 0; mov leaves the carry flag intact
adcs x7, x7, x10
adcs x7, x7, x11
adcs x7, x7, x12
adc x7, x7, x9
L16_bytes:
tst mlen, #16
b.eq L8_bytes
ldp x9, x10, [data], #16
adds x7, x7, x9
mov x9, #0 // x9 = 0; mov leaves the carry flag intact
adcs x7, x7, x10
adc x7, x7, x9
L8_bytes:
tst mlen, #8
mov x10, #0 // x10 = 0 for the adc end-around carry below
b.eq L4_bytes
ldr x9,[data],#8
adds x7, x7, x9
adc x7, x7, x10
L4_bytes:
tst mlen, #4
b.eq L2_bytes
ldr w9,[data],#4
adds x7, x7, x9
adc x7, x7, x10
L2_bytes:
tst mlen, #2
b.eq L_trailing_bytes
ldrh w9,[data],#2
adds x7, x7, x9
adc x7, x7, x10
L_trailing_bytes:
tst mlen, #1
b.eq L0_bytes
ldrb w9,[data],#1
adds x7, x7, x9
adc x7, x7, x10
eor started_on_odd, started_on_odd, #1
L0_bytes:
/*
* if (needs_swap)
* partial = (partial << 8) + (partial >> 56);
*/
cbz needs_swap, 1f
ror x7, x7, #56 // partial = (partial << 8) + (partial >> 56)
1:
/*
* sum += (partial >> 32) + (partial & 0xffffffff);
* sum = (sum >> 32) + (sum & 0xffffffff);
* }
*/
add x3, x3, x7, lsr #32 // sum += partial >> 32
mov w7, w7 // partial &= 0xffffffff
add x3, x3, x7 // sum += partial & 0xffffffff
mov w7, w3 // w7 = sum & 0xffffffff
add x3, x7, x3, lsr #32 // sum = (sum >> 32) + (sum & 0xffffffff)
L_continue:
cmp len, #0
ldr ptr_m, [m, #M_NEXT] // m = m->m_next
b.gt L_loop
/*
* final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
* ((sum >> 16) & 0xffff) + (sum & 0xffff);
* final_acc = (final_acc >> 16) + (final_acc & 0xffff);
* final_acc = (final_acc >> 16) + (final_acc & 0xffff);
* return (final_acc & 0xffff);
* }
*/
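/*
* A worked example of the fold: sum = 0x000123456789abcd gives
* 0x0001 + 0x2345 + 0x6789 + 0xabcd = 0x1369c, then
* 0x369c + 0x1 = 0x369d; the second fold is a no-op here, so
* 0x369d is returned.
*/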
mov w4, #0x00ffff
and x0, x4, x3, lsr #48 // sum >> 48
and x1, x4, x3, lsr #32 // (sum >> 32) & 0xffff
and x2, x4, x3, lsr #16 // (sum >> 16) & 0xffff
and x3, x4, x3 // sum & 0xffff
add w0, w0, w1
add w2, w2, w3
add w0, w0, w2 // final_acc
and w1, w4, w0, lsr #16
and w0, w4, w0
add w0, w0, w1 // final_acc = (final_acc >> 16) + (final_acc & 0xffff)
and w1, w4, w0, lsr #16
and w0, w4, w0
add w0, w0, w1 // ... and once more to absorb any carry
/*
* If we were to 1's complement it (XOR with 0xffff):
*
* eor w0, w0, w4
*/
and w0, w0, w4
ret lr
Lin_cksum_whoops:
adrp x0, Lin_cksum_whoops_str@page
add x0, x0, Lin_cksum_whoops_str@pageoff
bl #CKSUM_ERR
mov x0, #-1
ret lr
Lin_cksum_whoops_str:
.asciz "os_cpu_in_cksum_mbuf: out of data\n"
.align 5