mirror of
https://github.com/opnsense/src.git
synced 2026-02-17 09:39:26 -05:00
131 lines
2.9 KiB
ArmAsm
131 lines
2.9 KiB
ArmAsm
/*-
|
|
* Written by Mateusz Guzik <mjg@freebsd.org>
|
|
* Copyright (c) 2023 The FreeBSD Foundation
|
|
*
|
|
* Portions of this software were developed by Robert Clausecker
|
|
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
|
|
*
|
|
* Public domain.
|
|
*/
|
|
|
|
#include <machine/asm.h>
|
|
#include "amd64_archlevel.h"
|
|
|
|
/*
|
|
* Note: this routine was written with kernel use in mind (read: no simd),
|
|
* it is only present in userspace as a temporary measure until something
|
|
* better gets imported.
|
|
*/
|
|
|
|
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
|
|
|
|
ARCHFUNCS(strlen)
|
|
ARCHFUNC(strlen, scalar)
|
|
ARCHFUNC(strlen, baseline)
|
|
ENDARCHFUNCS(strlen)
|
|
|
|
/*
|
|
* strlen(string)
|
|
* %rdi
|
|
*
|
|
* Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
|
|
*
|
|
* 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
|
|
* with leaq.
|
|
*
|
|
* For a description see either:
|
|
* - "Hacker's Delight" by Henry S. Warren, Jr.
|
|
* - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
|
|
* by Agner Fog
|
|
*
|
|
* The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
|
|
*/
|
|
ARCHENTRY(strlen, scalar)
|
|
movabsq $0xfefefefefefefeff,%r8
|
|
movabsq $0x8080808080808080,%r9
|
|
|
|
movq %rdi,%r10
|
|
movq %rdi,%rcx
|
|
testb $7,%dil
|
|
jz 2f
|
|
|
|
/*
|
|
* Handle misaligned reads: align to 8 and fill
|
|
* the spurious bytes.
|
|
*/
|
|
andq $~7,%rdi
|
|
movq (%rdi),%r11
|
|
shlq $3,%rcx
|
|
movq $-1,%rdx
|
|
shlq %cl,%rdx
|
|
notq %rdx
|
|
orq %rdx,%r11
|
|
|
|
leaq (%r11,%r8),%rcx
|
|
notq %r11
|
|
andq %r11,%rcx
|
|
andq %r9,%rcx
|
|
jnz 3f
|
|
|
|
/*
|
|
* Main loop.
|
|
*/
|
|
ALIGN_TEXT
|
|
1:
|
|
leaq 8(%rdi),%rdi
|
|
2:
|
|
movq (%rdi),%r11
|
|
leaq (%r11,%r8),%rcx
|
|
notq %r11
|
|
andq %r11,%rcx
|
|
andq %r9,%rcx
|
|
jz 1b
|
|
3:
|
|
bsfq %rcx,%rcx
|
|
shrq $3,%rcx
|
|
leaq (%rcx,%rdi),%rax
|
|
subq %r10,%rax
|
|
ret
|
|
ARCHEND(strlen, scalar)
|
|
|
|
ARCHENTRY(strlen, baseline)
|
|
mov %rdi, %rcx
|
|
pxor %xmm1, %xmm1
|
|
and $~0xf, %rdi # align string
|
|
pcmpeqb (%rdi), %xmm1 # compare head (with junk before string)
|
|
mov %rcx, %rsi # string pointer copy for later
|
|
and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment
|
|
pmovmskb %xmm1, %eax
|
|
add $32, %rdi # advance to next iteration
|
|
shr %cl, %eax # clear out matches in junk bytes
|
|
test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible)
|
|
jnz 2f
|
|
|
|
ALIGN_TEXT
|
|
1: pxor %xmm1, %xmm1
|
|
pcmpeqb -16(%rdi), %xmm1 # find NUL bytes
|
|
pmovmskb %xmm1, %eax
|
|
test %eax, %eax # were any NUL bytes present?
|
|
jnz 3f
|
|
|
|
/* the same unrolled once more */
|
|
pxor %xmm1, %xmm1
|
|
pcmpeqb (%rdi), %xmm1
|
|
pmovmskb %xmm1, %eax
|
|
add $32, %rdi # advance to next iteration
|
|
test %eax, %eax
|
|
jz 1b
|
|
|
|
/* match found in loop body */
|
|
sub $16, %rdi # undo half the advancement
|
|
3: tzcnt %eax, %eax # find the first NUL byte
|
|
sub %rsi, %rdi # string length until beginning of (%rdi)
|
|
lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length
|
|
ret
|
|
|
|
/* match found in head */
|
|
2: tzcnt %eax, %eax # compute string length
|
|
ret
|
|
ARCHEND(strlen, baseline)
|
|
|
|
.section .note.GNU-stack,"",%progbits
|