mirror of
https://github.com/opnsense/src.git
synced 2026-02-27 03:40:37 -05:00
An overread condition in memccpy(dst, src, c, len) would occur if
the source buffer does not cross a 16-byte boundary and there is no
instance of c between *src and the next 16-byte boundary. This could
cause a read fault if src is just before the end of a page and the
next page is unmapped or unreadable.
The bug is a consequence of basing memccpy() on the strlcpy() code:
whereas strlcpy() assumes that src is a nul-terminated string and
hence a terminator is always present, c may not be present at all in
the source string. It was not caught earlier due to insufficient
unit test design.
As a part of the fix, the function is refactored such that the runt
case (buffer length from last alignment boundary between 1 and 32 B)
is handled separately. This reduces the number of conditional
branches on all code paths and simplifies the handling of early
matches in the non-runt case. Performance is improved slightly.
os: FreeBSD
arch: amd64
cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
│ memccpy.unfixed.out │ memccpy.fixed.out │
│ sec/op │ sec/op vs base │
Short 66.76µ ± 0% 62.45µ ± 1% -6.44% (p=0.000 n=20)
Mid 7.938µ ± 0% 7.967µ ± 0% +0.36% (p=0.001 n=20)
Long 3.577µ ± 0% 3.577µ ± 0% ~ (p=0.429 n=20)
geomean 12.38µ 12.12µ -2.08%
│ memccpy.unfixed.out │ memccpy.fixed.out │
│ B/s │ B/s vs base │
Short 1.744Gi ± 0% 1.864Gi ± 1% +6.89% (p=0.000 n=20)
Mid 14.67Gi ± 0% 14.61Gi ± 0% -0.36% (p=0.001 n=20)
Long 32.55Gi ± 0% 32.55Gi ± 0% ~ (p=0.429 n=20)
geomean 9.407Gi 9.606Gi +2.12%
Reported by: getz
Reviewed by: getz
Approved by: mjg (blanket, via IRC)
See also: D46051
MFC: stable/14
Event: GSoC 2024
Differential Revision: https://reviews.freebsd.org/D46052
260 lines
7.8 KiB
ArmAsm
260 lines
7.8 KiB
ArmAsm
/*
|
|
* Copyright (c) 2023, 2024 The FreeBSD Foundation
|
|
*
|
|
* This software was developed by Robert Clausecker <fuz@FreeBSD.org>
|
|
* under sponsorship from the FreeBSD Foundation.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE
|
|
*/
|
|
|
|
#include <machine/asm.h>
|
|
|
|
#include "amd64_archlevel.h"
|
|
|
|
/* Align the next instruction to 2^4 = 16 bytes, padding with 0x90 bytes. */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * Export memccpy as a weak alias of __memccpy and list the available
 * implementations of __memccpy.  The macros below come from
 * amd64_archlevel.h; presumably they build the table used to pick one
 * variant at run time (confirm against that header).
 */
	.weak	memccpy
	.set	memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)
|
|
|
|
/*
 * __memccpy, scalar variant: memccpy(dst, src, c, len) expressed as one
 * memchr call followed by one memcpy call.
 *
 * In:	rdi = dst, rsi = src, edx = c, rcx = len
 * Out:	rax = dst + (offset of c in src) + 1, or NULL if memchr found no c
 *
 * rbx (callee-saved) carries len across the first call and the result
 * across the second one.
 */
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# set up a frame pointer
	mov	%rsp, %rbp
	push	%rax			# filler slot: keeps rsp 16-byte aligned at both calls
	push	%rbx			# preserve callee-saved register
	push	%rdi			# dst must survive the memchr call
	push	%rsi			# src must survive the memchr call

	mov	%rcx, %rbx		# rbx = len
	mov	%rsi, %rdi		# arg 1: src
	mov	%edx, %esi		# arg 2: c
	mov	%rcx, %rdx		# arg 3: len
	call	CNAME(__memchr)		# rax = address of c in src, or NULL

	pop	%rsi			# rsi = src
	pop	%rdi			# rdi = dst
	mov	%rbx, %rcx		# rcx = len
	lea	1(%rax), %rdx
	sub	%rsi, %rdx		# rdx = bytes up to and including c
	lea	(%rdi, %rdx, 1), %rbx	# rbx = dst + that byte count
	test	%rax, %rax		# did memchr return NULL?
	cmovz	%rcx, %rdx		# yes: copy all len bytes instead
	cmovz	%rax, %rbx		# yes: the result is NULL
	call	CNAME(memcpy)		# memcpy(dst, src, rdx)

	mov	%rbx, %rax		# hand back the saved result
	pop	%rbx
	leave				# drops the filler slot and restores rbp
	ret
ARCHEND(__memccpy, scalar)
|
|
|
|
/*
 * __memccpy, baseline variant: memccpy(dst, src, c, len) using 16-byte
 * vector compares on aligned source chunks.
 *
 * In:	rdi = dst, rsi = src, edx = c, rcx = len
 * Out:	rax = address just past the copy of c in dst, or NULL when c does
 *	not occur in the first len bytes of src (including len == 0)
 *
 * Register roles after the setup code:
 *	xmm4	c replicated into all 16 byte lanes
 *	r9	original, possibly unaligned, src
 *	rsi	src rounded down to a 16-byte boundary
 *	rcx	misalignment of src (0..15)
 *	r11	(misalignment + len - 1) - 32
 *
 * Buffers whose last byte lies within 32 bytes of the alignment boundary
 * take the .Lshort_buf path, which never loads a chunk that contains no
 * buffer byte.
 */
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# rcx = len - 1, index of the final buffer byte
	jb		.Lzero_len		# borrow: len was 0, nothing to do

	movd		%edx, %xmm4
	mov		%rcx, %rdx		# rdx = len - 1
	punpcklbw	%xmm4, %xmm4		# replicate c into 2 bytes
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# ... into 4 bytes
	mov		%rsi, %r9		# r9 = unaligned src, needed for the copies
	pshufd		$0, %xmm4, %xmm4	# ... into all 16 bytes
	and		$~0xf, %rsi		# rsi = aligned start of the head chunk
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# compare the head chunk against c
	and		$0xf, %ecx		# rcx = misalignment of src
	mov		$-1, %eax
	pmovmskb	%xmm1, %r8d		# r8d = one match bit per head byte
	lea		-32(%rcx), %r11
	shl		%cl, %eax		# eax = bits of the bytes at or after src
	add		%rdx, %r11		# r11 = offset of last byte from boundary, minus 32
	jnc		.Lshort_buf		# no carry: that offset is below 32

	and		%r8d, %eax		# drop matches located before src
	jz		.Lsecond_chunk		# head holds no c: look at the next chunk

	/* c sits in the head chunk: at most 16 bytes to copy */
	tzcnt		%eax, %edx		# position of c counted from the boundary
	sub		%ecx, %edx		# rdx = position of c counted from src
	lea		1(%rdi, %rdx, 1), %rax	# result: one past the copy of c
	jmp		.Lcopy_le16

.Lsecond_chunk:
	movdqa		16(%rsi), %xmm3		# xmm3 = second aligned chunk
	movdqu		(%r9), %xmm2		# xmm2 = first 16 bytes starting at src
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# compare the second chunk against c

	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jz		.Lenter_loop		# still no c: switch to the chunk loop

	/* c sits in the second chunk: at most 32 bytes to copy */
	tzcnt		%eax, %edx		# position of c inside the second chunk
	sub		$16, %ecx
	sub		%ecx, %edx		# rdx = position + 16 - misalignment
	lea		1(%rdi, %rdx, 1), %rax	# result: one past the copy of c
	jmp		.Lcopy_le32

	/*
	 * Neither of the first two chunks holds c.  Store both, then turn
	 * rdi into the constant dst - src distance so that (rsi, rdi)
	 * addresses the destination byte matching rsi.
	 */
.Lenter_loop:
	movdqa		32(%rsi), %xmm0		# preload the third chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# store the unaligned head
	sub		%rcx, %rdi		# rdi = dst position matching aligned rsi
	mov		%r11, %rdx
	movdqu		%xmm3, 16(%rdi)		# store the second chunk
	sub		%rsi, %rdi		# rdi = destination minus source distance
	add		$32, %rsi		# rsi = address of the preloaded chunk
	sub		$16, %rdx		# rdx = index of last byte - 16, seen from rsi
	jb		.Lback_up		# buffer ends inside the preloaded chunk

	/*
	 * Chunk loop, two chunks per iteration.  On entry xmm0 holds the
	 * chunk at rsi, which lies entirely inside the buffer, and xmm1
	 * holds a fresh copy of the replicated c.
	 */
	ALIGN_TEXT
.Lloop:
	pcmpeqb		%xmm0, %xmm1		# any c in this chunk?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lfound

	movdqu		%xmm0, (%rsi, %rdi)	# no: store the whole chunk
	movdqa		16(%rsi), %xmm0		# fetch the following chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# does the buffer end inside it?
	jb		.Ltail

	add		$32, %rsi		# step over both chunks of this round
	pcmpeqb		%xmm0, %xmm1		# any c in the second chunk of the round?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lfound_odd

	movdqu		%xmm0, -16(%rsi, %rdi)	# no: store it as well
	movdqa		(%rsi), %xmm0		# fetch the chunk for the next round
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		.Lloop			# that chunk is complete: keep going

.Lback_up:
	sub		$16, %rsi		# make the pending chunk sit at 16(rsi)
	add		$16, %edx		# rdx = index of the last byte in that chunk

	/*
	 * The buffer ends inside the chunk held in xmm0, which was loaded
	 * from 16(rsi); rdx is the index of the last buffer byte in it.
	 */
.Ltail:
	pcmpeqb		%xmm1, %xmm0		# flag the bytes equal to c
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx		# ecx = genuine match bits
	bts		%edx, %r8d		# add a stop bit at the last buffer byte
	tzcnt		%r8d, %r8d		# r8 = index of the final byte to copy
	add		%rsi, %rdi		# turn rdi back into an address
	movdqu		1(%rsi, %r8, 1), %xmm0	# 16 source bytes ending at that byte
	movdqu		%xmm0, 1(%rdi, %r8, 1)	# store them; overlaps bytes already copied
	lea		17(%rdi, %r8, 1), %rsi	# candidate result: one past that byte
	xor		%eax, %eax		# default result: NULL
	bt		%r8d, %ecx		# was the final byte a real match?
	cmovc		%rsi, %rax		# if so, return the candidate
	ret

.Lfound_odd:
	sub		$16, %rsi		# make rsi address the chunk holding c

	/* c found in the chunk at rsi, which lies entirely inside the buffer */
.Lfound:
	tzcnt		%eax, %eax		# rax = index of c in the chunk
	movdqu		-15(%rsi, %rax, 1), %xmm0 # 16 source bytes ending at c
	add		%rsi, %rdi		# turn rdi back into an address
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store them; overlaps bytes already copied
	lea		1(%rdi, %rax, 1), %rax	# result: one past the copy of c
	ret

	/*
	 * Short buffer: the last byte lies in the head chunk or in the
	 * chunk right after it.  A stop bit planted at the last buffer byte
	 * bounds the search, and the second chunk is only loaded when the
	 * head part of the buffer holds neither c nor the stop bit.
	 */
	ALIGN_TEXT
.Lshort_buf:
	add		$32, %r11d		# r11d = offset of last byte from boundary (0..31)
	mov		%r8d, %r10d		# r10d = genuine head match bits
	bts		%r11d, %r8d		# plant the stop bit
	and		%ax, %r8w		# ignore head bits located before src
	jnz		.Lshort_len		# something in the head: second chunk not needed

	pcmpeqb		16(%rsi), %xmm4		# compare the second chunk against c
	pmovmskb	%xmm4, %r8d
	shl		$16, %r8d		# its bytes occupy bits 16..31
	mov		%r8d, %r10d		# r10d = genuine second chunk match bits
	bts		%r11d, %r8d		# plant the stop bit again

.Lshort_len:
	xor		%eax, %eax		# default result: NULL
	tzcnt		%r8d, %edx		# rdx = offset of final byte from the boundary
	lea		1(%rdi, %rdx, 1), %r8	# candidate result, still off by the misalignment
	sub		%rcx, %r8		# ... now corrected
	bt		%edx, %r10d		# was the final byte a real match?
	cmovc		%r8, %rax		# if so, return the candidate
	sub		%ecx, %edx		# rdx = offset of final byte from src

	/*
	 * Copy routines.  Here rdx is the index of the last byte to copy
	 * (byte count minus one), r9 is src, rdi is dst and rax already
	 * holds the return value.  Each size class uses two possibly
	 * overlapping moves anchored at the first and at the last byte.
	 */
	ALIGN_TEXT
.Lcopy_le32:
	cmp		$16, %rdx		# fewer than 17 bytes?
	jb		.Lcopy_le16

	/* 17 to 32 bytes */
	movdqu		(%r9), %xmm0		# leading 16 bytes
	movdqu		-15(%r9, %rdx, 1), %xmm1 # trailing 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx, 1)
	ret

	/* 1 to 16 bytes: pick the size class */
	ALIGN_TEXT
.Lcopy_le16:
	cmp		$8, %rdx		# 9 bytes or more?
	jae		.Lcopy_9_16

	cmp		$4, %rdx		# 5 bytes or more?
	jae		.Lcopy_5_8

	cmp		$2, %rdx		# 3 bytes or more?
	jae		.Lcopy_3_4

	/* 1 or 2 bytes */
	movzbl		(%r9), %ecx		# first byte
	movzbl		(%r9, %rdx, 1), %esi	# last byte, the same one when rdx is 0
	mov		%cl, (%rdi)
	mov		%sil, (%rdi, %rdx, 1)
	ret

	/* 3 or 4 bytes */
.Lcopy_3_4:
	movzwl		(%r9), %ecx		# leading 2 bytes
	movzwl		-1(%r9, %rdx, 1), %esi	# trailing 2 bytes
	mov		%cx, (%rdi)
	mov		%si, -1(%rdi, %rdx, 1)
	ret

	/* 5 to 8 bytes */
.Lcopy_5_8:
	mov		(%r9), %ecx		# leading 4 bytes
	mov		-3(%r9, %rdx, 1), %esi	# trailing 4 bytes
	mov		%ecx, (%rdi)
	mov		%esi, -3(%rdi, %rdx, 1)
	ret

	/* 9 to 16 bytes */
.Lcopy_9_16:
	mov		(%r9), %rcx		# leading 8 bytes
	mov		-7(%r9, %rdx, 1), %rsi	# trailing 8 bytes
	mov		%rcx, (%rdi)
	mov		%rsi, -7(%rdi, %rdx, 1)
	ret

	/* len == 0: nothing is read or written, the result is NULL */
.Lzero_len:
	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)
|
|
|
|
.section .note.GNU-stack,"",%progbits
|