; $Id: ASMMemFirstMismatchingU8.asm 98103 2023-01-17 14:15:46Z vboxsync $
;; @file
; IPRT - ASMMemFirstMismatchingU8().
;

;
; Copyright (C) 2006-2023 Oracle and/or its affiliates.
;
; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
;
; This program is free software; you can redistribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation, in version 3 of the
; License.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, see <https://www.gnu.org/licenses>.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
; in the VirtualBox distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;
; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
;
---|
;*******************************************************************************
;*  Header Files                                                               *
;*******************************************************************************
; RT_ASM_WITH_SEH64 must be defined before the include so the SEH64_*
; prologue macros used by the functions below are available.
%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"


BEGINCODE
---|
;;
; Variant of ASMMemFirstMismatchingU8 with a fixed @a u8 value (zero).
; We repeat the prolog and input normalization here, then join the generic
; function at its '.is_all_zero_joining' label with the scan value zeroed.
;
; @returns Pointer to the first non-zero byte; NULL if all bytes are zero.
;
; @param   msc:rcx gcc:rdi  pv   Pointer to the memory block.
;                                (32-bit: [ebp+08h]; 16-bit: far ptr at [bp+04h])
; @param   msc:rdx gcc:rsi  cb   Number of bytes in the block.
;                                (32-bit: [ebp+0ch]; 16-bit: [bp+08h])
;
RT_BEGINPROC ASMMemFirstNonZero
        ;
        ; Prologue.
        ; Must match ASMMemFirstMismatchingU8 exactly, since we return
        ; through that function's epilogue after the jump below.
        ;
%if ARCH_BITS != 64
        push    xBP
        mov     xBP, xSP
        push    xDI
 %if ARCH_BITS == 16
        push    es
 %endif
%elifdef ASM_CALL64_MSC
        mov     r9, rdi                 ; save rdi in r9
%endif
SEH64_END_PROLOGUE

        ;
        ; Normalize input; rdi=pv, rcx=cb, rax=0 (the value scanned for).
        ; An empty block counts as "all same", i.e. return NULL.
        ;
%if ARCH_BITS == 64
 %ifdef ASM_CALL64_MSC
        mov     rdi, rcx
        mov     rcx, rdx
        jrcxz   RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same)
        xor     eax, eax
 %else
        mov     rcx, rsi
        jrcxz   RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same)
        xor     eax, eax
 %endif

%elif ARCH_BITS == 32
        mov     ecx, [ebp + 0ch]
        jecxz   RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same)
        mov     edi, [ebp + 08h]
        xor     eax, eax

%elif ARCH_BITS == 16
        mov     cx, [bp + 08h]          ; cb
        jcxz    RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return16_all_same)
        les     di, [bp + 04h]          ; pv (far)
        xor     ax, ax                  ; al = scan value 0; ah = 0 for the word scan

%else
 %error "Invalid ARCH_BITS value"
%endif

        ;
        ; Join ASMMemFirstMismatchingU8 at the post-normalization label.
        ;
        jmp     RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.is_all_zero_joining)
ENDPROC ASMMemFirstNonZero
|
---|
| 103 |
|
---|
| 104 |
|
---|
;;
; Inverted memchr.
;
; @returns Pointer to the first byte which doesn't equal u8.
; @returns NULL if all bytes equal u8 (16-bit: dx:ax = 0:0).
;
; @param   msc:rcx gcc:rdi  pv   Pointer to the memory block.
;                                (32-bit: [ebp+08h]; 16-bit: far ptr at [bp+04h])
; @param   msc:rdx gcc:rsi  cb   Number of bytes in the block. This MUST be aligned on 32-bit!
;                                (32-bit: [ebp+0ch]; 16-bit: [bp+08h])
; @param   msc:r8b gcc:dl   u8   The value it's supposed to be filled with.
;                                (32-bit: [ebp+10h]; 16-bit: [bp+0ah])
;
; @cproto DECLINLINE(void *) ASMMemFirstMismatchingU8(void const *pv, size_t cb, uint8_t u8)
;
; @note    The buffer is treated as volatile: after a dword/qword/word scan
;          mismatch the data is re-read byte by byte rather than picking the
;          byte out of the already-loaded value.
;
RT_BEGINPROC ASMMemFirstMismatchingU8
        ;
        ; Prologue.
        ; (ASMMemFirstNonZero above repeats this prologue and jumps in at
        ; .is_all_zero_joining, so keep the two in sync.)
        ;
%if ARCH_BITS != 64
        push    xBP
        mov     xBP, xSP
        push    xDI
 %if ARCH_BITS == 16
        push    es
 %endif
%elifdef ASM_CALL64_MSC
        mov     r9, rdi                 ; save rdi in r9
%endif
SEH64_END_PROLOGUE

%if ARCH_BITS != 16
        ;
        ; The 32-bit and 64-bit variant of the code.
        ;
        ; Register roles from here on:
        ;   xDI = current scan position, xCX = remaining count,
        ;   xAX = u8 replicated into every byte, edx = tail byte count.
        ;

        ; Normalize input; rdi=pv, rcx=cb, rax=eight-times-u8
 %if ARCH_BITS == 64
  %ifdef ASM_CALL64_MSC
        mov     rdi, rcx                ; pv
        mov     rcx, rdx                ; cb
        jrcxz   .return_all_same        ; empty block => NULL
        movzx   r8d, r8b                ; u8
        mov     rax, qword 0101010101010101h
        imul    rax, r8                 ; replicate u8 into all 8 bytes of rax
  %else
        mov     rcx, rsi                ; cb
        jrcxz   .return_all_same        ; empty block => NULL
        movzx   edx, dl                 ; u8
        mov     rax, qword 0101010101010101h
        imul    rax, rdx                ; replicate u8 into all 8 bytes of rax
  %endif

 %elif ARCH_BITS == 32
        mov     ecx, [ebp + 0ch]        ; cb
        jecxz   .return_all_same        ; empty block => NULL
        mov     edi, [ebp + 08h]        ; pv
        movzx   eax, byte [ebp + 10h]   ; u8
        mov     ah, al
        movzx   edx, ax
        shl     eax, 16
        or      eax, edx                ; replicate u8 into all 4 bytes of eax
 %else
  %error "Invalid ARCH_BITS value"
 %endif

.is_all_zero_joining:                   ; Entry point used by ASMMemFirstNonZero.
        cld

        ; Unaligned pointer? Align it (elsewhere).
        test    edi, xCB - 1
        jnz     .unaligned_pv
.aligned_pv:

        ; Do the dword/qword scan.
        mov     edx, xCB - 1
        and     edx, ecx                ; edx = remaining bytes for tail scan
 %if ARCH_BITS == 64
        shr     xCX, 3                  ; bytes -> qwords
        repe scasq
 %else
        shr     xCX, 2                  ; bytes -> dwords
        repe scasd
 %endif
        jne     .multibyte_mismatch

        ; Prep for tail scan.
        mov     ecx, edx

        ;
        ; Byte by byte scan.
        ;
.byte_by_byte:
        repe scasb
        jne     .return_xDI

.return_all_same:
        xor     eax, eax                ; return NULL
 %ifdef ASM_CALL64_MSC
        mov     rdi, r9                 ; restore rdi
 %elif ARCH_BITS == 32
        pop     edi
        leave
 %endif
        ret

        ; Return after byte scan mismatch.
.return_xDI:
        lea     xAX, [xDI - 1]          ; scasb already advanced past the mismatching byte
 %ifdef ASM_CALL64_MSC
        mov     rdi, r9                 ; restore rdi
 %elif ARCH_BITS == 32
        pop     edi
        leave
 %endif
        ret

        ;
        ; Multibyte mismatch. We rewind and do a byte scan of the remainder.
        ; (can't just search the qword as the buffer must be considered volatile).
        ;
.multibyte_mismatch:
        lea     xDI, [xDI - xCB]        ; back up to the mismatching dword/qword
        lea     xCX, [xCX * xCB + xCB]  ; remaining words (incl. this one) -> bytes
        or      ecx, edx                ; fold in tail bytes (low bits of xCX are zero here)
        jmp     .byte_by_byte

        ;
        ; Unaligned pointer. If it's worth it, align the pointer, but if the
        ; memory block is too small do the byte scan variant.
        ;
.unaligned_pv:
        cmp     xCX, 4*xCB              ; 4 steps seems reasonable.
        jbe     .byte_by_byte

        ; Unrolled buffer realignment: scan one byte at a time until xDI is
        ; xCB aligned.  The size check above guarantees xCX cannot hit zero.
 %if ARCH_BITS == 64
        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv
 %endif

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        jmp     .aligned_pv


%else ; ARCH_BITS == 16

        ;
        ; The 16-bit variant of the code is a little simpler since we're
        ; working with two byte words in the 'fast' scan.  We also keep
        ; this separate from the 32-bit/64-bit code because that allows
        ; avoid a few rex prefixes here and there by using extended
        ; registers (e??) where we don't care about the whole register.
        ;
CPU 8086

        ; Load input parameters.
        mov     cx, [bp + 08h]          ; cb
        jcxz    .return16_all_same      ; empty block => NULL
        les     di, [bp + 04h]          ; pv (far)
        mov     al, [bp + 0ah]          ; u8
        mov     ah, al                  ; replicate u8 into both bytes for scasw

.is_all_zero_joining:                   ; Entry point used by ASMMemFirstNonZero.
        cld

        ; Align the pointer (may consume one byte, making the remaining
        ; count cx = cb - 1 from here on).
        test    di, 1
        jz      .word_scan

        dec     cx
        scasb
        jne     .return16_di
        jcxz    .return16_all_same

        ; Scan word-by-word.
.word_scan:
        mov     dx, cx                  ; save the word-scan byte count for tail handling
        shr     cx, 1                   ; bytes -> words
        repe scasw
        jne     .word_mismatch

        ; do we have a tail byte?
        test    dl, 1                   ; parity of the saved word-scan byte count
        jz      .return16_all_same
        scasb
        jne     .return16_di

.return16_all_same:
        xor     ax, ax                  ; return NULL as dx:ax = 0:0
        xor     dx, dx
.return16:
        pop     es
        pop     di
        pop     bp
        ret

.word_mismatch:
        ; back up a word and rescan it (volatile buffer) together with the rest.
        inc     cx
        sub     di, 2

        ; Do byte-by-byte scanning of the rest of the buffer.
        ; Tail byte count is the parity of the word-scan byte count saved in
        ; dx, NOT of cb: if an odd pv was realigned above, one byte was
        ; already consumed, so cb's parity would be off by one here (scanning
        ; one byte past the buffer or missing the last byte).
        shl     cx, 1                   ; words -> bytes (cl is even now)
        and     dl, 1                   ; dl = (word-scan byte count) & 1
        or      cl, dl
        repe scasb
        je      .return16_all_same

.return16_di:
        mov     ax, di                  ; return far pointer es:(di-1) in dx:ax
        dec     ax
        mov     dx, es
        jmp     .return16

%endif ; ARCH_BITS == 16
ENDPROC ASMMemFirstMismatchingU8
|
---|
| 355 |
|
---|