VirtualBox

source: vbox/trunk/include/iprt/asm.h

Last change on this file was 103082, checked in by vboxsync, 4 months ago

fix doxygen complain

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 277.6 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
45/** @def RT_INLINE_ASM_USES_INTRIN
46 * Defined as 1 if we're using a _MSC_VER 1400.
47 * Otherwise defined as 0.
48 */
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# pragma intrinsic(__cpuid)
60# pragma intrinsic(__stosd)
61# pragma intrinsic(__stosw)
62# pragma intrinsic(__stosb)
63# pragma intrinsic(_BitScanForward)
64# pragma intrinsic(_BitScanReverse)
65# pragma intrinsic(_bittest)
66# pragma intrinsic(_bittestandset)
67# pragma intrinsic(_bittestandreset)
68# pragma intrinsic(_bittestandcomplement)
69# pragma intrinsic(_byteswap_ushort)
70# pragma intrinsic(_byteswap_ulong)
71# pragma intrinsic(_interlockedbittestandset)
72# pragma intrinsic(_interlockedbittestandreset)
73# pragma intrinsic(_InterlockedAnd)
74# pragma intrinsic(_InterlockedOr)
75# pragma intrinsic(_InterlockedXor)
76# pragma intrinsic(_InterlockedIncrement)
77# pragma intrinsic(_InterlockedDecrement)
78# pragma intrinsic(_InterlockedExchange)
79# pragma intrinsic(_InterlockedExchangeAdd)
80# pragma intrinsic(_InterlockedCompareExchange)
81# pragma intrinsic(_InterlockedCompareExchange8)
82# pragma intrinsic(_InterlockedCompareExchange16)
83# pragma intrinsic(_InterlockedCompareExchange64)
84# pragma intrinsic(_rotl)
85# pragma intrinsic(_rotr)
86# pragma intrinsic(_rotl64)
87# pragma intrinsic(_rotr64)
88# ifdef RT_ARCH_AMD64
89# pragma intrinsic(__stosq)
90# pragma intrinsic(_byteswap_uint64)
91# pragma intrinsic(_InterlockedCompareExchange128)
92# pragma intrinsic(_InterlockedExchange64)
93# pragma intrinsic(_InterlockedExchangeAdd64)
94# pragma intrinsic(_InterlockedAnd64)
95# pragma intrinsic(_InterlockedOr64)
96# pragma intrinsic(_InterlockedIncrement64)
97# pragma intrinsic(_InterlockedDecrement64)
98# endif
99#endif
100
/* The FEAT_LSE settings below are only enabled for ARM64 Darwin builds (and
   when doxygen is running). */
101#if (defined(RT_ARCH_ARM64) && defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING)
102/** @def RTASM_ARM64_USE_FEAT_LSE
103 * Use instructions from the FEAT_LSE set to implement atomic operations,
104 * assuming that the host CPU always supports these. */
105# define RTASM_ARM64_USE_FEAT_LSE 1
106/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
107 * Set to use the FEAT_LSE instructions without a DMB barrier in most places
 * and rely on the acquire-release aspects of the instructions to do the
108 * serializing. The assumption is that the tstRTInline
109 * benchmark may be skewing the results testing an unusual scenario.
 *
 * When defined, the exchange functions in this file use the SWPAL* forms on
 * their own; otherwise they emit RTASM_ARM_DMB_SY followed by the plain
 * SWP* forms. */
110# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB 1
111#endif
112
113
114/*
115 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
116 */
117#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
118# include "asm-watcom-x86-16.h"
119#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
120# include "asm-watcom-x86-32.h"
121#endif
122
123
124/** @defgroup grp_rt_asm ASM - Assembly Routines
125 * @ingroup grp_rt
126 *
127 * @remarks The difference between ordered and unordered atomic operations is
128 * that the former will complete outstanding reads and writes before
129 * continuing while the latter doesn't make any promises about the
130 * order. Ordered operations don't, it seems, make any 100% promise
131 * with respect to whether the operation will complete before any
132 * subsequent memory access. (Please correct if wrong.)
133 *
134 * ASMAtomicSomething operations are all ordered, while
135 * ASMAtomicUoSomething are unordered (note the Uo).
136 *
137 * Please note that ordered operations do not necessarily imply a
138 * compiler (memory) barrier. The user has to use the
139 * ASMCompilerBarrier() macro when that is deemed necessary.
140 *
141 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
142 * to reorder or even optimize assembler instructions away. For
143 * instance, in the following code the second rdmsr instruction is
144 * optimized away because gcc treats that instruction as deterministic:
145 *
146 * @code
147 * static inline uint64_t rdmsr_low(int idx)
148 * {
149 * uint32_t low;
150 * __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
 * return low;
151 * }
152 * ...
153 * uint32_t msr1 = rdmsr_low(1);
154 * foo(msr1);
155 * msr1 = rdmsr_low(1);
156 * bar(msr1);
157 * @endcode
158 *
159 * The input parameter of rdmsr_low is the same for both calls and
160 * therefore gcc will use the result of the first call as input
161 * parameter for bar() as well. For rdmsr this is not acceptable as
162 * this instruction is _not_ deterministic. This applies to reading
163 * machine status information in general.
164 *
165 * @{
166 */
167
168
169/** @def RT_INLINE_ASM_GCC_4_3_X_X86
170 * Used to work around some 4.3.x register allocation issues in this version of
171 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
172 * definitely not for 5.x
 *
 * Defined as 1 when RT_GNUC_PREREQ(4, 3) is true, RT_GNUC_PREREQ(5, 0) is
 * false and __i386__ is defined; otherwise defined as 0.  The value feeds
 * into the default of RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC. */
173#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
174# define RT_INLINE_ASM_GCC_4_3_X_X86 1
175#else
176# define RT_INLINE_ASM_GCC_4_3_X_X86 0
177#endif
178
179/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
180 * i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493) screws up
181 * RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release builds. PIC
182 * mode, x86.
183 *
184 * Some gcc 4.3.x versions may have register allocation issues with cmpxchg8b
185 * when in PIC mode on x86.
 *
 * The macro is only given a default here when it is not already defined:
 *   - 1 when doxygen is running or the compiler is Watcom,
 *   - 0 for Visual C++,
 *   - 1 for PIC code on RT_ARCH_X86 when either RT_INLINE_ASM_GCC_4_3_X_X86
 *     is non-zero or RT_OS_DARWIN is defined,
 *   - 0 in all other cases.
 *
 * When non-zero, ASMAtomicXchgU64 is only declared as a prototype instead of
 * being defined inline.
186 */
187#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
188# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
189# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
190# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
191# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
192# elif ( (defined(PIC) || defined(__PIC__)) \
193 && defined(RT_ARCH_X86) \
194 && ( RT_INLINE_ASM_GCC_4_3_X_X86 \
195 || defined(RT_OS_DARWIN)) )
196# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
197# else
198# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
199# endif
200#endif
201
202
203/*
204 * ARM is great fun.
 *
 * Helper macros for the ARM inline assembly in this file.  For each barrier
 * kind (NO_BARRIER, DSB_SY, DMB_SY, DMB_ST, DMB_LD) there is a family of
 * macros:
 *   - RTASM_ARM_<kind>:               instruction text to splice into the
 *                                     assembly template (a string literal,
 *                                     or nothing for NO_BARRIER).
 *   - RTASM_ARM_<kind>_IN_REG:        input operand text belonging to that
 *                                     instruction.
 *   - RTASM_ARM_<kind>_COMMA_IN_REG:  the same, for appending to an existing
 *                                     input operand list.
205 */
206#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
207
/* No barrier: expands to nothing on both ARM32 and ARM64. */
208# define RTASM_ARM_NO_BARRIER
209# ifdef RT_ARCH_ARM64
/* ARM64: the dsb/dmb instruction strings below reference no operands, so all
   the *_IN_REG and *_COMMA_IN_REG macros are empty. */
210# define RTASM_ARM_NO_BARRIER_IN_REG
211# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
212# define RTASM_ARM_DSB_SY "dsb sy\n\t"
213# define RTASM_ARM_DSB_SY_IN_REG
214# define RTASM_ARM_DSB_SY_COMMA_IN_REG
215# define RTASM_ARM_DMB_SY "dmb sy\n\t"
216# define RTASM_ARM_DMB_SY_IN_REG
217# define RTASM_ARM_DMB_SY_COMMA_IN_REG
218# define RTASM_ARM_DMB_ST "dmb st\n\t"
219# define RTASM_ARM_DMB_ST_IN_REG
220# define RTASM_ARM_DMB_ST_COMMA_IN_REG
221# define RTASM_ARM_DMB_LD "dmb ld\n\t"
222# define RTASM_ARM_DMB_LD_IN_REG
223# define RTASM_ARM_DMB_LD_COMMA_IN_REG
/* Picks the first (64-bit) expression when building for ARM64. */
224# define RTASM_ARM_PICK_6432(expr64, expr32) expr64
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32
 * ARM64 variant: declares the locals rcSpill and u32NewRet and emits an
 * exclusive load / modify / store loop on the 32-bit variable *a_pu32Mem.
 *
 * The loop consists of the label, the selected barrier text, an ldaxr into
 * u32NewRet (operand uNew), the caller supplied modify64 text, an stlxr of
 * uNew back to memory and a cbnz that jumps back to the label while the
 * store status in rcSpill is non-zero.  The stored value is left in
 * u32NewRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu32Mem     Pointer to the 32-bit variable to operate on.
 * @param barrier_type  Barrier kind; pasted onto RTASM_ARM_.
 * @param modify64      Instruction text inserted between load and store.
 * @param modify32      Not used by this variant.
 * @param in_reg        Input operand list for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
225# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
226 uint32_t rcSpill; \
227 uint32_t u32NewRet; \
228 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
229 RTASM_ARM_##barrier_type /* before label? */ \
230 "ldaxr %w[uNew], %[pMem]\n\t" \
231 modify64 \
232 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
233 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
234 : [pMem] "+Q" (*a_pu32Mem) \
235 , [uNew] "=&r" (u32NewRet) \
236 , [rc] "=&r" (rcSpill) \
237 : in_reg \
238 : "cc")
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32
 * ARM64 variant: declares the locals rcSpill, u32OldRet and u32NewSpill and
 * emits an exclusive load / modify / store loop on the 32-bit variable
 * *a_pu32Mem.
 *
 * The ldaxr loads the current value into u32OldRet (operand uOld), the stlxr
 * stores u32NewSpill (operand uNew), and the cbnz retries from the label while
 * the store status in rcSpill is non-zero.  The modify64 text sits between
 * the two and is presumably expected to compute uNew from uOld (nothing else
 * in the template writes uNew).  The old value is left in u32OldRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu32Mem     Pointer to the 32-bit variable to operate on.
 * @param barrier_type  Barrier kind; pasted onto RTASM_ARM_.
 * @param modify64      Instruction text inserted between load and store.
 * @param modify32      Not used by this variant.
 * @param in_reg        Input operand list for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
239# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
240 uint32_t rcSpill; \
241 uint32_t u32OldRet; \
242 uint32_t u32NewSpill; \
243 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
244 RTASM_ARM_##barrier_type /* before label? */ \
245 "ldaxr %w[uOld], %[pMem]\n\t" \
246 modify64 \
247 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
248 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
249 : [pMem] "+Q" (*a_pu32Mem) \
250 , [uOld] "=&r" (u32OldRet) \
251 , [uNew] "=&r" (u32NewSpill) \
252 , [rc] "=&r" (rcSpill) \
253 : in_reg \
254 : "cc")
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64
 * ARM64 variant: declares the locals rcSpill and u64NewRet and emits an
 * exclusive load / modify / store loop on the 64-bit variable *a_pu64Mem.
 *
 * Same structure as the 32-bit RET_NEW macro, except that the value operand
 * is referenced without the 'w' modifier (the status operand rc still uses
 * it).  The stored value is left in u64NewRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu64Mem     Pointer to the 64-bit variable to operate on.
 * @param barrier_type  Barrier kind; pasted onto RTASM_ARM_.
 * @param modify64      Instruction text inserted between load and store.
 * @param modify32      Not used by this variant.
 * @param in_reg        Input operand list for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
255# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
256 uint32_t rcSpill; \
257 uint64_t u64NewRet; \
258 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
259 RTASM_ARM_##barrier_type /* before label? */ \
260 "ldaxr %[uNew], %[pMem]\n\t" \
261 modify64 \
262 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
263 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
264 : [pMem] "+Q" (*a_pu64Mem) \
265 , [uNew] "=&r" (u64NewRet) \
266 , [rc] "=&r" (rcSpill) \
267 : in_reg \
268 : "cc")
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64
 * ARM64 variant: declares the locals rcSpill, u64OldRet and u64NewSpill and
 * emits an exclusive load / modify / store loop on the 64-bit variable
 * *a_pu64Mem.
 *
 * The ldaxr loads the current value into u64OldRet (operand uOld), the stlxr
 * stores u64NewSpill (operand uNew), and the cbnz retries from the label while
 * the store status in rcSpill is non-zero.  The modify64 text is presumably
 * expected to compute uNew from uOld.  The old value is left in u64OldRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu64Mem     Pointer to the 64-bit variable to operate on.
 * @param barrier_type  Barrier kind; pasted onto RTASM_ARM_.
 * @param modify64      Instruction text inserted between load and store.
 * @param modify32      Not used by this variant.
 * @param in_reg        Input operand list for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
269# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
270 uint32_t rcSpill; \
271 uint64_t u64OldRet; \
272 uint64_t u64NewSpill; \
273 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
274 RTASM_ARM_##barrier_type /* before label? */ \
275 "ldaxr %[uOld], %[pMem]\n\t" \
276 modify64 \
277 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
278 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
279 : [pMem] "+Q" (*a_pu64Mem) \
280 , [uOld] "=&r" (u64OldRet) \
281 , [uNew] "=&r" (u64NewSpill) \
282 , [rc] "=&r" (rcSpill) \
283 : in_reg \
284 : "cc")
285
286# else /* RT_ARCH_ARM32 */
/* Picks the second (32-bit) expression when building for ARM32. */
287# define RTASM_ARM_PICK_6432(expr64, expr32) expr32
/* ARMv7 and later: dsb/dmb instructions are used.  The *_IN_REG macros
   supply an "X" (0xfade) input operand that the instruction text does not
   reference. */
288# if RT_ARCH_ARM32 >= 7
/* NOTE(review): this #warning (and the two in the branches below) fires on
   every inclusion for the matching architecture level; looks like a
   debugging aid - confirm it is intended. */
289# warning armv7
290# define RTASM_ARM_NO_BARRIER_IN_REG
291# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
292# define RTASM_ARM_DSB_SY "dsb sy\n\t"
293# define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
294# define RTASM_ARM_DMB_SY "dmb sy\n\t"
295# define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
296# define RTASM_ARM_DMB_ST "dmb st\n\t"
297# define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
298# define RTASM_ARM_DMB_LD "dmb ld\n\t"
299# define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)
300
/* ARMv6: barriers are issued as coprocessor 15 writes (c7, c10, opcode 4 for
   DSB and opcode 5 for DMB) of a zero register, which is why the *_IN_REG
   macros supply the uZero operand.  DMB_ST and DMB_LD map to DMB_SY.
   NOTE(review): RTASM_ARM_NO_BARRIER_IN_REG / _COMMA_IN_REG are only defined
   in the ARMv7 branch above - confirm whether they are needed here too. */
301# elif RT_ARCH_ARM32 >= 6
302# warning armv6
303# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
304# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
305# define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
306# define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
307# define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
308# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
309# define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
310# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG
311
/* ARMv4 / ARMv5: only the c7, c10, 4 coprocessor write is used; all DMB
   kinds map to DSB_SY. */
312# elif RT_ARCH_ARM32 >= 4
313# warning armv5 or older
314# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
315# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
316# define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
317# define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
318# define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
319# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
320# define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
321# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
322# else
323# error "huh? Odd RT_ARCH_ARM32 value!"
324# endif
/* Comma-prefixed forms of the input operands, shared by all ARM32 levels. */
325# define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
326# define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
327# define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
328# define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32
 * ARM32 variant: declares the locals rcSpill and u32NewRet and emits an
 * exclusive load / modify / store loop on the 32-bit variable *a_pu32Mem.
 *
 * The loop consists of the label, the selected barrier text, an ldrex into
 * u32NewRet (operand uNew), the caller supplied modify32 text, a strex of
 * uNew back to memory, and a cmp/bne pair that jumps back to the label while
 * the store status in rcSpill is non-zero.  The barrier's own input operand
 * (RTASM_ARM_<barrier_type>_IN_REG) is placed in front of in_reg.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu32Mem     Pointer to the 32-bit variable to operate on.
 * @param barrier_type  Barrier kind; concatenated onto RTASM_ARM_.
 * @param modify64      Not used by this variant.
 * @param modify32      Instruction text inserted between load and store.
 * @param in_reg        Further input operand(s) for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
329# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
330 uint32_t rcSpill; \
331 uint32_t u32NewRet; \
332 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
333 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
334 "ldrex %[uNew], %[pMem]\n\t" \
335 modify32 \
336 "strex %[rc], %[uNew], %[pMem]\n\t" \
337 "cmp %[rc], #0\n\t" \
338 "bne Ltry_again_" #name "_%=\n\t" \
339 : [pMem] "+m" (*a_pu32Mem) \
340 , [uNew] "=&r" (u32NewRet) \
341 , [rc] "=&r" (rcSpill) \
342 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
343 , in_reg \
344 : "cc")
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32
 * ARM32 variant: declares the locals rcSpill, u32OldRet and u32NewSpill and
 * emits an exclusive load / modify / store loop on the 32-bit variable
 * *a_pu32Mem.
 *
 * The ldrex loads the current value into u32OldRet (operand uOld), the strex
 * stores u32NewSpill (operand uNew), and cmp/bne retries from the label while
 * the store status in rcSpill is non-zero.  The modify32 text is presumably
 * expected to compute uNew from uOld.  The old value is left in u32OldRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu32Mem     Pointer to the 32-bit variable to operate on.
 * @param barrier_type  Barrier kind; concatenated onto RTASM_ARM_.
 * @param modify64      Not used by this variant.
 * @param modify32      Instruction text inserted between load and store.
 * @param in_reg        Further input operand(s) for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
345# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
346 uint32_t rcSpill; \
347 uint32_t u32OldRet; \
348 uint32_t u32NewSpill; \
349 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
350 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
351 "ldrex %[uOld], %[pMem]\n\t" \
352 modify32 \
353 "strex %[rc], %[uNew], %[pMem]\n\t" \
354 "cmp %[rc], #0\n\t" \
355 "bne Ltry_again_" #name "_%=\n\t" \
356 : [pMem] "+m" (*a_pu32Mem) \
357 , [uOld] "=&r" (u32OldRet) \
358 , [uNew] "=&r" (u32NewSpill) \
359 , [rc] "=&r" (rcSpill) \
360 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
361 , in_reg \
362 : "cc")
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64
 * ARM32 variant: declares the locals rcSpill and u64NewRet and emits an
 * exclusive load / modify / store loop on the 64-bit variable *a_pu64Mem.
 *
 * Uses ldrexd / strexd, passing the 64-bit operand as a pair (the plain
 * operand and its %H form).  The modify32 text sits between load and store,
 * and cmp/bne retries from the label while the store status in rcSpill is
 * non-zero.  The stored value is left in u64NewRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu64Mem     Pointer to the 64-bit variable to operate on.
 * @param barrier_type  Barrier kind; concatenated onto RTASM_ARM_.
 * @param modify64      Not used by this variant.
 * @param modify32      Instruction text inserted between load and store.
 * @param in_reg        Further input operand(s) for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
363# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
364 uint32_t rcSpill; \
365 uint64_t u64NewRet; \
366 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
367 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
368 "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
369 modify32 \
370 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
371 "cmp %[rc], #0\n\t" \
372 "bne Ltry_again_" #name "_%=\n\t" \
373 : [pMem] "+m" (*a_pu64Mem), \
374 [uNew] "=&r" (u64NewRet), \
375 [rc] "=&r" (rcSpill) \
376 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
377 , in_reg \
378 : "cc")
/** @def RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64
 * ARM32 variant: declares the locals rcSpill, u64OldRet and u64NewSpill and
 * emits an exclusive load / modify / store loop on the 64-bit variable
 * *a_pu64Mem.
 *
 * The ldrexd loads the current value into u64OldRet (operand uOld and its %H
 * form), the strexd stores u64NewSpill (operand uNew and its %H form), and
 * cmp/bne retries from the label while the store status in rcSpill is
 * non-zero.  The modify32 text is presumably expected to compute uNew from
 * uOld.  The old value is left in u64OldRet.
 *
 * @param name          Made part of the label name (together with %=).
 * @param a_pu64Mem     Pointer to the 64-bit variable to operate on.
 * @param barrier_type  Barrier kind; concatenated onto RTASM_ARM_.
 * @param modify64      Not used by this variant.
 * @param modify32      Instruction text inserted between load and store.
 * @param in_reg        Further input operand(s) for the asm statement.
 *
 * @note The expansion does not end with a semicolon.
 */
379# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
380 uint32_t rcSpill; \
381 uint64_t u64OldRet; \
382 uint64_t u64NewSpill; \
383 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
384 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
385 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
386 modify32 \
387 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
388 "cmp %[rc], #0\n\t" \
389 "bne Ltry_again_" #name "_%=\n\t" \
390 : [pMem] "+m" (*a_pu64Mem), \
391 [uOld] "=&r" (u64OldRet), \
392 [uNew] "=&r" (u64NewSpill), \
393 [rc] "=&r" (rcSpill) \
394 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
395 , in_reg \
396 : "cc")
397# endif /* RT_ARCH_ARM32 */
398#endif
399
400
401/** @def ASMReturnAddress
402 * Gets the return address of the current (or calling if you like) function or method.
 *
 * Compiler mapping:
 *   - Visual C++: the _ReturnAddress() intrinsic, which is declared here
 *     (with C linkage when compiling as C++).
 *   - GCC compatible compilers and doxygen: __builtin_return_address(0).
 *   - Watcom: expands to a call to a function with a self-explanatory name
 *     that is not declared in this file; presumably so that any use fails
 *     to build.
 *   - Anything else is rejected with an \#error.
403 */
404#ifdef _MSC_VER
405# ifdef __cplusplus
406extern "C"
407# endif
408void * _ReturnAddress(void);
409# pragma intrinsic(_ReturnAddress)
410# define ASMReturnAddress() _ReturnAddress()
411#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
412# define ASMReturnAddress() __builtin_return_address(0)
413#elif defined(__WATCOMC__)
414# define ASMReturnAddress() Watcom_does_not_appear_to_have_intrinsic_return_address_function()
415#else
416# error "Unsupported compiler."
417#endif
418
419
420/**
421 * Compiler memory barrier.
422 *
423 * Ensure that the compiler does not use any cached (register/tmp stack) memory
424 * values or any outstanding writes when returning from this function.
425 *
426 * This function must be used if non-volatile data is modified by a
427 * device or the VMM. Typical cases are port access, MMIO access,
428 * trapping instruction, etc.
 *
 * Implementation variants:
 *   - GNU style inline assembly: an empty asm statement with a "memory"
 *     clobber, wrapped in a do-while(0) macro.
 *   - Compilers using intrinsics: _ReadWriteBarrier(), also as a macro.
 *   - Watcom: only a prototype is declared here.
 *   - Otherwise: an inline function containing an empty __asm block.
429 */
430#if RT_INLINE_ASM_GNU_STYLE
431# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
432#elif RT_INLINE_ASM_USES_INTRIN
433# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
434#elif defined(__WATCOMC__)
435void ASMCompilerBarrier(void);
436#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
437DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
438{
439 __asm
440 {
441 }
442}
443#endif
444
445
446/** @def ASMBreakpoint
447 * Debugger Breakpoint.
 *
 * Plain alias that expands to RT_BREAKPOINT().
 *
448 * @deprecated Use RT_BREAKPOINT instead.
449 * @internal
450 */
451#define ASMBreakpoint() RT_BREAKPOINT()
452
453
454/**
455 * Spinloop hint for platforms that have these, empty function on the other
456 * platforms.
457 *
458 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detecting
459 * spin locks.
 *
 * ARM32 & ARM64: The YIELD instruction.
460 */
/* On x86/AMD64 with RT_INLINE_ASM_EXTERNAL_TMP_ARM set, only the prototype is
   declared here. */
461#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
462RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
463#else
464DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
465{
466# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
 /* The instruction is emitted as the raw bytes 0xf3 0x90 rather than by
    mnemonic, in both the GNU and the MSC style variant. */
467# if RT_INLINE_ASM_GNU_STYLE
468 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
469# else
470 __asm {
471 _emit 0f3h
472 _emit 090h
473 }
474# endif
475
476# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
477 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
478
479# else
480 /* dummy */
481# endif
482}
483#endif
484
485
486/**
487 * Atomically Exchange an unsigned 8-bit value, ordered.
488 *
 * Implementation variants:
 *   - x86 / AMD64: a single XCHG between a byte register and the memory
 *     operand (GNU or MSC style inline assembly).
 *   - ARM with RTASM_ARM64_USE_FEAT_LSE: a single SWPALB, or RTASM_ARM_DMB_SY
 *     followed by SWPB when RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB is not
 *     defined.
 *   - Other ARM: RTASM_ARM_DMB_SY plus an exclusive load / store pair
 *     (LDAXRB/STLXRB on ARM64, LDREXB/STREXB on ARM32), retried from the
 *     label until the store status is zero.
 *
489 * @returns The value *pu8 held before the exchange.
490 * @param pu8 Pointer to the 8-bit variable to update.
491 * @param u8 The 8-bit value to assign to *pu8.
492 */
/* With RT_INLINE_ASM_EXTERNAL_TMP_ARM set, only the prototype is declared. */
493#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
494RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
495#else
496DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
497{
498# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
499# if RT_INLINE_ASM_GNU_STYLE
 /* u8 is both input (tied to output operand 1) and output, so it holds
    the old memory value after the exchange. */
500 __asm__ __volatile__("xchgb %0, %1\n\t"
501 : "=m" (*pu8)
502 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
503 : "1" (u8)
504 , "m" (*pu8));
505# else
 /* MSC style: load pointer and value, exchange, write the old value back
    into u8. */
506 __asm
507 {
508# ifdef RT_ARCH_AMD64
509 mov rdx, [pu8]
510 mov al, [u8]
511 xchg [rdx], al
512 mov [u8], al
513# else
514 mov edx, [pu8]
515 mov al, [u8]
516 xchg [edx], al
517 mov [u8], al
518# endif
519 }
520# endif
521 return u8;
522
523# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
 /* 32-bit temporary receiving the old value; truncated to 8 bits on return. */
524 uint32_t uOld;
525# if defined(RTASM_ARM64_USE_FEAT_LSE)
526 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
527 have the barrier we shouldn't need that, right? Ordering should be taken
528 care of by the DMB. The SWPB is rather cheap (~70% faster). */
529 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
530# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
531 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
532# else
533 RTASM_ARM_DMB_SY
534 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
535# endif
536 : [pMem] "+Q" (*pu8)
537 , [uOld] "=&r" (uOld)
538 : [uNew] "r" ((uint32_t)u8)
539 : );
540# else
 /* rcSpill receives the status of the exclusive store; non-zero restarts
    the sequence at the label. */
541 uint32_t rcSpill;
542 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
543 RTASM_ARM_DMB_SY
544# if defined(RT_ARCH_ARM64)
545 "ldaxrb %w[uOld], %[pMem]\n\t"
546 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
547 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
548# else
549 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
550 "strexb %[rc], %[uNew], %[pMem]\n\t"
551 "cmp %[rc], #0\n\t"
552 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
553# endif
554 : [pMem] "+Q" (*pu8)
555 , [uOld] "=&r" (uOld)
556 , [rc] "=&r" (rcSpill)
557 : [uNew] "r" ((uint32_t)u8)
558 RTASM_ARM_DMB_SY_COMMA_IN_REG
559 : "cc");
560# endif
561 return (uint8_t)uOld;
562
563# else
564# error "Port me"
565# endif
566}
567#endif
568
569
570/**
571 * Atomically Exchange a signed 8-bit value, ordered.
572 *
573 * @returns Current *pu8 value
574 * @param pi8 Pointer to the 8-bit variable to update.
575 * @param i8 The 8-bit value to assign to *pi8.
576 */
577DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
578{
579 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
580}
581
582
583/**
584 * Atomically Exchange a bool value, ordered.
585 *
586 * @returns Current *pf value
587 * @param pf Pointer to the 8-bit variable to update.
588 * @param f The 8-bit value to assign to *pi8.
589 */
590DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
591{
592#ifdef _MSC_VER
593 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
594#else
595 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
596#endif
597}
598
599
600/**
601 * Atomically Exchange an unsigned 16-bit value, ordered.
602 *
 * Implementation variants:
 *   - x86 / AMD64: a single XCHG between a 16-bit register and the memory
 *     operand (GNU or MSC style inline assembly).
 *   - ARM with RTASM_ARM64_USE_FEAT_LSE: a single SWPALH, or RTASM_ARM_DMB_SY
 *     followed by SWPH when RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB is not
 *     defined.
 *   - Other ARM: RTASM_ARM_DMB_SY plus an exclusive load / store pair
 *     (LDAXRH/STLXRH on ARM64, LDREXH/STREXH on ARM32), retried from the
 *     label until the store status is zero.
 *
603 * @returns The value *pu16 held before the exchange.
604 * @param pu16 Pointer to the 16-bit variable to update.
605 * @param u16 The 16-bit value to assign to *pu16.
606 */
/* With RT_INLINE_ASM_EXTERNAL_TMP_ARM set, only the prototype is declared. */
607#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
608RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
609#else
610DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
611{
612# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
613# if RT_INLINE_ASM_GNU_STYLE
 /* u16 is both input (tied to output operand 1) and output, so it holds
    the old memory value after the exchange. */
614 __asm__ __volatile__("xchgw %0, %1\n\t"
615 : "=m" (*pu16)
616 , "=r" (u16)
617 : "1" (u16)
618 , "m" (*pu16));
619# else
 /* MSC style: load pointer and value, exchange, write the old value back
    into u16. */
620 __asm
621 {
622# ifdef RT_ARCH_AMD64
623 mov rdx, [pu16]
624 mov ax, [u16]
625 xchg [rdx], ax
626 mov [u16], ax
627# else
628 mov edx, [pu16]
629 mov ax, [u16]
630 xchg [edx], ax
631 mov [u16], ax
632# endif
633 }
634# endif
635 return u16;
636
637# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
 /* 32-bit temporary receiving the old value; truncated to 16 bits on return. */
638 uint32_t uOld;
639# if defined(RTASM_ARM64_USE_FEAT_LSE)
640 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
641 slower if we remove the barrier. But since we have the barrier we
642 shouldn't need that, right? Ordering should be taken care of by the DMB.
643 The SWPH is rather cheap (~70% faster). */
644 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
645# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
646 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
647# else
648 RTASM_ARM_DMB_SY
649 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
650# endif
651 : [pMem] "+Q" (*pu16)
652 , [uOld] "=&r" (uOld)
653 : [uNew] "r" ((uint32_t)u16)
654 : );
655# else
 /* rcSpill receives the status of the exclusive store; non-zero restarts
    the sequence at the label. */
656 uint32_t rcSpill;
657 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
658 RTASM_ARM_DMB_SY
659# if defined(RT_ARCH_ARM64)
660 "ldaxrh %w[uOld], %[pMem]\n\t"
661 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
662 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
663# else
664 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
665 "strexh %[rc], %[uNew], %[pMem]\n\t"
666 "cmp %[rc], #0\n\t"
667 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
668# endif
669 : [pMem] "+Q" (*pu16)
670 , [uOld] "=&r" (uOld)
671 , [rc] "=&r" (rcSpill)
672 : [uNew] "r" ((uint32_t)u16)
673 RTASM_ARM_DMB_SY_COMMA_IN_REG
674 : "cc");
675# endif
676 return (uint16_t)uOld;
677
678# else
679# error "Port me"
680# endif
681}
682#endif
683
684
685/**
686 * Atomically Exchange a signed 16-bit value, ordered.
687 *
688 * @returns Current *pu16 value
689 * @param pi16 Pointer to the 16-bit variable to update.
690 * @param i16 The 16-bit value to assign to *pi16.
691 */
692DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
693{
694 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
695}
696
697
698/**
699 * Atomically Exchange an unsigned 32-bit value, ordered.
700 *
 * Implementation variants:
 *   - x86 / AMD64: a single XCHG (GNU or MSC style inline assembly), or the
 *     _InterlockedExchange intrinsic when intrinsics are in use.
 *   - ARM with RTASM_ARM64_USE_FEAT_LSE: a single SWPAL, or RTASM_ARM_DMB_SY
 *     followed by SWP when RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB is not
 *     defined.
 *   - Other ARM: RTASM_ARM_DMB_SY plus an exclusive load / store pair
 *     (LDAXR/STLXR on ARM64, LDREX/STREX on ARM32), retried from the label
 *     until the store status is zero.
 *
701 * @returns The value *pu32 held before the exchange.
702 * @param pu32 Pointer to the 32-bit variable to update.
703 * @param u32 The 32-bit value to assign to *pu32.
704 *
705 * @remarks Does not work on 286 and earlier.
706 */
/* Only the prototype is declared when RT_INLINE_ASM_EXTERNAL_TMP_ARM is set
   and intrinsics are not used. */
707#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
708RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
709#else
710DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
711{
712# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
713# if RT_INLINE_ASM_GNU_STYLE
 /* u32 is both input (tied to output operand 1) and output, so it holds
    the old memory value after the exchange. */
714 __asm__ __volatile__("xchgl %0, %1\n\t"
715 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
716 , "=r" (u32)
717 : "1" (u32)
718 , "m" (*pu32));
719
720# elif RT_INLINE_ASM_USES_INTRIN
 /* The intrinsic's return value is stored back into u32, which is what
    gets returned below. */
721 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
722
723# else
 /* MSC style: load pointer and value, exchange, write the old value back
    into u32. */
724 __asm
725 {
726# ifdef RT_ARCH_AMD64
727 mov rdx, [pu32]
728 mov eax, u32
729 xchg [rdx], eax
730 mov [u32], eax
731# else
732 mov edx, [pu32]
733 mov eax, u32
734 xchg [edx], eax
735 mov [u32], eax
736# endif
737 }
738# endif
739 return u32;
740
741# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
 /* Receives the old value of *pu32. */
742 uint32_t uOld;
743# if defined(RTASM_ARM64_USE_FEAT_LSE)
744 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
745 slower if we remove the barrier. But since we have the barrier we
746 shouldn't need that, right? Ordering should be taken care of by the DMB.
747 The SWP is rather cheap (~70% faster). */
748 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
749# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
750 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
751# else
752 RTASM_ARM_DMB_SY
753 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
754# endif
755 : [pMem] "+Q" (*pu32)
756 , [uOld] "=&r" (uOld)
757 : [uNew] "r" (u32)
758 : );
759# else
 /* rcSpill receives the status of the exclusive store; non-zero restarts
    the sequence at the label. */
760 uint32_t rcSpill;
761 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
762 RTASM_ARM_DMB_SY
763# if defined(RT_ARCH_ARM64)
764 "ldaxr %w[uOld], %[pMem]\n\t"
765 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
766 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
767# else
768 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
769 "strex %[rc], %[uNew], %[pMem]\n\t"
770 "cmp %[rc], #0\n\t"
771 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
772# endif
773 : [pMem] "+Q" (*pu32)
774 , [uOld] "=&r" (uOld)
775 , [rc] "=&r" (rcSpill)
776 : [uNew] "r" (u32)
777 RTASM_ARM_DMB_SY_COMMA_IN_REG
778 : "cc");
779# endif
780 return uOld;
781
782# else
783# error "Port me"
784# endif
785}
786#endif
787
788
789/**
790 * Atomically Exchange a signed 32-bit value, ordered.
791 *
792 * @returns Current *pu32 value
793 * @param pi32 Pointer to the 32-bit variable to update.
794 * @param i32 The 32-bit value to assign to *pi32.
795 */
796DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
797{
798 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
799}
800
801
802/**
803 * Atomically Exchange an unsigned 64-bit value, ordered.
804 *
 * Implementation variants:
 *   - AMD64: the _InterlockedExchange64 intrinsic, or a single XCHG (GNU or
 *     MSC style inline assembly).
 *   - 32-bit x86: a loop around LOCK CMPXCHG8B that is repeated until the
 *     compare-exchange succeeds, with the new value in ECX:EBX and the
 *     current/old value in EDX:EAX.
 *   - ARM with RTASM_ARM64_USE_FEAT_LSE: a single SWPAL, or RTASM_ARM_DMB_SY
 *     followed by SWP when RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB is not
 *     defined.
 *   - Other ARM: RTASM_ARM_DMB_SY plus an exclusive load / store pair
 *     (LDAXR/STLXR on ARM64, LDREXD/STREXD on ARM32), retried from the label
 *     until the store status is zero.
 *
805 * @returns The value *pu64 held before the exchange.
806 * @param pu64 Pointer to the 64-bit variable to update.
807 * @param u64 The 64-bit value to assign to *pu64.
808 *
809 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
810 */
/* Only the prototype is declared when the implementation is external (and
   intrinsics are not used), or when RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC is
   non-zero. */
811#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
812 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
813RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
814#else
815DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
816{
817# if defined(RT_ARCH_AMD64)
818# if RT_INLINE_ASM_USES_INTRIN
819 return _InterlockedExchange64((__int64 *)pu64, u64);
820
821# elif RT_INLINE_ASM_GNU_STYLE
 /* u64 is both input (tied to output operand 1) and output, so it holds
    the old memory value after the exchange. */
822 __asm__ __volatile__("xchgq %0, %1\n\t"
823 : "=m" (*pu64)
824 , "=r" (u64)
825 : "1" (u64)
826 , "m" (*pu64));
827 return u64;
828# else
829 __asm
830 {
831 mov rdx, [pu64]
832 mov rax, [u64]
833 xchg [rdx], rax
834 mov [u64], rax
835 }
836 return u64;
837# endif
838
839# elif defined(RT_ARCH_X86)
840# if RT_INLINE_ASM_GNU_STYLE
841# if defined(PIC) || defined(__PIC__)
 /* PIC variant: the low half of the new value is kept in a memory
    temporary and swapped into EBX just for the cmpxchg8b loop; the
    original EBX is moved back afterwards.  Presumably this is because
    EBX is reserved in PIC code - confirm. */
842 uint32_t u32EBX = (uint32_t)u64;
843 __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
844 "xchgl %%ebx, %3\n\t"
845 "1:\n\t"
846 "lock; cmpxchg8b (%5)\n\t"
847 "jnz 1b\n\t"
848 "movl %3, %%ebx\n\t"
849 /*"xchgl %%esi, %5\n\t"*/
850 : "=A" (u64)
851 , "=m" (*pu64)
852 : "0" (*pu64)
853 , "m" ( u32EBX )
854 , "c" ( (uint32_t)(u64 >> 32) )
855 , "S" (pu64)
856 : "cc");
857# else /* !PIC */
 /* Non-PIC variant: the new value is passed directly in EBX (low half)
    and ECX (high half); the output is seeded with the current *pu64. */
858 __asm__ __volatile__("1:\n\t"
859 "lock; cmpxchg8b %1\n\t"
860 "jnz 1b\n\t"
861 : "=A" (u64)
862 , "=m" (*pu64)
863 : "0" (*pu64)
864 , "b" ( (uint32_t)u64 )
865 , "c" ( (uint32_t)(u64 >> 32) )
866 : "cc");
867# endif
868# else
 /* MSC style: ECX:EBX = new value, EDX:EAX = current value read from
    memory; loop on lock cmpxchg8b until it succeeds, then write EDX:EAX
    (the old value) back into u64. */
869 __asm
870 {
871 mov ebx, dword ptr [u64]
872 mov ecx, dword ptr [u64 + 4]
873 mov edi, pu64
874 mov eax, dword ptr [edi]
875 mov edx, dword ptr [edi + 4]
876 retry:
877 lock cmpxchg8b [edi]
878 jnz retry
879 mov dword ptr [u64], eax
880 mov dword ptr [u64 + 4], edx
881 }
882# endif
883 return u64;
884
885# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
 /* Receives the old value of *pu64. */
886 uint64_t uOld;
887# if defined(RTASM_ARM64_USE_FEAT_LSE)
888 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
889 slower if we remove the barrier. But since we have the barrier we
890 shouldn't need that, right? Ordering should be taken care of by the DMB.
891 The SWP is rather cheap (~70% faster). */
892 __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
893# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
894 "swpal %[uNew], %[uOld], %[pMem]\n\t"
895# else
896 RTASM_ARM_DMB_SY
897 "swp %[uNew], %[uOld], %[pMem]\n\t"
898# endif
899 : [pMem] "+Q" (*pu64)
900 , [uOld] "=&r" (uOld)
901 : [uNew] "r" (u64)
902 : );
903# else
 /* rcSpill receives the status of the exclusive store; non-zero restarts
    the sequence at the label. */
904 uint32_t rcSpill;
905 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
906 RTASM_ARM_DMB_SY
907# if defined(RT_ARCH_ARM64)
908 "ldaxr %[uOld], %[pMem]\n\t"
909 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
910 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
911# else
912 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
913 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
914 "cmp %[rc], #0\n\t"
915 "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
916# endif
917 : [pMem] "+Q" (*pu64)
918 , [uOld] "=&r" (uOld)
919 , [rc] "=&r" (rcSpill)
920 : [uNew] "r" (u64)
921 RTASM_ARM_DMB_SY_COMMA_IN_REG
922 : "cc");
923# endif
924 return uOld;
925
926# else
927# error "Port me"
928# endif
929}
930#endif
931
932
933/**
934 * Atomically Exchange an signed 64-bit value, ordered.
935 *
936 * @returns Current *pi64 value
937 * @param pi64 Pointer to the 64-bit variable to update.
938 * @param i64 The 64-bit value to assign to *pi64.
939 */
940DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
941{
942 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
943}
944
945
946/**
947 * Atomically Exchange a size_t value, ordered.
948 *
949 * @returns Current *ppv value
950 * @param puDst Pointer to the size_t variable to update.
951 * @param uNew The new value to assign to *puDst.
952 */
953DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
954{
955#if ARCH_BITS == 16
956 AssertCompile(sizeof(size_t) == 2);
957 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
958#elif ARCH_BITS == 32
959 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
960#elif ARCH_BITS == 64
961 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
962#else
963# error "ARCH_BITS is bogus"
964#endif
965}
966
967
968/**
969 * Atomically Exchange a pointer value, ordered.
970 *
971 * @returns Current *ppv value
972 * @param ppv Pointer to the pointer variable to update.
973 * @param pv The pointer value to assign to *ppv.
974 */
975DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
976{
977#if ARCH_BITS == 32 || ARCH_BITS == 16
978 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
979#elif ARCH_BITS == 64
980 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
981#else
982# error "ARCH_BITS is bogus"
983#endif
984}
985
986
/**
 * Typed convenience wrapper around ASMAtomicXchgPtr that spares the caller
 * the casts.
 *
 * With GCC compatible compilers a statement expression is used so that @a ppv
 * and @a pv are assigned to typed temporaries before being passed on; other
 * compilers get a plain cast-only expansion.
 *
 * @returns The value returned by ASMAtomicXchgPtr, as @a Type.
 * @param   ppv     Pointer to the pointer variable to update.
 * @param   pv      The pointer value to assign to *ppv.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifndef __GNUC__
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    (Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv))
#else /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type const pvTypeChecked = (pv); \
        Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, \
                                                                      (void *)pvTypeChecked); \
        pvTypeCheckedRet; \
    })
#endif
1008
1009
1010/**
1011 * Atomically Exchange a raw-mode context pointer value, ordered.
1012 *
1013 * @returns Current *ppv value
1014 * @param ppvRC Pointer to the pointer variable to update.
1015 * @param pvRC The pointer value to assign to *ppv.
1016 */
1017DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1018{
1019 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1020}
1021
1022
1023/**
1024 * Atomically Exchange a ring-0 pointer value, ordered.
1025 *
1026 * @returns Current *ppv value
1027 * @param ppvR0 Pointer to the pointer variable to update.
1028 * @param pvR0 The pointer value to assign to *ppv.
1029 */
1030DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1031{
1032#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1033 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1034#elif R0_ARCH_BITS == 64
1035 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1036#else
1037# error "R0_ARCH_BITS is bogus"
1038#endif
1039}
1040
1041
1042/**
1043 * Atomically Exchange a ring-3 pointer value, ordered.
1044 *
1045 * @returns Current *ppv value
1046 * @param ppvR3 Pointer to the pointer variable to update.
1047 * @param pvR3 The pointer value to assign to *ppv.
1048 */
1049DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1050{
1051#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1052 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1053#elif R3_ARCH_BITS == 64
1054 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1055#else
1056# error "R3_ARCH_BITS is bogus"
1057#endif
1058}
1059
1060
1061/** @def ASMAtomicXchgHandle
1062 * Atomically Exchange a typical IPRT handle value, ordered.
1063 *
1064 * @param ph Pointer to the value to update.
1065 * @param hNew The new value to assigned to *pu.
1066 * @param phRes Where to store the current *ph value.
1067 *
1068 * @remarks This doesn't currently work for all handles (like RTFILE).
1069 */
1070#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1071# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1072 do { \
1073 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1074 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
1075 *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
1076 } while (0)
1077#elif HC_ARCH_BITS == 64
1078# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1079 do { \
1080 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1081 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
1082 *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
1083 } while (0)
1084#else
1085# error HC_ARCH_BITS
1086#endif
1087
1088
/**
 * Atomically exchanges a value whose size might differ between platforms or
 * compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit exchange function.
 * Any other size triggers an assertion and leaves *pu untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @todo    This is busted as it is missing the result argument; use
 *          ASMAtomicXchgSizeCorrect when the previous value is needed.
 */
#define ASMAtomicXchgSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
            case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: \
                AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                break; \
        } \
    } while (0)
1107
/**
 * Atomically exchanges a value whose size might differ between platforms or
 * compilers, ordered, and passes back the result of the exchange.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit exchange function and
 * stores its return value through @a puRes using the same width.  Any other
 * size triggers an assertion and leaves both *pu and *puRes untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @param   puRes   Where to store the value returned by the exchange.
 */
#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: \
                AssertMsgFailed(("ASMAtomicXchgSizeCorrect: size %d is not supported\n", (int)sizeof(*(pu)))); \
                break; \
        } \
    } while (0)
1126
1127
1128
1129/**
1130 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
1131 *
1132 * @returns true if xchg was done.
1133 * @returns false if xchg wasn't done.
1134 *
1135 * @param pu8 Pointer to the value to update.
1136 * @param u8New The new value to assigned to *pu8.
1137 * @param u8Old The old value to *pu8 compare with.
1138 *
1139 * @remarks x86: Requires a 486 or later.
1140 * @todo Rename ASMAtomicCmpWriteU8
1141 */
1142#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || !RT_INLINE_ASM_GNU_STYLE
1143RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
1144#else
1145DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
1146{
1147# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1148 uint8_t u8Ret;
1149 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1150 "setz %1\n\t"
1151 : "=m" (*pu8)
1152 , "=qm" (u8Ret)
1153 , "=a" (u8Old)
1154 : "q" (u8New)
1155 , "2" (u8Old)
1156 , "m" (*pu8)
1157 : "cc");
1158 return (bool)u8Ret;
1159
1160# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1161 union { uint32_t u; bool f; } fXchg;
1162 uint32_t u32Spill;
1163# if defined(RTASM_ARM64_USE_FEAT_LSE)
1164 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
1165# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
1166 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1167# else
1168 RTASM_ARM_DMB_SY
1169 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1170# endif
1171 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1172 "cset %w[fXchg], eq\n\t"
1173 : [pMem] "+Q" (*pu8)
1174 , [uOldActual] "=&r" (u32Spill)
1175 , [fXchg] "=&r" (fXchg.u)
1176 : [uNew] "r" ((uint32_t)u8New)
1177 , [uOldOrg] "r" ((uint32_t)u8Old)
1178 , "[uOldActual]" ((uint32_t)u8Old)
1179 : "cc");
1180# else
1181 uint32_t rcSpill;
1182 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
1183 RTASM_ARM_DMB_SY
1184# if defined(RT_ARCH_ARM64)
1185 "ldaxrb %w[uOld], %[pMem]\n\t"
1186 "cmp %w[uOld], %w[uCmp]\n\t"
1187 "bne 1f\n\t" /* stop here if not equal */
1188 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1189 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1190 "mov %w[fXchg], #1\n\t"
1191 "1:\n\t"
1192 "clrex\n\t"
1193# else
1194 "ldrexb %[uOld], %[pMem]\n\t"
1195 "teq %[uOld], %[uCmp]\n\t"
1196 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1197 "bne 1f\n\t" /* stop here if not equal */
1198 "cmp %[rc], #0\n\t"
1199 "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1200 "mov %[fXchg], #1\n\t"
1201 "1:\n\t"
1202 /** @todo clrexne on armv7? */
1203# endif
1204 : [pMem] "+Q" (*pu8)
1205 , [uOld] "=&r" (u32Spill)
1206 , [rc] "=&r" (rcSpill)
1207 , [fXchg] "=&r" (fXchg.u)
1208 : [uCmp] "r" ((uint32_t)u8Old)
1209 , [uNew] "r" ((uint32_t)u8New)
1210 , "[fXchg]" (0)
1211 RTASM_ARM_DMB_SY_COMMA_IN_REG
1212 : "cc");
1213# endif
1214 return fXchg.f;
1215
1216# else
1217# error "Port me"
1218# endif
1219}
1220#endif
1221
1222
1223/**
1224 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1225 *
1226 * @returns true if xchg was done.
1227 * @returns false if xchg wasn't done.
1228 *
1229 * @param pi8 Pointer to the value to update.
1230 * @param i8New The new value to assigned to *pi8.
1231 * @param i8Old The old value to *pi8 compare with.
1232 *
1233 * @remarks x86: Requires a 486 or later.
1234 * @todo Rename ASMAtomicCmpWriteS8
1235 */
1236DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1237{
1238 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1239}
1240
1241
1242/**
1243 * Atomically Compare and Exchange a bool value, ordered.
1244 *
1245 * @returns true if xchg was done.
1246 * @returns false if xchg wasn't done.
1247 *
1248 * @param pf Pointer to the value to update.
1249 * @param fNew The new value to assigned to *pf.
1250 * @param fOld The old value to *pf compare with.
1251 *
1252 * @remarks x86: Requires a 486 or later.
1253 * @todo Rename ASMAtomicCmpWriteBool
1254 */
1255DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1256{
1257 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1258}
1259
1260
1261/**
1262 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
1263 *
1264 * @returns true if xchg was done.
1265 * @returns false if xchg wasn't done.
1266 *
1267 * @param pu32 Pointer to the value to update.
1268 * @param u32New The new value to assigned to *pu32.
1269 * @param u32Old The old value to *pu32 compare with.
1270 *
1271 * @remarks x86: Requires a 486 or later.
1272 * @todo Rename ASMAtomicCmpWriteU32
1273 */
1274#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1275RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
1276#else
1277DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
1278{
1279# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1280# if RT_INLINE_ASM_GNU_STYLE
1281 uint8_t u8Ret;
1282 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
1283 "setz %1\n\t"
1284 : "=m" (*pu32)
1285 , "=qm" (u8Ret)
1286 , "=a" (u32Old)
1287 : "r" (u32New)
1288 , "2" (u32Old)
1289 , "m" (*pu32)
1290 : "cc");
1291 return (bool)u8Ret;
1292
1293# elif RT_INLINE_ASM_USES_INTRIN
1294 return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;
1295
1296# else
1297 uint32_t u32Ret;
1298 __asm
1299 {
1300# ifdef RT_ARCH_AMD64
1301 mov rdx, [pu32]
1302# else
1303 mov edx, [pu32]
1304# endif
1305 mov eax, [u32Old]
1306 mov ecx, [u32New]
1307# ifdef RT_ARCH_AMD64
1308 lock cmpxchg [rdx], ecx
1309# else
1310 lock cmpxchg [edx], ecx
1311# endif
1312 setz al
1313 movzx eax, al
1314 mov [u32Ret], eax
1315 }
1316 return !!u32Ret;
1317# endif
1318
1319# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1320 union { uint32_t u; bool f; } fXchg;
1321 uint32_t u32Spill;
1322 /* M1 bench: match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
1323 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
1324# if defined(RTASM_ARM64_USE_FEAT_LSE)
1325 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
1326# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1327 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
1328# else
1329 RTASM_ARM_DMB_SY
1330 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
1331# endif
1332 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1333 "cset %w[fXchg], eq\n\t"
1334 : [pMem] "+Q" (*pu32)
1335 , [uOldActual] "=&r" (u32Spill)
1336 , [fXchg] "=&r" (fXchg.u)
1337 : [uNew] "r" (u32New)
1338 , [uOldOrg] "r" (u32Old)
1339 , "[uOldActual]" (u32Old)
1340 : "cc");
1341# else
1342 uint32_t rcSpill;
1343 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
1344 RTASM_ARM_DMB_SY
1345# if defined(RT_ARCH_ARM64)
1346 "ldaxr %w[uOld], %[pMem]\n\t"
1347 "cmp %w[uOld], %w[uCmp]\n\t"
1348 "bne 1f\n\t" /* stop here if not equal */
1349 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
1350 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1351 "mov %w[fXchg], #1\n\t"
1352 "1:\n\t"
1353 "clrex\n\t"
1354# else
1355 "ldrex %[uOld], %[pMem]\n\t"
1356 "teq %[uOld], %[uCmp]\n\t"
1357 "strexeq %[rc], %[uNew], %[pMem]\n\t"
1358 "bne 1f\n\t" /* stop here if not equal */
1359 "cmp %[rc], #0\n\t"
1360 "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1361 "mov %[fXchg], #1\n\t"
1362 "1:\n\t"
1363 /** @todo clrexne on armv7? */
1364# endif
1365 : [pMem] "+Q" (*pu32)
1366 , [uOld] "=&r" (u32Spill)
1367 , [rc] "=&r" (rcSpill)
1368 , [fXchg] "=&r" (fXchg.u)
1369 : [uCmp] "r" (u32Old)
1370 , [uNew] "r" (u32New)
1371 , "[fXchg]" (0)
1372 RTASM_ARM_DMB_SY_COMMA_IN_REG
1373 : "cc");
1374# endif
1375 return fXchg.f;
1376
1377# else
1378# error "Port me"
1379# endif
1380}
1381#endif
1382
1383
1384/**
1385 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1386 *
1387 * @returns true if xchg was done.
1388 * @returns false if xchg wasn't done.
1389 *
1390 * @param pi32 Pointer to the value to update.
1391 * @param i32New The new value to assigned to *pi32.
1392 * @param i32Old The old value to *pi32 compare with.
1393 *
1394 * @remarks x86: Requires a 486 or later.
1395 * @todo Rename ASMAtomicCmpWriteS32
1396 */
1397DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1398{
1399 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1400}
1401
1402
1403/**
1404 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1405 *
1406 * @returns true if xchg was done.
1407 * @returns false if xchg wasn't done.
1408 *
1409 * @param pu64 Pointer to the 64-bit variable to update.
1410 * @param u64New The 64-bit value to assign to *pu64.
1411 * @param u64Old The value to compare with.
1412 *
1413 * @remarks x86: Requires a Pentium or later.
1414 * @todo Rename ASMAtomicCmpWriteU64
1415 */
1416#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1417 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1418RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1419#else
1420DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1421{
1422# if RT_INLINE_ASM_USES_INTRIN
1423 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1424
1425# elif defined(RT_ARCH_AMD64)
1426# if RT_INLINE_ASM_GNU_STYLE
1427 uint8_t u8Ret;
1428 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1429 "setz %1\n\t"
1430 : "=m" (*pu64)
1431 , "=qm" (u8Ret)
1432 , "=a" (u64Old)
1433 : "r" (u64New)
1434 , "2" (u64Old)
1435 , "m" (*pu64)
1436 : "cc");
1437 return (bool)u8Ret;
1438# else
1439 bool fRet;
1440 __asm
1441 {
1442 mov rdx, [pu32]
1443 mov rax, [u64Old]
1444 mov rcx, [u64New]
1445 lock cmpxchg [rdx], rcx
1446 setz al
1447 mov [fRet], al
1448 }
1449 return fRet;
1450# endif
1451
1452# elif defined(RT_ARCH_X86)
1453 uint32_t u32Ret;
1454# if RT_INLINE_ASM_GNU_STYLE
1455# if defined(PIC) || defined(__PIC__)
1456 uint32_t u32EBX = (uint32_t)u64New;
1457 uint32_t u32Spill;
1458 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1459 "lock; cmpxchg8b (%6)\n\t"
1460 "setz %%al\n\t"
1461 "movl %4, %%ebx\n\t"
1462 "movzbl %%al, %%eax\n\t"
1463 : "=a" (u32Ret)
1464 , "=d" (u32Spill)
1465# if RT_GNUC_PREREQ(4, 3)
1466 , "+m" (*pu64)
1467# else
1468 , "=m" (*pu64)
1469# endif
1470 : "A" (u64Old)
1471 , "m" ( u32EBX )
1472 , "c" ( (uint32_t)(u64New >> 32) )
1473 , "S" (pu64)
1474 : "cc");
1475# else /* !PIC */
1476 uint32_t u32Spill;
1477 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1478 "setz %%al\n\t"
1479 "movzbl %%al, %%eax\n\t"
1480 : "=a" (u32Ret)
1481 , "=d" (u32Spill)
1482 , "+m" (*pu64)
1483 : "A" (u64Old)
1484 , "b" ( (uint32_t)u64New )
1485 , "c" ( (uint32_t)(u64New >> 32) )
1486 : "cc");
1487# endif
1488 return (bool)u32Ret;
1489# else
1490 __asm
1491 {
1492 mov ebx, dword ptr [u64New]
1493 mov ecx, dword ptr [u64New + 4]
1494 mov edi, [pu64]
1495 mov eax, dword ptr [u64Old]
1496 mov edx, dword ptr [u64Old + 4]
1497 lock cmpxchg8b [edi]
1498 setz al
1499 movzx eax, al
1500 mov dword ptr [u32Ret], eax
1501 }
1502 return !!u32Ret;
1503# endif
1504
1505# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1506 union { uint32_t u; bool f; } fXchg;
1507 uint64_t u64Spill;
1508 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1509 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1510# if defined(RTASM_ARM64_USE_FEAT_LSE)
1511 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1512# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1513 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1514# else
1515 RTASM_ARM_DMB_SY
1516 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1517# endif
1518 "cmp %[uOldActual], %[uOldOrg]\n\t"
1519 "cset %w[fXchg], eq\n\t"
1520 : [pMem] "+Q" (*pu64)
1521 , [uOldActual] "=&r" (u64Spill)
1522 , [fXchg] "=&r" (fXchg.u)
1523 : [uNew] "r" (u64New)
1524 , [uOldOrg] "r" (u64Old)
1525 , "[uOldActual]" (u64Old)
1526 : "cc");
1527# else
1528 uint32_t rcSpill;
1529 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1530 RTASM_ARM_DMB_SY
1531# if defined(RT_ARCH_ARM64)
1532 "ldaxr %[uOld], %[pMem]\n\t"
1533 "cmp %[uOld], %[uCmp]\n\t"
1534 "bne 1f\n\t" /* stop here if not equal */
1535 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1536 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1537 "mov %w[fXchg], #1\n\t"
1538 "1:\n\t"
1539 "clrex\n\t"
1540# else
1541 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1542 "teq %[uOld], %[uCmp]\n\t"
1543 "teqeq %H[uOld], %H[uCmp]\n\t"
1544 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1545 "bne 1f\n\t" /* stop here if not equal */
1546 "cmp %[rc], #0\n\t"
1547 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1548 "mov %[fXchg], #1\n\t"
1549 "1:\n\t"
1550 /** @todo clrexne on armv7? */
1551# endif
1552 : [pMem] "+Q" (*pu64)
1553 , [uOld] "=&r" (u64Spill)
1554 , [rc] "=&r" (rcSpill)
1555 , [fXchg] "=&r" (fXchg.u)
1556 : [uCmp] "r" (u64Old)
1557 , [uNew] "r" (u64New)
1558 , "[fXchg]" (0)
1559 RTASM_ARM_DMB_SY_COMMA_IN_REG
1560 : "cc");
1561# endif
1562 return fXchg.f;
1563
1564# else
1565# error "Port me"
1566# endif
1567}
1568#endif
1569
1570
1571/**
1572 * Atomically Compare and exchange a signed 64-bit value, ordered.
1573 *
1574 * @returns true if xchg was done.
1575 * @returns false if xchg wasn't done.
1576 *
1577 * @param pi64 Pointer to the 64-bit variable to update.
1578 * @param i64 The 64-bit value to assign to *pu64.
1579 * @param i64Old The value to compare with.
1580 *
1581 * @remarks x86: Requires a Pentium or later.
1582 * @todo Rename ASMAtomicCmpWriteS64
1583 */
1584DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1585{
1586 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1587}
1588
1589#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1590
1591/** @def RTASM_HAVE_CMP_WRITE_U128
1592 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
1593 * and ASMAtomicCmpWriteExU128() available. */
1594# define RTASM_HAVE_CMP_WRITE_U128 1
1595
1596
1597/**
1598 * Atomically compare and write an unsigned 128-bit value, ordered.
1599 *
1600 * @returns true if write was done.
1601 * @returns false if write wasn't done.
1602 *
1603 * @param pu128 Pointer to the 128-bit variable to update.
1604 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
1605 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
1606 * @param u64OldHi The high 64-bit of the value to compare with.
1607 * @param u64OldLo The low 64-bit of the value to compare with.
1608 *
1609 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1610 */
1611# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
1612DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1613 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
1614# else
1615DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1616 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
1617{
1618# if RT_INLINE_ASM_USES_INTRIN
1619 __int64 ai64Cmp[2];
1620 ai64Cmp[0] = u64OldLo;
1621 ai64Cmp[1] = u64OldHi;
1622 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;
1623
1624# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1625 return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);
1626
1627# elif defined(RT_ARCH_AMD64)
1628# if RT_INLINE_ASM_GNU_STYLE
1629 uint64_t u64Ret;
1630 uint64_t u64Spill;
1631 __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
1632 "setz %%al\n\t"
1633 "movzbl %%al, %%eax\n\t"
1634 : "=a" (u64Ret)
1635 , "=d" (u64Spill)
1636 , "+m" (*pu128)
1637 : "a" (u64OldLo)
1638 , "d" (u64OldHi)
1639 , "b" (u64NewLo)
1640 , "c" (u64NewHi)
1641 : "cc");
1642
1643 return (bool)u64Ret;
1644# else
1645# error "Port me"
1646# endif
1647# else
1648# error "Port me"
1649# endif
1650}
1651# endif
1652
1653
1654/**
1655 * Atomically compare and write an unsigned 128-bit value, ordered.
1656 *
1657 * @returns true if write was done.
1658 * @returns false if write wasn't done.
1659 *
1660 * @param pu128 Pointer to the 128-bit variable to update.
1661 * @param u128New The 128-bit value to assign to *pu128.
1662 * @param u128Old The value to compare with.
1663 *
1664 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1665 */
1666DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1667{
1668# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1669# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1670 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1671# else
1672 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1673 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1674# endif
1675# else
1676 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1677# endif
1678}
1679
1680
1681/**
1682 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1683 */
1684DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1685 const RTUINT128U u128Old) RT_NOTHROW_DEF
1686{
1687# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1688 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1689# else
1690 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1691# endif
1692}
1693
1694#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1695
1696/**
1697 * Atomically Compare and Exchange a pointer value, ordered.
1698 *
1699 * @returns true if xchg was done.
1700 * @returns false if xchg wasn't done.
1701 *
1702 * @param ppv Pointer to the value to update.
1703 * @param pvNew The new value to assigned to *ppv.
1704 * @param pvOld The old value to *ppv compare with.
1705 *
1706 * @remarks x86: Requires a 486 or later.
1707 * @todo Rename ASMAtomicCmpWritePtrVoid
1708 */
1709DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1710{
1711#if ARCH_BITS == 32 || ARCH_BITS == 16
1712 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1713#elif ARCH_BITS == 64
1714 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1715#else
1716# error "ARCH_BITS is bogus"
1717#endif
1718}
1719
1720
/**
 * Atomically compares and exchanges a pointer value, ordered.
 *
 * Typed front end for ASMAtomicCmpXchgPtrVoid.  With GCC compatible compilers
 * the arguments are first assigned to temporaries declared with the type of
 * *ppv before being cast to void pointers; other compilers get a cast-only
 * expansion.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv     Pointer to the value to update.
 * @param   pvNew   The new value to assign to *ppv.
 * @param   pvOld   The old value to compare *ppv with.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWritePtr
 */
#ifndef __GNUC__
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
#else
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked   = (ppv); \
        __typeof__(*(ppv)) const            pvNewTypeChecked = (pvNew); \
        __typeof__(*(ppv)) const            pvOldTypeChecked = (pvOld); \
        bool fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
                                                 (void *)pvNewTypeChecked, \
                                                 (void *)pvOldTypeChecked); \
        fMacroRet; \
    })
#endif
1750
1751
1752/** @def ASMAtomicCmpXchgHandle
1753 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
1754 *
1755 * @param ph Pointer to the value to update.
1756 * @param hNew The new value to assigned to *pu.
1757 * @param hOld The old value to *pu compare with.
1758 * @param fRc Where to store the result.
1759 *
1760 * @remarks This doesn't currently work for all handles (like RTFILE).
1761 * @remarks x86: Requires a 486 or later.
1762 * @todo Rename ASMAtomicCmpWriteHandle
1763 */
1764#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1765# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1766 do { \
1767 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1768 (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
1769 } while (0)
1770#elif HC_ARCH_BITS == 64
1771# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1772 do { \
1773 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1774 (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
1775 } while (0)
1776#else
1777# error HC_ARCH_BITS
1778#endif
1779
1780
/** @def ASMAtomicCmpXchgSize
 * Atomically compares and exchanges a value whose size might differ between
 * platforms or compilers, ordered.
 *
 * Only 4 and 8 byte objects are handled; any other size triggers an
 * assertion and sets @a fRc to false.
 *
 * @param   pu      Pointer to the value to update.
 * @param   uNew    The new value to assign to *pu.
 * @param   uOld    The old value to compare *pu with.
 * @param   fRc     Where to store the result.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteSize
 */
#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
                break; \
            default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                (fRc) = false; \
                break; \
        } \
    } while (0)
1805
1806
1807/**
1808 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1809 * back old value, ordered.
1810 *
1811 * @returns true if xchg was done.
1812 * @returns false if xchg wasn't done.
1813 *
1814 * @param pu8 Pointer to the value to update.
1815 * @param u8New The new value to assigned to *pu32.
1816 * @param u8Old The old value to *pu8 compare with.
1817 * @param pu8Old Pointer store the old value at.
1818 *
1819 * @remarks x86: Requires a 486 or later.
1820 */
1821#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1822RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1823#else
1824DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1825{
1826# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1827# if RT_INLINE_ASM_GNU_STYLE
1828 uint8_t u8Ret;
1829 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1830 "setz %1\n\t"
1831 : "=m" (*pu8)
1832 , "=qm" (u8Ret)
1833 , "=a" (*pu8Old)
1834# if defined(RT_ARCH_X86)
1835 : "q" (u8New)
1836# else
1837 : "r" (u8New)
1838# endif
1839 , "a" (u8Old)
1840 , "m" (*pu8)
1841 : "cc");
1842 return (bool)u8Ret;
1843
1844# elif RT_INLINE_ASM_USES_INTRIN
1845 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1846
1847# else
1848 uint8_t u8Ret;
1849 __asm
1850 {
1851# ifdef RT_ARCH_AMD64
1852 mov rdx, [pu8]
1853# else
1854 mov edx, [pu8]
1855# endif
1856 mov eax, [u8Old]
1857 mov ecx, [u8New]
1858# ifdef RT_ARCH_AMD64
1859 lock cmpxchg [rdx], ecx
1860 mov rdx, [pu8Old]
1861 mov [rdx], eax
1862# else
1863 lock cmpxchg [edx], ecx
1864 mov edx, [pu8Old]
1865 mov [edx], eax
1866# endif
1867 setz al
1868 movzx eax, al
1869 mov [u8Ret], eax
1870 }
1871 return !!u8Ret;
1872# endif
1873
1874# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1875 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
1876 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
1877# if defined(RTASM_ARM64_USE_FEAT_LSE)
1878 union { uint32_t u; bool f; } fXchg;
1879 uint32_t u32Actual;
1880 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
1881# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1882 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1883# else
1884 RTASM_ARM_DMB_SY
1885 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1886# endif
1887 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1888 "cset %w[fXchg], eq\n\t"
1889 : [pMem] "+Q" (*pu8)
1890 , [uOldActual] "=&r" (u32Actual)
1891 , [fXchg] "=&r" (fXchg.u)
1892 : [uNew] "r" ((uint32_t)u8New)
1893 , [uOldOrg] "r" ((uint32_t)u8Old)
1894 , "[uOldActual]" ((uint32_t)u8Old)
1895 : "cc");
1896 *pu8Old = (uint8_t)u32Actual;
1897# else
1898 union { uint8_t u; bool f; } fXchg;
1899 uint8_t u8ActualOld;
1900 uint8_t rcSpill;
1901 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
1902 RTASM_ARM_DMB_SY
1903# if defined(RT_ARCH_ARM64)
1904 "ldaxrb %w[uOld], %[pMem]\n\t"
1905 "cmp %w[uOld], %w[uCmp]\n\t"
1906 "bne 1f\n\t" /* stop here if not equal */
1907 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1908 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1909 "mov %w[fXchg], #1\n\t"
1910 "1:\n\t"
1911 "clrex\n\t"
1912# else
1913 "ldrexb %[uOld], %[pMem]\n\t"
1914 "teq %[uOld], %[uCmp]\n\t"
1915 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1916 "bne 1f\n\t" /* stop here if not equal */
1917 "cmp %[rc], #0\n\t"
1918 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1919 "mov %[fXchg], #1\n\t"
1920 "1:\n\t"
1921 /** @todo clrexne on armv7? */
1922# endif
1923 : [pMem] "+Q" (*pu8)
1924 , [uOld] "=&r" (u8ActualOld)
1925 , [rc] "=&r" (rcSpill)
1926 , [fXchg] "=&r" (fXchg.u)
1927 : [uCmp] "r" (u8Old)
1928 , [uNew] "r" (u8New)
1929 , "[fXchg]" (0)
1930 RTASM_ARM_DMB_SY_COMMA_IN_REG
1931 : "cc");
1932 *pu8Old = u8ActualOld;
1933# endif
1934 return fXchg.f;
1935
1936# else
1937# error "Port me"
1938# endif
1939}
1940#endif
1941
1942
1943/**
1944 * Atomically Compare and Exchange a signed 8-bit value, additionally
1945 * passes back old value, ordered.
1946 *
1947 * @returns true if xchg was done.
1948 * @returns false if xchg wasn't done.
1949 *
1950 * @param pi8 Pointer to the value to update.
1951 * @param i8New The new value to assigned to *pi8.
1952 * @param i8Old The old value to *pi8 compare with.
1953 * @param pi8Old Pointer store the old value at.
1954 *
1955 * @remarks x86: Requires a 486 or later.
1956 */
1957DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
1958{
1959 return ASMAtomicCmpXchgExU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old, (uint8_t RT_FAR *)pi8Old);
1960}
1961
1962
1963/**
1964 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
1965 * back old value, ordered.
1966 *
1967 * @returns true if xchg was done.
1968 * @returns false if xchg wasn't done.
1969 *
1970 * @param pu16 Pointer to the value to update.
1971 * @param u16New The new value to assigned to *pu16.
1972 * @param u16Old The old value to *pu32 compare with.
1973 * @param pu16Old Pointer store the old value at.
1974 *
1975 * @remarks x86: Requires a 486 or later.
1976 */
1977#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1978RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
1979#else
1980DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
1981{
1982# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1983# if RT_INLINE_ASM_GNU_STYLE
1984 uint8_t u8Ret;
1985 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
1986 "setz %1\n\t"
1987 : "=m" (*pu16)
1988 , "=qm" (u8Ret)
1989 , "=a" (*pu16Old)
1990 : "r" (u16New)
1991 , "a" (u16Old)
1992 , "m" (*pu16)
1993 : "cc");
1994 return (bool)u8Ret;
1995
1996# elif RT_INLINE_ASM_USES_INTRIN
1997 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
1998
1999# else
2000 uint16_t u16Ret;
2001 __asm
2002 {
2003# ifdef RT_ARCH_AMD64
2004 mov rdx, [pu16]
2005# else
2006 mov edx, [pu16]
2007# endif
2008 mov eax, [u16Old]
2009 mov ecx, [u16New]
2010# ifdef RT_ARCH_AMD64
2011 lock cmpxchg [rdx], ecx
2012 mov rdx, [pu16Old]
2013 mov [rdx], eax
2014# else
2015 lock cmpxchg [edx], ecx
2016 mov edx, [pu16Old]
2017 mov [edx], eax
2018# endif
2019 setz al
2020 movzx eax, al
2021 mov [u16Ret], eax
2022 }
2023 return !!u16Ret;
2024# endif
2025
2026# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2027 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2028 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2029# if defined(RTASM_ARM64_USE_FEAT_LSE)
2030 union { uint32_t u; bool f; } fXchg;
2031 uint32_t u32Actual;
2032 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2033# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2034 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2035# else
2036 RTASM_ARM_DMB_SY
2037 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2038# endif
2039 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2040 "cset %w[fXchg], eq\n\t"
2041 : [pMem] "+Q" (*pu16)
2042 , [uOldActual] "=&r" (u32Actual)
2043 , [fXchg] "=&r" (fXchg.u)
2044 : [uNew] "r" ((uint32_t)u16New)
2045 , [uOldOrg] "r" ((uint32_t)u16Old)
2046 , "[uOldActual]" ((uint32_t)u16Old)
2047 : "cc");
2048 *pu16Old = (uint16_t)u32Actual;
2049# else
2050 union { uint16_t u; bool f; } fXchg;
2051 uint16_t u16ActualOld;
2052 uint16_t rcSpill;
2053 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2054 RTASM_ARM_DMB_SY
2055# if defined(RT_ARCH_ARM64)
2056 "ldaxrh %w[uOld], %[pMem]\n\t"
2057 "cmp %w[uOld], %w[uCmp]\n\t"
2058 "bne 1f\n\t" /* stop here if not equal */
2059 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2060 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2061 "mov %w[fXchg], #1\n\t"
2062 "1:\n\t"
2063 "clrex\n\t"
2064# else
2065 "ldrexh %[uOld], %[pMem]\n\t"
2066 "teq %[uOld], %[uCmp]\n\t"
2067 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2068 "bne 1f\n\t" /* stop here if not equal */
2069 "cmp %[rc], #0\n\t"
2070 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2071 "mov %[fXchg], #1\n\t"
2072 "1:\n\t"
2073 /** @todo clrexne on armv7? */
2074# endif
2075 : [pMem] "+Q" (*pu16)
2076 , [uOld] "=&r" (u16ActualOld)
2077 , [rc] "=&r" (rcSpill)
2078 , [fXchg] "=&r" (fXchg.u)
2079 : [uCmp] "r" (u16Old)
2080 , [uNew] "r" (u16New)
2081 , "[fXchg]" (0)
2082 RTASM_ARM_DMB_SY_COMMA_IN_REG
2083 : "cc");
2084 *pu16Old = u16ActualOld;
2085# endif
2086 return fXchg.f;
2087
2088# else
2089# error "Port me"
2090# endif
2091}
2092#endif
2093
2094
2095/**
2096 * Atomically Compare and Exchange a signed 16-bit value, additionally
2097 * passes back old value, ordered.
2098 *
2099 * @returns true if xchg was done.
2100 * @returns false if xchg wasn't done.
2101 *
2102 * @param pi16 Pointer to the value to update.
2103 * @param i16New The new value to assigned to *pi16.
2104 * @param i16Old The old value to *pi16 compare with.
2105 * @param pi16Old Pointer store the old value at.
2106 *
2107 * @remarks x86: Requires a 486 or later.
2108 */
2109DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2110{
2111 return ASMAtomicCmpXchgExU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16New, (uint16_t)i16Old, (uint16_t RT_FAR *)pi16Old);
2112}
2113
2114
2115/**
2116 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2117 * passes back old value, ordered.
2118 *
2119 * @returns true if xchg was done.
2120 * @returns false if xchg wasn't done.
2121 *
2122 * @param pu32 Pointer to the value to update.
2123 * @param u32New The new value to assigned to *pu32.
2124 * @param u32Old The old value to *pu32 compare with.
2125 * @param pu32Old Pointer store the old value at.
2126 *
2127 * @remarks x86: Requires a 486 or later.
2128 */
2129#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2130RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2131#else
2132DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2133{
2134# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2135# if RT_INLINE_ASM_GNU_STYLE
2136 uint8_t u8Ret;
2137 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2138 "setz %1\n\t"
2139 : "=m" (*pu32)
2140 , "=qm" (u8Ret)
2141 , "=a" (*pu32Old)
2142 : "r" (u32New)
2143 , "a" (u32Old)
2144 , "m" (*pu32)
2145 : "cc");
2146 return (bool)u8Ret;
2147
2148# elif RT_INLINE_ASM_USES_INTRIN
2149 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2150
2151# else
2152 uint32_t u32Ret;
2153 __asm
2154 {
2155# ifdef RT_ARCH_AMD64
2156 mov rdx, [pu32]
2157# else
2158 mov edx, [pu32]
2159# endif
2160 mov eax, [u32Old]
2161 mov ecx, [u32New]
2162# ifdef RT_ARCH_AMD64
2163 lock cmpxchg [rdx], ecx
2164 mov rdx, [pu32Old]
2165 mov [rdx], eax
2166# else
2167 lock cmpxchg [edx], ecx
2168 mov edx, [pu32Old]
2169 mov [edx], eax
2170# endif
2171 setz al
2172 movzx eax, al
2173 mov [u32Ret], eax
2174 }
2175 return !!u32Ret;
2176# endif
2177
2178# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2179 union { uint32_t u; bool f; } fXchg;
2180 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2181 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2182# if defined(RTASM_ARM64_USE_FEAT_LSE)
2183 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2184# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2185 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2186# else
2187 RTASM_ARM_DMB_SY
2188 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2189# endif
2190 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2191 "cset %w[fXchg], eq\n\t"
2192 : [pMem] "+Q" (*pu32)
2193 , [uOldActual] "=&r" (*pu32Old)
2194 , [fXchg] "=&r" (fXchg.u)
2195 : [uNew] "r" (u32New)
2196 , [uOldOrg] "r" (u32Old)
2197 , "[uOldActual]" (u32Old)
2198 : "cc");
2199# else
2200 uint32_t u32ActualOld;
2201 uint32_t rcSpill;
2202 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2203 RTASM_ARM_DMB_SY
2204# if defined(RT_ARCH_ARM64)
2205 "ldaxr %w[uOld], %[pMem]\n\t"
2206 "cmp %w[uOld], %w[uCmp]\n\t"
2207 "bne 1f\n\t" /* stop here if not equal */
2208 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2209 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2210 "mov %w[fXchg], #1\n\t"
2211 "1:\n\t"
2212 "clrex\n\t"
2213# else
2214 "ldrex %[uOld], %[pMem]\n\t"
2215 "teq %[uOld], %[uCmp]\n\t"
2216 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2217 "bne 1f\n\t" /* stop here if not equal */
2218 "cmp %[rc], #0\n\t"
2219 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2220 "mov %[fXchg], #1\n\t"
2221 "1:\n\t"
2222 /** @todo clrexne on armv7? */
2223# endif
2224 : [pMem] "+Q" (*pu32)
2225 , [uOld] "=&r" (u32ActualOld)
2226 , [rc] "=&r" (rcSpill)
2227 , [fXchg] "=&r" (fXchg.u)
2228 : [uCmp] "r" (u32Old)
2229 , [uNew] "r" (u32New)
2230 , "[fXchg]" (0)
2231 RTASM_ARM_DMB_SY_COMMA_IN_REG
2232 : "cc");
2233 *pu32Old = u32ActualOld;
2234# endif
2235 return fXchg.f;
2236
2237# else
2238# error "Port me"
2239# endif
2240}
2241#endif
2242
2243
2244/**
2245 * Atomically Compare and Exchange a signed 32-bit value, additionally
2246 * passes back old value, ordered.
2247 *
2248 * @returns true if xchg was done.
2249 * @returns false if xchg wasn't done.
2250 *
2251 * @param pi32 Pointer to the value to update.
2252 * @param i32New The new value to assigned to *pi32.
2253 * @param i32Old The old value to *pi32 compare with.
2254 * @param pi32Old Pointer store the old value at.
2255 *
2256 * @remarks x86: Requires a 486 or later.
2257 */
2258DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2259{
2260 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old, (uint32_t RT_FAR *)pi32Old);
2261}
2262
2263
2264/**
2265 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2266 * passing back old value, ordered.
2267 *
2268 * @returns true if xchg was done.
2269 * @returns false if xchg wasn't done.
2270 *
2271 * @param pu64 Pointer to the 64-bit variable to update.
2272 * @param u64New The 64-bit value to assign to *pu64.
2273 * @param u64Old The value to compare with.
2274 * @param pu64Old Pointer store the old value at.
2275 *
2276 * @remarks x86: Requires a Pentium or later.
2277 */
2278#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2279 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2280RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2281#else
2282DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2283{
2284# if RT_INLINE_ASM_USES_INTRIN
2285 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2286
2287# elif defined(RT_ARCH_AMD64)
2288# if RT_INLINE_ASM_GNU_STYLE
2289 uint8_t u8Ret;
2290 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2291 "setz %1\n\t"
2292 : "=m" (*pu64)
2293 , "=qm" (u8Ret)
2294 , "=a" (*pu64Old)
2295 : "r" (u64New)
2296 , "a" (u64Old)
2297 , "m" (*pu64)
2298 : "cc");
2299 return (bool)u8Ret;
2300# else
2301 bool fRet;
2302 __asm
2303 {
2304 mov rdx, [pu32]
2305 mov rax, [u64Old]
2306 mov rcx, [u64New]
2307 lock cmpxchg [rdx], rcx
2308 mov rdx, [pu64Old]
2309 mov [rdx], rax
2310 setz al
2311 mov [fRet], al
2312 }
2313 return fRet;
2314# endif
2315
2316# elif defined(RT_ARCH_X86)
2317# if RT_INLINE_ASM_GNU_STYLE
2318 uint64_t u64Ret;
2319# if defined(PIC) || defined(__PIC__)
2320 /* Note #1: This code uses a memory clobber description, because the clean
2321 solution with an output value for *pu64 makes gcc run out of
2322 registers. This will cause suboptimal code, and anyone with a
2323 better solution is welcome to improve this.
2324
2325 Note #2: We must prevent gcc from encoding the memory access, as it
2326 may go via the GOT if we're working on a global variable (like
2327 in the testcase). Thus we request a register (%3) and
2328 dereference it ourselves. */
2329 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2330 "lock; cmpxchg8b (%3)\n\t"
2331 "xchgl %%ebx, %1\n\t"
2332 : "=A" (u64Ret)
2333 : "DS" ((uint32_t)u64New)
2334 , "c" ((uint32_t)(u64New >> 32))
2335 , "r" (pu64) /* Do not use "m" here*/
2336 , "0" (u64Old)
2337 : "memory"
2338 , "cc" );
2339# else /* !PIC */
2340 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2341 : "=A" (u64Ret)
2342 , "=m" (*pu64)
2343 : "b" ((uint32_t)u64New)
2344 , "c" ((uint32_t)(u64New >> 32))
2345 , "m" (*pu64)
2346 , "0" (u64Old)
2347 : "cc");
2348# endif
2349 *pu64Old = u64Ret;
2350 return u64Ret == u64Old;
2351# else
2352 uint32_t u32Ret;
2353 __asm
2354 {
2355 mov ebx, dword ptr [u64New]
2356 mov ecx, dword ptr [u64New + 4]
2357 mov edi, [pu64]
2358 mov eax, dword ptr [u64Old]
2359 mov edx, dword ptr [u64Old + 4]
2360 lock cmpxchg8b [edi]
2361 mov ebx, [pu64Old]
2362 mov [ebx], eax
2363 setz al
2364 movzx eax, al
2365 add ebx, 4
2366 mov [ebx], edx
2367 mov dword ptr [u32Ret], eax
2368 }
2369 return !!u32Ret;
2370# endif
2371
2372# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2373 union { uint32_t u; bool f; } fXchg;
2374 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2375 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2376# if defined(RTASM_ARM64_USE_FEAT_LSE)
2377 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2378# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2379 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2380# else
2381 RTASM_ARM_DMB_SY
2382 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2383# endif
2384 "cmp %[uOldActual], %[uOldOrg]\n\t"
2385 "cset %w[fXchg], eq\n\t"
2386 : [pMem] "+Q" (*pu64)
2387 , [uOldActual] "=&r" (*pu64Old)
2388 , [fXchg] "=&r" (fXchg.u)
2389 : [uNew] "r" (u64New)
2390 , [uOldOrg] "r" (u64Old)
2391 , "[uOldActual]" (u64Old)
2392 : "cc");
2393# else
2394 uint64_t u64ActualOld;
2395 uint32_t rcSpill;
2396 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2397 RTASM_ARM_DMB_SY
2398# if defined(RT_ARCH_ARM64)
2399 "ldaxr %[uOld], %[pMem]\n\t"
2400 "cmp %[uOld], %[uCmp]\n\t"
2401 "bne 1f\n\t" /* stop here if not equal */
2402 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2403 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2404 "mov %w[fXchg], #1\n\t"
2405 "1:\n\t"
2406 "clrex\n\t"
2407# else
2408 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2409 "teq %[uOld], %[uCmp]\n\t"
2410 "teqeq %H[uOld], %H[uCmp]\n\t"
2411 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2412 "bne 1f\n\t" /* stop here if not equal */
2413 "cmp %[rc], #0\n\t"
2414 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2415 "mov %[fXchg], #1\n\t"
2416 "1:\n\t"
2417 /** @todo clrexne on armv7? */
2418# endif
2419 : [pMem] "+Q" (*pu64)
2420 , [uOld] "=&r" (u64ActualOld)
2421 , [rc] "=&r" (rcSpill)
2422 , [fXchg] "=&r" (fXchg.u)
2423 : [uCmp] "r" (u64Old)
2424 , [uNew] "r" (u64New)
2425 , "[fXchg]" (0)
2426 RTASM_ARM_DMB_SY_COMMA_IN_REG
2427 : "cc");
2428 *pu64Old = u64ActualOld;
2429# endif
2430 return fXchg.f;
2431
2432# else
2433# error "Port me"
2434# endif
2435}
2436#endif
2437
2438
2439/**
2440 * Atomically Compare and exchange a signed 64-bit value, additionally
2441 * passing back old value, ordered.
2442 *
2443 * @returns true if xchg was done.
2444 * @returns false if xchg wasn't done.
2445 *
2446 * @param pi64 Pointer to the 64-bit variable to update.
2447 * @param i64 The 64-bit value to assign to *pu64.
2448 * @param i64Old The value to compare with.
2449 * @param pi64Old Pointer store the old value at.
2450 *
2451 * @remarks x86: Requires a Pentium or later.
2452 */
2453DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2454{
2455 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old, (uint64_t RT_FAR *)pi64Old);
2456}
2457
2458#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2459
/** @def RTASM_HAVE_CMP_XCHG_U128
 * Indicates that we've got ASMAtomicCmpXchgU128(), ASMAtomicCmpXchgU128v2()
 * and ASMAtomicCmpXchgU128U() available. */
2463# define RTASM_HAVE_CMP_XCHG_U128 1
2464
2465
2466/**
2467 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2468 *
2469 * @returns true if exchange was done.
2470 * @returns false if exchange wasn't done.
2471 *
2472 * @param pu128 Pointer to the 128-bit variable to update.
2473 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2474 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2475 * @param u64OldHi The high 64-bit of the value to compare with.
2476 * @param u64OldLo The low 64-bit of the value to compare with.
2477 * @param pu128Old Where to return the old value.
2478 *
2479 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2480 */
2481# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2482DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2483 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2484# else
2485DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2486 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2487{
2488# if RT_INLINE_ASM_USES_INTRIN
2489 pu128Old->Hi = u64OldHi;
2490 pu128Old->Lo = u64OldLo;
2491 AssertCompileMemberOffset(uint128_t, Lo, 0);
2492 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2493
2494# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2495 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2496 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2497 *pu128Old = uOld;
2498 return uCmp == uOld;
2499
2500# elif defined(RT_ARCH_AMD64)
2501# if RT_INLINE_ASM_GNU_STYLE
2502 uint8_t bRet;
2503 uint64_t u64RetHi, u64RetLo;
2504 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2505 "setz %b0\n\t"
2506 : "=r" (bRet)
2507 , "=a" (u64RetLo)
2508 , "=d" (u64RetHi)
2509 , "+m" (*pu128)
2510 : "a" (u64OldLo)
2511 , "d" (u64OldHi)
2512 , "b" (u64NewLo)
2513 , "c" (u64NewHi)
2514 : "cc");
2515 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2516 return (bool)bRet;
2517# else
2518# error "Port me"
2519# endif
2520# else
2521# error "Port me"
2522# endif
2523}
2524# endif
2525
2526
2527/**
2528 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2529 *
2530 * @returns true if exchange was done.
2531 * @returns false if exchange wasn't done.
2532 *
2533 * @param pu128 Pointer to the 128-bit variable to update.
2534 * @param u128New The 128-bit value to assign to *pu128.
2535 * @param u128Old The value to compare with.
2536 * @param pu128Old Where to return the old value.
2537 *
2538 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2539 */
2540DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2541 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2542{
2543# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2544# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2545 uint128_t const uSwapped = __sync_val_compare_and_swap(pu128, u128Old, u128New);
2546 *pu128Old = uSwapped;
2547 return uSwapped == u128Old;
2548# else
2549 return ASMAtomicCmpXchgU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
2550 (uint64_t)(u128Old >> 64), (uint64_t)u128Old, pu128Old);
2551# endif
2552# else
2553 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2554# endif
2555}
2556
2557
2558/**
2559 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2560 */
2561DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2562 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2563{
2564# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2565 return ASMAtomicCmpXchgU128(&pu128->u, u128New.u, u128Old.u, &pu128Old->u);
2566# else
2567 return ASMAtomicCmpXchgU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, &pu128Old->u);
2568# endif
2569}
2570
2571#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2572
2573
2574
2575/** @def ASMAtomicCmpXchgExHandle
2576 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2577 *
2578 * @param ph Pointer to the value to update.
2579 * @param hNew The new value to assigned to *pu.
2580 * @param hOld The old value to *pu compare with.
2581 * @param fRc Where to store the result.
2582 * @param phOldVal Pointer to where to store the old value.
2583 *
2584 * @remarks This doesn't currently work for all handles (like RTFILE).
2585 */
2586#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
2587# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2588 do { \
2589 AssertCompile(sizeof(*ph) == sizeof(uint32_t)); \
2590 AssertCompile(sizeof(*phOldVal) == sizeof(uint32_t)); \
2591 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2592 } while (0)
2593#elif HC_ARCH_BITS == 64
2594# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2595 do { \
2596 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2597 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2598 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2599 } while (0)
2600#else
2601# error HC_ARCH_BITS
2602#endif
2603
2604
/** @def ASMAtomicCmpXchgExSize
 * Atomically Compare and Exchange a value which size might differ
 * between platforms or compilers. Additionally passes back old value.
 *
 * Dispatches on sizeof(*(pu)) to ASMAtomicCmpXchgExU32() or
 * ASMAtomicCmpXchgExU64().  Any other size asserts, sets @a fRc to false and
 * zeroes *puOldVal.
 *
 * @param   pu          Pointer to the value to update.
 * @param   uNew        The new value to assign to *pu.
 * @param   uOld        The old value to compare *pu with.
 * @param   fRc         Where to store the result.
 * @param   puOldVal    Pointer to where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(puOldVal)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(puOldVal)); \
                break; \
            default: AssertMsgFailed(("ASMAtomicCmpXchgExSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                (fRc) = false; \
                *(puOldVal) = 0; \
                break; \
        } \
    } while (0)
2630
2631
2632/**
2633 * Atomically Compare and Exchange a pointer value, additionally
2634 * passing back old value, ordered.
2635 *
2636 * @returns true if xchg was done.
2637 * @returns false if xchg wasn't done.
2638 *
2639 * @param ppv Pointer to the value to update.
2640 * @param pvNew The new value to assigned to *ppv.
2641 * @param pvOld The old value to *ppv compare with.
2642 * @param ppvOld Pointer store the old value at.
2643 *
2644 * @remarks x86: Requires a 486 or later.
2645 */
2646DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2647 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2648{
2649#if ARCH_BITS == 32 || ARCH_BITS == 16
2650 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld, (uint32_t RT_FAR *)ppvOld);
2651#elif ARCH_BITS == 64
2652 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld, (uint64_t RT_FAR *)ppvOld);
2653#else
2654# error "ARCH_BITS is bogus"
2655#endif
2656}
2657
2658
/**
 * Ordered atomic compare-and-exchange of a pointer value which additionally
 * passes back the old value; typed front end for ASMAtomicCmpXchgExPtrVoid().
 *
 * With GCC the arguments are first assigned to temporaries typed after
 * *(ppv), so mismatching pointer types get diagnosed by the compiler.  Other
 * compilers just cast everything to void pointers.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv         Pointer to the value to update.
 * @param   pvNew       The new value to assign to *ppv.
 * @param   pvOld       The value *ppv is compared with.
 * @param   ppvOld      Where to store the old value.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 */
#ifdef __GNUC__
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvCheckedDst    = (ppv); \
        __typeof__(*(ppv)) const            pvCheckedNew     = (pvNew); \
        __typeof__(*(ppv)) const            pvCheckedOld     = (pvOld); \
        __typeof__(*(ppv)) * const          ppvCheckedOldOut = (ppvOld); \
        ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvCheckedDst, \
                                  (void *)pvCheckedNew, \
                                  (void *)pvCheckedOld, \
                                  (void **)ppvCheckedOldOut); \
    })
#else
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld), (void RT_FAR * RT_FAR *)(ppvOld))
#endif
2691
2692
2693/**
2694 * Virtualization unfriendly serializing instruction, always exits.
2695 */
2696#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2697RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
2698#else
2699DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
2700{
2701# if RT_INLINE_ASM_GNU_STYLE
2702 RTCCUINTREG xAX = 0;
2703# ifdef RT_ARCH_AMD64
2704 __asm__ __volatile__ ("cpuid"
2705 : "=a" (xAX)
2706 : "0" (xAX)
2707 : "rbx", "rcx", "rdx", "memory");
2708# elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
2709 __asm__ __volatile__ ("push %%ebx\n\t"
2710 "cpuid\n\t"
2711 "pop %%ebx\n\t"
2712 : "=a" (xAX)
2713 : "0" (xAX)
2714 : "ecx", "edx", "memory");
2715# else
2716 __asm__ __volatile__ ("cpuid"
2717 : "=a" (xAX)
2718 : "0" (xAX)
2719 : "ebx", "ecx", "edx", "memory");
2720# endif
2721
2722# elif RT_INLINE_ASM_USES_INTRIN
2723 int aInfo[4];
2724 _ReadWriteBarrier();
2725 __cpuid(aInfo, 0);
2726
2727# else
2728 __asm
2729 {
2730 push ebx
2731 xor eax, eax
2732 cpuid
2733 pop ebx
2734 }
2735# endif
2736}
2737#endif
2738
2739/**
2740 * Virtualization friendly serializing instruction, though more expensive.
2741 */
2742#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2743RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
2744#else
2745DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
2746{
2747# if RT_INLINE_ASM_GNU_STYLE
2748# ifdef RT_ARCH_AMD64
2749 __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
2750 "subq $128, %%rsp\n\t" /*redzone*/
2751 "mov %%ss, %%eax\n\t"
2752 "pushq %%rax\n\t"
2753 "pushq %%r10\n\t"
2754 "pushfq\n\t"
2755 "movl %%cs, %%eax\n\t"
2756 "pushq %%rax\n\t"
2757 "leaq 1f(%%rip), %%rax\n\t"
2758 "pushq %%rax\n\t"
2759 "iretq\n\t"
2760 "1:\n\t"
2761 ::: "rax", "r10", "memory", "cc");
2762# else
2763 __asm__ __volatile__ ("pushfl\n\t"
2764 "pushl %%cs\n\t"
2765 "pushl $1f\n\t"
2766 "iretl\n\t"
2767 "1:\n\t"
2768 ::: "memory");
2769# endif
2770
2771# else
2772 __asm
2773 {
2774 pushfd
2775 push cs
2776 push la_ret
2777 iretd
2778 la_ret:
2779 }
2780# endif
2781}
2782#endif
2783
2784/**
2785 * Virtualization friendlier serializing instruction, may still cause exits.
2786 */
2787#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2788RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
2789#else
2790DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
2791{
2792# if RT_INLINE_ASM_GNU_STYLE
2793 /* rdtscp is not supported by ancient linux build VM of course :-( */
2794# ifdef RT_ARCH_AMD64
2795 /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx, "rcx"); */
2796 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
2797# else
2798 /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx, "ecx"); */
2799 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
2800# endif
2801# else
2802# if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
2803 uint32_t uIgnore;
2804 _ReadWriteBarrier();
2805 (void)__rdtscp(&uIgnore);
2806 (void)uIgnore;
2807# else
2808 __asm
2809 {
2810 rdtscp
2811 }
2812# endif
2813# endif
2814}
2815#endif
2816
2817
2818/**
2819 * Serialize Instruction (both data store and instruction flush).
2820 */
2821#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
2822# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
2823#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
2824# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
2825#elif defined(RT_ARCH_SPARC64)
2826RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
2827#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2828DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
2829{
2830 __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
2831}
2832#else
2833# error "Port me"
2834#endif
2835
2836
2837/**
2838 * Memory fence, waits for any pending writes and reads to complete.
2839 * @note No implicit compiler barrier (which is probably stupid).
2840 */
2841DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
2842{
2843#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2844# if RT_INLINE_ASM_GNU_STYLE
2845 __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
2846# elif RT_INLINE_ASM_USES_INTRIN
2847 _mm_mfence();
2848# else
2849 __asm
2850 {
2851 _emit 0x0f
2852 _emit 0xae
2853 _emit 0xf0
2854 }
2855# endif
2856#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2857 __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
2858#elif ARCH_BITS == 16
2859 uint16_t volatile u16;
2860 ASMAtomicXchgU16(&u16, 0);
2861#else
2862 uint32_t volatile u32;
2863 ASMAtomicXchgU32(&u32, 0);
2864#endif
2865}
2866
2867
2868/**
2869 * Write fence, waits for any pending writes to complete.
2870 * @note No implicit compiler barrier (which is probably stupid).
2871 */
2872DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
2873{
2874#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2875# if RT_INLINE_ASM_GNU_STYLE
2876 __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
2877# elif RT_INLINE_ASM_USES_INTRIN
2878 _mm_sfence();
2879# else
2880 __asm
2881 {
2882 _emit 0x0f
2883 _emit 0xae
2884 _emit 0xf8
2885 }
2886# endif
2887#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2888 __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
2889#else
2890 ASMMemoryFence();
2891#endif
2892}
2893
2894
2895/**
2896 * Read fence, waits for any pending reads to complete.
2897 * @note No implicit compiler barrier (which is probably stupid).
2898 */
2899DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
2900{
2901#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2902# if RT_INLINE_ASM_GNU_STYLE
2903 __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
2904# elif RT_INLINE_ASM_USES_INTRIN
2905 _mm_lfence();
2906# else
2907 __asm
2908 {
2909 _emit 0x0f
2910 _emit 0xae
2911 _emit 0xe8
2912 }
2913# endif
2914#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2915 __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
2916#else
2917 ASMMemoryFence();
2918#endif
2919}
2920
2921
2922/**
2923 * Atomically reads an unsigned 8-bit value, ordered.
2924 *
2925 * @returns Current *pu8 value
2926 * @param pu8 Pointer to the 8-bit variable to read.
2927 */
2928DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2929{
2930#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2931 uint32_t u32;
2932# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
2933 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2934 RTASM_ARM_DMB_SY
2935 "casab %w[uDst], wzr, %[pMem]\n\t"
2936 : [uDst] "=&r" (u32)
2937 : [pMem] "Q" (*pu8),
2938 "0" (0)
2939 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2940# else
2941 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2942 RTASM_ARM_DMB_SY
2943# if defined(RT_ARCH_ARM64)
2944# if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
2945 "ldurb %w[uDst], %[pMem]\n\t"
2946# else
2947 "ldxrb %w[uDst], %[pMem]\n\t"
2948 "clrex\n\t"
2949# endif
2950# else
2951 "ldrexb %[uDst], %[pMem]\n\t"
2952 /** @todo clrex */
2953# endif
2954 : [uDst] "=&r" (u32)
2955 : [pMem] "Q" (*pu8)
2956 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2957# endif
2958 return (uint8_t)u32;
2959#else
2960 ASMMemoryFence();
2961 return *pu8; /* byte reads are atomic on x86 */
2962#endif
2963}
2964
2965
2966/**
2967 * Atomically reads an unsigned 8-bit value, unordered.
2968 *
2969 * @returns Current *pu8 value
2970 * @param pu8 Pointer to the 8-bit variable to read.
2971 */
2972DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2973{
2974#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2975 uint32_t u32;
2976 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
2977# if defined(RT_ARCH_ARM64)
2978 "ldurb %w[uDst], %[pMem]\n\t"
2979# else
2980 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
2981# endif
2982 : [uDst] "=&r" (u32)
2983 : [pMem] "Q" (*pu8));
2984 return (uint8_t)u32;
2985#else
2986 return *pu8; /* byte reads are atomic on x86 */
2987#endif
2988}
2989
2990
2991/**
2992 * Atomically reads a signed 8-bit value, ordered.
2993 *
2994 * @returns Current *pi8 value
2995 * @param pi8 Pointer to the 8-bit variable to read.
2996 */
2997DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
2998{
2999#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3000 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3001#else
3002 ASMMemoryFence();
3003 return *pi8; /* byte reads are atomic on x86 */
3004#endif
3005}
3006
3007
3008/**
3009 * Atomically reads a signed 8-bit value, unordered.
3010 *
3011 * @returns Current *pi8 value
3012 * @param pi8 Pointer to the 8-bit variable to read.
3013 */
3014DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3015{
3016#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3017 int32_t i32;
3018 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
3019# if defined(RT_ARCH_ARM64)
3020 "ldurb %w[iDst], %[pMem]\n\t"
3021# else
3022 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
3023# endif
3024 : [iDst] "=&r" (i32)
3025 : [pMem] "Q" (*pi8));
3026 return (int8_t)i32;
3027#else
3028 return *pi8; /* byte reads are atomic on x86 */
3029#endif
3030}
3031
3032
3033/**
3034 * Atomically reads an unsigned 16-bit value, ordered.
3035 *
3036 * @returns Current *pu16 value
3037 * @param pu16 Pointer to the 16-bit variable to read.
3038 */
3039DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3040{
3041 Assert(!((uintptr_t)pu16 & 1));
3042#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3043 uint32_t u32;
3044# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3045 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3046 RTASM_ARM_DMB_SY
3047 "casah %w[uDst], wzr, %[pMem]\n\t"
3048 : [uDst] "=&r" (u32)
3049 : [pMem] "Q" (*pu16),
3050 "0" (0)
3051 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3052# else
3053 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3054 RTASM_ARM_DMB_SY
3055# if defined(RT_ARCH_ARM64)
3056# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3057 "ldurh %w[uDst], %[pMem]\n\t"
3058# else
3059 "ldxrh %w[uDst], %[pMem]\n\t"
3060 "clrex\n\t"
3061# endif
3062# else
3063 "ldrexh %[uDst], %[pMem]\n\t"
3064 /** @todo clrex */
3065# endif
3066 : [uDst] "=&r" (u32)
3067 : [pMem] "Q" (*pu16)
3068 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3069# endif
3070 return (uint16_t)u32;
3071#else
3072 ASMMemoryFence();
3073 return *pu16;
3074#endif
3075}
3076
3077
3078/**
3079 * Atomically reads an unsigned 16-bit value, unordered.
3080 *
3081 * @returns Current *pu16 value
3082 * @param pu16 Pointer to the 16-bit variable to read.
3083 */
3084DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3085{
3086 Assert(!((uintptr_t)pu16 & 1));
3087#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3088 uint32_t u32;
3089 __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
3090# if defined(RT_ARCH_ARM64)
3091 "ldurh %w[uDst], %[pMem]\n\t"
3092# else
3093 "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
3094# endif
3095 : [uDst] "=&r" (u32)
3096 : [pMem] "Q" (*pu16));
3097 return (uint16_t)u32;
3098#else
3099 return *pu16;
3100#endif
3101}
3102
3103
3104/**
3105 * Atomically reads a signed 16-bit value, ordered.
3106 *
3107 * @returns Current *pi16 value
3108 * @param pi16 Pointer to the 16-bit variable to read.
3109 */
3110DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3111{
3112 Assert(!((uintptr_t)pi16 & 1));
3113#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3114 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3115#else
3116 ASMMemoryFence();
3117 return *pi16;
3118#endif
3119}
3120
3121
3122/**
3123 * Atomically reads a signed 16-bit value, unordered.
3124 *
3125 * @returns Current *pi16 value
3126 * @param pi16 Pointer to the 16-bit variable to read.
3127 */
3128DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3129{
3130 Assert(!((uintptr_t)pi16 & 1));
3131#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3132 int32_t i32;
3133 __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
3134# if defined(RT_ARCH_ARM64)
3135 "ldurh %w[iDst], %[pMem]\n\t"
3136# else
3137 "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
3138# endif
3139 : [iDst] "=&r" (i32)
3140 : [pMem] "Q" (*pi16));
3141 return (int16_t)i32;
3142#else
3143 return *pi16;
3144#endif
3145}
3146
3147
3148/**
3149 * Atomically reads an unsigned 32-bit value, ordered.
3150 *
3151 * @returns Current *pu32 value
3152 * @param pu32 Pointer to the 32-bit variable to read.
3153 */
3154DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3155{
3156 Assert(!((uintptr_t)pu32 & 3));
3157#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3158 uint32_t u32;
3159# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3160 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3161 RTASM_ARM_DMB_SY
3162 "casa %w[uDst], wzr, %[pMem]\n\t"
3163 : [uDst] "=&r" (u32)
3164 : [pMem] "Q" (*pu32),
3165 "0" (0)
3166 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3167# else
3168 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3169 RTASM_ARM_DMB_SY
3170# if defined(RT_ARCH_ARM64)
3171# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3172 "ldur %w[uDst], %[pMem]\n\t"
3173# else
3174 "ldxr %w[uDst], %[pMem]\n\t"
3175 "clrex\n\t"
3176# endif
3177# else
3178 "ldrex %[uDst], %[pMem]\n\t"
3179 /** @todo clrex */
3180# endif
3181 : [uDst] "=&r" (u32)
3182 : [pMem] "Q" (*pu32)
3183 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3184# endif
3185 return u32;
3186#else
3187 ASMMemoryFence();
3188# if ARCH_BITS == 16
3189 AssertFailed(); /** @todo 16-bit */
3190# endif
3191 return *pu32;
3192#endif
3193}
3194
3195
3196/**
3197 * Atomically reads an unsigned 32-bit value, unordered.
3198 *
3199 * @returns Current *pu32 value
3200 * @param pu32 Pointer to the 32-bit variable to read.
3201 */
3202DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3203{
3204 Assert(!((uintptr_t)pu32 & 3));
3205#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3206 uint32_t u32;
3207 __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
3208# if defined(RT_ARCH_ARM64)
3209 "ldur %w[uDst], %[pMem]\n\t"
3210# else
3211 "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
3212# endif
3213 : [uDst] "=&r" (u32)
3214 : [pMem] "Q" (*pu32));
3215 return u32;
3216#else
3217# if ARCH_BITS == 16
3218 AssertFailed(); /** @todo 16-bit */
3219# endif
3220 return *pu32;
3221#endif
3222}
3223
3224
3225/**
3226 * Atomically reads a signed 32-bit value, ordered.
3227 *
3228 * @returns Current *pi32 value
3229 * @param pi32 Pointer to the 32-bit variable to read.
3230 */
3231DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3232{
3233 Assert(!((uintptr_t)pi32 & 3));
3234#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3235 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3236#else
3237 ASMMemoryFence();
3238# if ARCH_BITS == 16
3239 AssertFailed(); /** @todo 16-bit */
3240# endif
3241 return *pi32;
3242#endif
3243}
3244
3245
3246/**
3247 * Atomically reads a signed 32-bit value, unordered.
3248 *
3249 * @returns Current *pi32 value
3250 * @param pi32 Pointer to the 32-bit variable to read.
3251 */
3252DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3253{
3254 Assert(!((uintptr_t)pi32 & 3));
3255#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3256 int32_t i32;
3257 __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
3258# if defined(RT_ARCH_ARM64)
3259 "ldur %w[iDst], %[pMem]\n\t"
3260# else
3261 "ldrex %[iDst], %[pMem]\n\t" /** @todo thix this */
3262# endif
3263 : [iDst] "=&r" (i32)
3264 : [pMem] "Q" (*pi32));
3265 return i32;
3266
3267#else
3268# if ARCH_BITS == 16
3269 AssertFailed(); /** @todo 16-bit */
3270# endif
3271 return *pi32;
3272#endif
3273}
3274
3275
3276/**
3277 * Atomically reads an unsigned 64-bit value, ordered.
3278 *
3279 * @returns Current *pu64 value
3280 * @param pu64 Pointer to the 64-bit variable to read.
3281 * The memory pointed to must be writable.
3282 *
3283 * @remarks This may fault if the memory is read-only!
3284 * @remarks x86: Requires a Pentium or later.
3285 */
3286#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
3287 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
3288RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3289#else
3290DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3291{
3292 uint64_t u64;
3293# ifdef RT_ARCH_AMD64
3294 Assert(!((uintptr_t)pu64 & 7));
3295/*# if RT_INLINE_ASM_GNU_STYLE
3296 __asm__ __volatile__( "mfence\n\t"
3297 "movq %1, %0\n\t"
3298 : "=r" (u64)
3299 : "m" (*pu64));
3300# else
3301 __asm
3302 {
3303 mfence
3304 mov rdx, [pu64]
3305 mov rax, [rdx]
3306 mov [u64], rax
3307 }
3308# endif*/
3309 ASMMemoryFence();
3310 u64 = *pu64;
3311
3312# elif defined(RT_ARCH_X86)
3313# if RT_INLINE_ASM_GNU_STYLE
3314# if defined(PIC) || defined(__PIC__)
3315 uint32_t u32EBX = 0;
3316 Assert(!((uintptr_t)pu64 & 7));
3317 __asm__ __volatile__("xchgl %%ebx, %3\n\t"
3318 "lock; cmpxchg8b (%5)\n\t"
3319 "movl %3, %%ebx\n\t"
3320 : "=A" (u64)
3321# if RT_GNUC_PREREQ(4, 3)
3322 , "+m" (*pu64)
3323# else
3324 , "=m" (*pu64)
3325# endif
3326 : "0" (0ULL)
3327 , "m" (u32EBX)
3328 , "c" (0)
3329 , "S" (pu64)
3330 : "cc");
3331# else /* !PIC */
3332 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3333 : "=A" (u64)
3334 , "+m" (*pu64)
3335 : "0" (0ULL)
3336 , "b" (0)
3337 , "c" (0)
3338 : "cc");
3339# endif
3340# else
3341 Assert(!((uintptr_t)pu64 & 7));
3342 __asm
3343 {
3344 xor eax, eax
3345 xor edx, edx
3346 mov edi, pu64
3347 xor ecx, ecx
3348 xor ebx, ebx
3349 lock cmpxchg8b [edi]
3350 mov dword ptr [u64], eax
3351 mov dword ptr [u64 + 4], edx
3352 }
3353# endif
3354
3355# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3356 Assert(!((uintptr_t)pu64 & 7));
3357
3358# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3359 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3360 RTASM_ARM_DMB_SY
3361 "casa %[uDst], xzr, %[pMem]\n\t"
3362 : [uDst] "=&r" (u64)
3363 : [pMem] "Q" (*pu64),
3364 "0" (0)
3365 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3366# else
3367 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3368 RTASM_ARM_DMB_SY
3369# if defined(RT_ARCH_ARM64)
3370# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3371 "ldur %[uDst], %[pMem]\n\t"
3372# else
3373 "ldxr %[uDst], %[pMem]\n\t"
3374 "clrex\n\t"
3375# endif
3376# else
3377 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
3378 /** @todo clrex */
3379# endif
3380 : [uDst] "=&r" (u64)
3381 : [pMem] "Q" (*pu64)
3382 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3383# endif
3384# else
3385# error "Port me"
3386# endif
3387 return u64;
3388}
3389#endif
3390
3391
3392/**
3393 * Atomically reads an unsigned 64-bit value, unordered.
3394 *
3395 * @returns Current *pu64 value
3396 * @param pu64 Pointer to the 64-bit variable to read.
3397 * The memory pointed to must be writable.
3398 *
3399 * @remarks This may fault if the memory is read-only!
3400 * @remarks x86: Requires a Pentium or later.
3401 */
3402#if !defined(RT_ARCH_AMD64) \
3403 && ( (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
3404 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
3405RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3406#else
3407DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3408{
3409 uint64_t u64;
3410# ifdef RT_ARCH_AMD64
3411 Assert(!((uintptr_t)pu64 & 7));
3412/*# if RT_INLINE_ASM_GNU_STYLE
3413 Assert(!((uintptr_t)pu64 & 7));
3414 __asm__ __volatile__("movq %1, %0\n\t"
3415 : "=r" (u64)
3416 : "m" (*pu64));
3417# else
3418 __asm
3419 {
3420 mov rdx, [pu64]
3421 mov rax, [rdx]
3422 mov [u64], rax
3423 }
3424# endif */
3425 u64 = *pu64;
3426
3427# elif defined(RT_ARCH_X86)
3428# if RT_INLINE_ASM_GNU_STYLE
3429# if defined(PIC) || defined(__PIC__)
3430 uint32_t u32EBX = 0;
3431 uint32_t u32Spill;
3432 Assert(!((uintptr_t)pu64 & 7));
3433 __asm__ __volatile__("xor %%eax,%%eax\n\t"
3434 "xor %%ecx,%%ecx\n\t"
3435 "xor %%edx,%%edx\n\t"
3436 "xchgl %%ebx, %3\n\t"
3437 "lock; cmpxchg8b (%4)\n\t"
3438 "movl %3, %%ebx\n\t"
3439 : "=A" (u64)
3440# if RT_GNUC_PREREQ(4, 3)
3441 , "+m" (*pu64)
3442# else
3443 , "=m" (*pu64)
3444# endif
3445 , "=c" (u32Spill)
3446 : "m" (u32EBX)
3447 , "S" (pu64)
3448 : "cc");
3449# else /* !PIC */
3450 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3451 : "=A" (u64)
3452 , "+m" (*pu64)
3453 : "0" (0ULL)
3454 , "b" (0)
3455 , "c" (0)
3456 : "cc");
3457# endif
3458# else
3459 Assert(!((uintptr_t)pu64 & 7));
3460 __asm
3461 {
3462 xor eax, eax
3463 xor edx, edx
3464 mov edi, pu64
3465 xor ecx, ecx
3466 xor ebx, ebx
3467 lock cmpxchg8b [edi]
3468 mov dword ptr [u64], eax
3469 mov dword ptr [u64 + 4], edx
3470 }
3471# endif
3472
3473# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3474 Assert(!((uintptr_t)pu64 & 7));
3475 __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
3476# if defined(RT_ARCH_ARM64)
3477 "ldur %[uDst], %[pMem]\n\t"
3478# else
3479 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
3480 /** @todo clrex? */
3481# endif
3482 : [uDst] "=&r" (u64)
3483 : [pMem] "Q" (*pu64));
3484
3485# else
3486# error "Port me"
3487# endif
3488 return u64;
3489}
3490#endif
3491
3492
3493/**
3494 * Atomically reads a signed 64-bit value, ordered.
3495 *
3496 * @returns Current *pi64 value
3497 * @param pi64 Pointer to the 64-bit variable to read.
3498 * The memory pointed to must be writable.
3499 *
3500 * @remarks This may fault if the memory is read-only!
3501 * @remarks x86: Requires a Pentium or later.
3502 */
3503DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3504{
3505 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3506}
3507
3508
3509/**
3510 * Atomically reads a signed 64-bit value, unordered.
3511 *
3512 * @returns Current *pi64 value
3513 * @param pi64 Pointer to the 64-bit variable to read.
3514 * The memory pointed to must be writable.
3515 *
3516 * @remarks This will fault if the memory is read-only!
3517 * @remarks x86: Requires a Pentium or later.
3518 */
3519DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3520{
3521 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3522}
3523
3524
3525/**
3526 * Atomically reads a size_t value, ordered.
3527 *
3528 * @returns Current *pcb value
3529 * @param pcb Pointer to the size_t variable to read.
3530 */
3531DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3532{
3533#if ARCH_BITS == 64
3534 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3535#elif ARCH_BITS == 32
3536 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3537#elif ARCH_BITS == 16
3538 AssertCompileSize(size_t, 2);
3539 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3540#else
3541# error "Unsupported ARCH_BITS value"
3542#endif
3543}
3544
3545
3546/**
3547 * Atomically reads a size_t value, unordered.
3548 *
3549 * @returns Current *pcb value
3550 * @param pcb Pointer to the size_t variable to read.
3551 */
3552DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3553{
3554#if ARCH_BITS == 64 || ARCH_BITS == 16
3555 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3556#elif ARCH_BITS == 32
3557 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3558#elif ARCH_BITS == 16
3559 AssertCompileSize(size_t, 2);
3560 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3561#else
3562# error "Unsupported ARCH_BITS value"
3563#endif
3564}
3565
3566
3567/**
3568 * Atomically reads a pointer value, ordered.
3569 *
3570 * @returns Current *pv value
3571 * @param ppv Pointer to the pointer variable to read.
3572 *
3573 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
3574 * requires less typing (no casts).
3575 */
3576DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3577{
3578#if ARCH_BITS == 32 || ARCH_BITS == 16
3579 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3580#elif ARCH_BITS == 64
3581 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3582#else
3583# error "ARCH_BITS is bogus"
3584#endif
3585}
3586
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicReadPtr.
 *
 * The GNU C variant is a statement expression that first assigns @a ppv to a
 * typed temporary and then converts the result, so mismatching arguments are
 * diagnosed by the compiler.  The generic variant is a plain cast.
 *
 * @returns Current *pv value
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifndef __GNUC__
# define ASMAtomicReadPtrT(ppv, Type) \
    (Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
#else /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicReadPtrT(ppv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
     })
#endif
3606
3607
3608/**
3609 * Atomically reads a pointer value, unordered.
3610 *
3611 * @returns Current *pv value
3612 * @param ppv Pointer to the pointer variable to read.
3613 *
3614 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
3615 * requires less typing (no casts).
3616 */
3617DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3618{
3619#if ARCH_BITS == 32 || ARCH_BITS == 16
3620 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3621#elif ARCH_BITS == 64
3622 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3623#else
3624# error "ARCH_BITS is bogus"
3625#endif
3626}
3627
3628
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicUoReadPtr.
 *
 * The GNU C variant is a statement expression that first assigns @a ppv to a
 * typed temporary and then converts the result, so mismatching arguments are
 * diagnosed by the compiler.  The generic variant is a plain cast.
 *
 * @returns Current *pv value
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifndef __GNUC__
# define ASMAtomicUoReadPtrT(ppv, Type) \
    (Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
#else /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicUoReadPtrT(ppv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
     })
#endif
3648
3649
3650/**
3651 * Atomically reads a boolean value, ordered.
3652 *
3653 * @returns Current *pf value
3654 * @param pf Pointer to the boolean variable to read.
3655 */
3656DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3657{
3658 ASMMemoryFence();
3659 return *pf; /* byte reads are atomic on x86 */
3660}
3661
3662
3663/**
3664 * Atomically reads a boolean value, unordered.
3665 *
3666 * @returns Current *pf value
3667 * @param pf Pointer to the boolean variable to read.
3668 */
3669DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3670{
3671 return *pf; /* byte reads are atomic on x86 */
3672}
3673
3674
3675/**
3676 * Atomically read a typical IPRT handle value, ordered.
3677 *
3678 * @param ph Pointer to the handle variable to read.
3679 * @param phRes Where to store the result.
3680 *
3681 * @remarks This doesn't currently work for all handles (like RTFILE).
3682 */
3683#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
3684# define ASMAtomicReadHandle(ph, phRes) \
3685 do { \
3686 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
3687 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
3688 *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
3689 } while (0)
3690#elif HC_ARCH_BITS == 64
3691# define ASMAtomicReadHandle(ph, phRes) \
3692 do { \
3693 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
3694 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
3695 *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
3696 } while (0)
3697#else
3698# error HC_ARCH_BITS
3699#endif
3700
3701
3702/**
3703 * Atomically read a typical IPRT handle value, unordered.
3704 *
3705 * @param ph Pointer to the handle variable to read.
3706 * @param phRes Where to store the result.
3707 *
3708 * @remarks This doesn't currently work for all handles (like RTFILE).
3709 */
3710#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
3711# define ASMAtomicUoReadHandle(ph, phRes) \
3712 do { \
3713 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
3714 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
3715 *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
3716 } while (0)
3717#elif HC_ARCH_BITS == 64
3718# define ASMAtomicUoReadHandle(ph, phRes) \
3719 do { \
3720 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
3721 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
3722 *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
3723 } while (0)
3724#else
3725# error HC_ARCH_BITS
3726#endif
3727
3728
/**
 * Atomically read a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the ordered 8, 16, 32 or 64-bit reader and
 * stores the result through @a puRes using the same width.  Any other size
 * triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
3746
3747
/**
 * Atomically read a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Dispatches on sizeof(*(pu)) to the unordered 8, 16, 32 or 64-bit reader and
 * stores the result through @a puRes using the same width.  Any other size
 * triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicUoReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            default: AssertMsgFailed(("ASMAtomicUoReadSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
3765
3766
3767/**
3768 * Atomically writes an unsigned 8-bit value, ordered.
3769 *
3770 * @param pu8 Pointer to the 8-bit variable.
3771 * @param u8 The 8-bit value to assign to *pu8.
3772 */
3773DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3774{
3775#if defined(RT_ARCH_ARM64)
3776 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
3777 as all byte accesses are single-copy atomic, which I think suffices here. */
3778 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
3779# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
3780 RTASM_ARM_DMB_SY
3781 "swpb %w[uValue], wzr, %[pMem]\n\t"
3782# else
3783 RTASM_ARM_DMB_SY
3784 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3785# endif
3786 : [pMem] "+Q" (*pu8)
3787 : [uValue] "r" ((uint32_t)u8)
3788 : );
3789#else
3790 ASMAtomicXchgU8(pu8, u8);
3791#endif
3792}
3793
3794
3795/**
3796 * Atomically writes an unsigned 8-bit value, unordered.
3797 *
3798 * @param pu8 Pointer to the 8-bit variable.
3799 * @param u8 The 8-bit value to assign to *pu8.
3800 */
3801DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3802{
3803 *pu8 = u8; /* byte writes are atomic on x86 */
3804}
3805
3806
3807/**
3808 * Atomically writes a signed 8-bit value, ordered.
3809 *
3810 * @param pi8 Pointer to the 8-bit variable to read.
3811 * @param i8 The 8-bit value to assign to *pi8.
3812 */
3813DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3814{
3815#if defined(RT_ARCH_ARM64)
3816 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
3817#else
3818 ASMAtomicXchgS8(pi8, i8);
3819#endif
3820}
3821
3822
3823/**
3824 * Atomically writes a signed 8-bit value, unordered.
3825 *
3826 * @param pi8 Pointer to the 8-bit variable to write.
3827 * @param i8 The 8-bit value to assign to *pi8.
3828 */
3829DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3830{
3831 *pi8 = i8; /* byte writes are atomic on x86 */
3832}
3833
3834
3835/**
3836 * Atomically writes an unsigned 16-bit value, ordered.
3837 *
3838 * @param pu16 Pointer to the 16-bit variable to write.
3839 * @param u16 The 16-bit value to assign to *pu16.
3840 */
3841DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3842{
3843#if defined(RT_ARCH_ARM64)
3844 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
3845# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3846 RTASM_ARM_DMB_SY
3847 "swph %w[uValue], wzr, %[pMem]\n\t"
3848# else
3849 RTASM_ARM_DMB_SY
3850 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3851# endif
3852 : [pMem] "+Q" (*pu16)
3853 : [uValue] "r" ((uint32_t)u16)
3854 : );
3855#else
3856 ASMAtomicXchgU16(pu16, u16);
3857#endif
3858}
3859
3860
3861/**
3862 * Atomically writes an unsigned 16-bit value, unordered.
3863 *
3864 * @param pu16 Pointer to the 16-bit variable to write.
3865 * @param u16 The 16-bit value to assign to *pu16.
3866 */
3867DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3868{
3869 Assert(!((uintptr_t)pu16 & 1));
3870 *pu16 = u16;
3871}
3872
3873
3874/**
3875 * Atomically writes a signed 16-bit value, ordered.
3876 *
3877 * @param pi16 Pointer to the 16-bit variable to write.
3878 * @param i16 The 16-bit value to assign to *pi16.
3879 */
3880DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
3881{
3882#if defined(RT_ARCH_ARM64)
3883 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
3884#else
3885 ASMAtomicXchgS16(pi16, i16);
3886#endif
3887}
3888
3889
3890/**
3891 * Atomically writes a signed 16-bit value, unordered.
3892 *
3893 * @param pi16 Pointer to the 16-bit variable to write.
3894 * @param i16 The 16-bit value to assign to *pi16.
3895 */
3896DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
3897{
3898 Assert(!((uintptr_t)pi16 & 1));
3899 *pi16 = i16;
3900}
3901
3902
3903/**
3904 * Atomically writes an unsigned 32-bit value, ordered.
3905 *
3906 * @param pu32 Pointer to the 32-bit variable to write.
3907 * @param u32 The 32-bit value to assign to *pu32.
3908 */
3909DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
3910{
3911#if defined(RT_ARCH_ARM64)
3912 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
3913# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3914 RTASM_ARM_DMB_SY
3915 "swp %w[uValue], wzr, %[pMem]\n\t"
3916# else
3917 RTASM_ARM_DMB_SY
3918 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3919# endif
3920 : [pMem] "+Q" (*pu32)
3921 : [uValue] "r" (u32)
3922 : "cc");
3923#else
3924 ASMAtomicXchgU32(pu32, u32);
3925#endif
3926}
3927
3928
3929/**
3930 * Atomically writes an unsigned 32-bit value, unordered.
3931 *
3932 * @param pu32 Pointer to the 32-bit variable to write.
3933 * @param u32 The 32-bit value to assign to *pu32.
3934 */
3935DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
3936{
3937 Assert(!((uintptr_t)pu32 & 3));
3938#if ARCH_BITS >= 32
3939 *pu32 = u32;
3940#else
3941 ASMAtomicXchgU32(pu32, u32);
3942#endif
3943}
3944
3945
3946/**
3947 * Atomically writes a signed 32-bit value, ordered.
3948 *
3949 * @param pi32 Pointer to the 32-bit variable to write.
3950 * @param i32 The 32-bit value to assign to *pi32.
3951 */
3952DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
3953{
3954#if defined(RT_ARCH_ARM64)
3955 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
3956#else
3957 ASMAtomicXchgS32(pi32, i32);
3958#endif
3959}
3960
3961
3962/**
3963 * Atomically writes a signed 32-bit value, unordered.
3964 *
3965 * @param pi32 Pointer to the 32-bit variable to write.
3966 * @param i32 The 32-bit value to assign to *pi32.
3967 */
3968DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
3969{
3970 Assert(!((uintptr_t)pi32 & 3));
3971#if ARCH_BITS >= 32
3972 *pi32 = i32;
3973#else
3974 ASMAtomicXchgS32(pi32, i32);
3975#endif
3976}
3977
3978
3979/**
3980 * Atomically writes an unsigned 64-bit value, ordered.
3981 *
3982 * @param pu64 Pointer to the 64-bit variable to write.
3983 * @param u64 The 64-bit value to assign to *pu64.
3984 */
3985DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
3986{
3987#if defined(RT_ARCH_ARM64)
3988 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
3989# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3990 RTASM_ARM_DMB_SY
3991 "swp %[uValue], xzr, %[pMem]\n\t"
3992# else
3993 RTASM_ARM_DMB_SY /** @todo necessary? */
3994 "stlr %[uValue], %[pMem]\n\t"
3995# endif
3996 : [pMem] "+Q" (*pu64)
3997 : [uValue] "r" (u64)
3998 : );
3999#else
4000 ASMAtomicXchgU64(pu64, u64);
4001#endif
4002}
4003
4004
4005/**
4006 * Atomically writes an unsigned 64-bit value, unordered.
4007 *
4008 * @param pu64 Pointer to the 64-bit variable to write.
4009 * @param u64 The 64-bit value to assign to *pu64.
4010 */
4011DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4012{
4013 Assert(!((uintptr_t)pu64 & 7));
4014#if ARCH_BITS == 64
4015 *pu64 = u64;
4016#else
4017 ASMAtomicXchgU64(pu64, u64);
4018#endif
4019}
4020
4021
4022/**
4023 * Atomically writes a signed 64-bit value, ordered.
4024 *
4025 * @param pi64 Pointer to the 64-bit variable to write.
4026 * @param i64 The 64-bit value to assign to *pi64.
4027 */
4028DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4029{
4030#if defined(RT_ARCH_ARM64)
4031 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4032#else
4033 ASMAtomicXchgS64(pi64, i64);
4034#endif
4035}
4036
4037
4038/**
4039 * Atomically writes a signed 64-bit value, unordered.
4040 *
4041 * @param pi64 Pointer to the 64-bit variable to write.
4042 * @param i64 The 64-bit value to assign to *pi64.
4043 */
4044DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4045{
4046 Assert(!((uintptr_t)pi64 & 7));
4047#if ARCH_BITS == 64
4048 *pi64 = i64;
4049#else
4050 ASMAtomicXchgS64(pi64, i64);
4051#endif
4052}
4053
4054
4055/**
4056 * Atomically writes a size_t value, ordered.
4057 *
4058 * @param pcb Pointer to the size_t variable to write.
4059 * @param cb The value to assign to *pcb.
4060 */
4061DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4062{
4063#if ARCH_BITS == 64
4064 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4065#elif ARCH_BITS == 32
4066 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4067#elif ARCH_BITS == 16
4068 AssertCompileSize(size_t, 2);
4069 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4070#else
4071# error "Unsupported ARCH_BITS value"
4072#endif
4073}
4074
4075
4076/**
4077 * Atomically writes a size_t value, unordered.
4078 *
4079 * @param pcb Pointer to the size_t variable to write.
4080 * @param cb The value to assign to *pcb.
4081 */
4082DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4083{
4084#if ARCH_BITS == 64
4085 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4086#elif ARCH_BITS == 32
4087 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4088#elif ARCH_BITS == 16
4089 AssertCompileSize(size_t, 2);
4090 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4091#else
4092# error "Unsupported ARCH_BITS value"
4093#endif
4094}
4095
4096
4097/**
4098 * Atomically writes a boolean value, unordered.
4099 *
4100 * @param pf Pointer to the boolean variable to write.
4101 * @param f The boolean value to assign to *pf.
4102 */
4103DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4104{
4105 ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
4106}
4107
4108
4109/**
4110 * Atomically writes a boolean value, unordered.
4111 *
4112 * @param pf Pointer to the boolean variable to write.
4113 * @param f The boolean value to assign to *pf.
4114 */
4115DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4116{
4117 *pf = f; /* byte writes are atomic on x86 */
4118}
4119
4120
4121/**
4122 * Atomically writes a pointer value, ordered.
4123 *
4124 * @param ppv Pointer to the pointer variable to write.
4125 * @param pv The pointer value to assign to *ppv.
4126 */
4127DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4128{
4129#if ARCH_BITS == 32 || ARCH_BITS == 16
4130 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4131#elif ARCH_BITS == 64
4132 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4133#else
4134# error "ARCH_BITS is bogus"
4135#endif
4136}
4137
4138
4139/**
4140 * Atomically writes a pointer value, unordered.
4141 *
4142 * @param ppv Pointer to the pointer variable to write.
4143 * @param pv The pointer value to assign to *ppv.
4144 */
4145DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4146{
4147#if ARCH_BITS == 32 || ARCH_BITS == 16
4148 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4149#elif ARCH_BITS == 64
4150 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4151#else
4152# error "ARCH_BITS is bogus"
4153#endif
4154}
4155
4156
/**
 * Atomically writes a pointer value, ordered.
 *
 * Statement macro (do/while); it yields no value.  Compile-time checks verify
 * that both *ppv and pv have pointer size, and a runtime assertion checks that
 * ppv is aligned on an ARCH_BITS / 8 boundary.
 *
 * @param   ppv     Pointer to the pointer variable to write.
 * @param   pv      The pointer value to assign to *ppv. If NULL use
 *                  ASMAtomicWriteNullPtr or you'll land in trouble.
 *
 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
 *          NULL.
 * @remarks The non-GCC variant references @a ppv both in the assertion and in
 *          the call, so avoid arguments with side effects.
 */
#ifdef __GNUC__
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
        __typeof__(*(ppv)) const pvTypeChecked = (pv); \
        \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)ppvTypeChecked & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
    } while (0)
#else
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
    } while (0)
#endif
4191
4192
4193/**
4194 * Atomically sets a pointer to NULL, ordered.
4195 *
4196 * @param ppv Pointer to the pointer variable that should be set to NULL.
4197 *
4198 * @remarks This is relatively type safe on GCC platforms.
4199 */
4200#if RT_GNUC_PREREQ(4, 2)
4201# define ASMAtomicWriteNullPtr(ppv) \
4202 do \
4203 { \
4204 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4205 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4206 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4207 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4208 } while (0)
4209#else
4210# define ASMAtomicWriteNullPtr(ppv) \
4211 do \
4212 { \
4213 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4214 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4215 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4216 } while (0)
4217#endif
4218
4219
4220/**
4221 * Atomically writes a pointer value, unordered.
4222 *
4223 * @returns Current *pv value
4224 * @param ppv Pointer to the pointer variable.
4225 * @param pv The pointer value to assign to *ppv. If NULL use
4226 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4227 *
4228 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4229 * NULL.
4230 */
4231#if RT_GNUC_PREREQ(4, 2)
4232# define ASMAtomicUoWritePtr(ppv, pv) \
4233 do \
4234 { \
4235 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4236 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4237 \
4238 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4239 AssertCompile(sizeof(pv) == sizeof(void *)); \
4240 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4241 \
4242 *(ppvTypeChecked) = pvTypeChecked; \
4243 } while (0)
4244#else
4245# define ASMAtomicUoWritePtr(ppv, pv) \
4246 do \
4247 { \
4248 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4249 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4250 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4251 *(ppv) = pv; \
4252 } while (0)
4253#endif
4254
4255
/**
 * Atomically sets a pointer to NULL, unordered.
 *
 * Statement macro (do/while); it yields no value.  The store is a plain
 * assignment through the pointer.  A compile-time check verifies that *ppv
 * has pointer size, and a runtime assertion checks that ppv is aligned on an
 * ARCH_BITS / 8 boundary.
 *
 * @param   ppv     Pointer to the pointer variable that should be set to NULL.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks The non-GCC variant references @a ppv both in the assertion and in
 *          the assignment, so avoid arguments with side effects.
 */
#ifdef __GNUC__
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        AssertCompile(sizeof(*(ppv)) == sizeof(void *)); \
        Assert(!( (uintptr_t)ppvTypeChecked & ((ARCH_BITS / 8) - 1) )); \
        *(ppvTypeChecked) = NULL; \
    } while (0)
#else
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppv) = NULL; \
    } while (0)
#endif
4281
4282
4283/**
4284 * Atomically write a typical IPRT handle value, ordered.
4285 *
4286 * @param ph Pointer to the variable to update.
4287 * @param hNew The value to assign to *ph.
4288 *
4289 * @remarks This doesn't currently work for all handles (like RTFILE).
4290 */
4291#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4292# define ASMAtomicWriteHandle(ph, hNew) \
4293 do { \
4294 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4295 ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
4296 } while (0)
4297#elif HC_ARCH_BITS == 64
4298# define ASMAtomicWriteHandle(ph, hNew) \
4299 do { \
4300 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4301 ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
4302 } while (0)
4303#else
4304# error HC_ARCH_BITS
4305#endif
4306
4307
4308/**
4309 * Atomically write a typical IPRT handle value, unordered.
4310 *
4311 * @param ph Pointer to the variable to update.
4312 * @param hNew The value to assign to *ph.
4313 *
4314 * @remarks This doesn't currently work for all handles (like RTFILE).
4315 */
4316#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4317# define ASMAtomicUoWriteHandle(ph, hNew) \
4318 do { \
4319 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4320 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4321 } while (0)
4322#elif HC_ARCH_BITS == 64
4323# define ASMAtomicUoWriteHandle(ph, hNew) \
4324 do { \
4325 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4326 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4327 } while (0)
4328#else
4329# error HC_ARCH_BITS
4330#endif
4331
4332
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Statement macro that dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit
 * ordered write.  Any other size triggers AssertMsgFailed; the size is cast
 * to int so that it matches the %d format specifier.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); break; \
        } \
    } while (0)
4350
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Statement macro that dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit
 * unordered write.  Any other size triggers AssertMsgFailed; the message
 * names this macro and the size is cast to int so that it matches the %d
 * format specifier.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicUoWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicUoWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicUoWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); break; \
        } \
    } while (0)
4368
4369
4370
/**
 * Atomically exchanges and adds to a 16-bit value, ordered.
 *
 * Only the prototype is given here; no inline implementation follows it.
 *
 * @returns The old value.
 * @param   pu16        Pointer to the value.
 * @param   u16         Number to add.  Note that the prototype declares this
 *                      parameter as uint32_t even though the target is 16 bits
 *                      wide.
 *
 * @remarks Currently not implemented, just to make 16-bit code happy.
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
4382
4383
/**
 * Atomically exchanges and adds to a 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external function
 * (prototype only) or an inline implementation using, in order of preference:
 * the _InterlockedExchangeAdd intrinsic, an x86/AMD64 "lock xadd" sequence, or
 * ARM code (LSE "ldadd"/"ldaddal" or a load-modify-store helper macro).
 *
 * @returns The old value.
 * @param   pu32        Pointer to the value.
 * @param   u32         Number to add.
 *
 * @remarks x86: Requires a 486 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic's result is stored back into u32 and returned. */
    u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
    return u32;

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* Operand 0 is both the addend going in ("0" ties it to the output) and
       the value coming back out of xadd. */
    __asm__ __volatile__("lock; xaddl %0, %1\n\t"
                         : "=r" (u32)
                         , "=m" (*pu32)
                         : "0" (u32)
                         , "m" (*pu32)
                         : "memory"
                         , "cc");
    return u32;
#  else
    __asm
    {
        mov eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov rdx, [pu32]
        lock xadd [rdx], eax
#   else
        mov edx, [pu32]
        lock xadd [edx], eax
#   endif
        mov [u32], eax
    }
    return u32;
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    uint32_t u32OldRet;
    __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
#   endif
                         : [pMem] "+Q" (*pu32)
                         , [uOldActual] "=&r" (u32OldRet)
                         : [uAddend] "r" (u32)
                         : );
#  else
    /* NOTE(review): u32OldRet is not declared in this branch, so the helper
       macro presumably declares it - confirm against the macro definition. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
                                           "add %w[uNew], %w[uOld], %w[uVal]\n\t",
                                           "add %[uNew], %[uOld], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif
    return u32OldRet;

# else
#  error "Port me"
# endif
}
#endif
4456
4457
4458/**
4459 * Atomically exchanges and adds to a signed 32-bit value, ordered.
4460 *
4461 * @returns The old value.
4462 * @param pi32 Pointer to the value.
4463 * @param i32 Number to add.
4464 *
4465 * @remarks x86: Requires a 486 or later.
4466 */
4467DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4468{
4469 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
4470}
4471
4472
4473/**
4474 * Atomically exchanges and adds to a 64-bit value, ordered.
4475 *
4476 * @returns The old value.
4477 * @param pu64 Pointer to the value.
4478 * @param u64 Number to add.
4479 *
4480 * @remarks x86: Requires a Pentium or later.
4481 */
4482#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4483DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
4484#else
4485DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4486{
4487# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
4488 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
4489 return u64;
4490
4491# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4492 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
4493 : "=r" (u64)
4494 , "=m" (*pu64)
4495 : "0" (u64)
4496 , "m" (*pu64)
4497 : "memory"
4498 , "cc");
4499 return u64;
4500
4501# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4502# if defined(RTASM_ARM64_USE_FEAT_LSE)
4503 uint64_t u64OldRet;
4504 __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
4505# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4506 "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
4507# else
4508 RTASM_ARM_DMB_SY
4509 "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
4510# endif
4511 : [pMem] "+Q" (*pu64)
4512 , [uOldActual] "=&r" (u64OldRet)
4513 : [uAddend] "r" (u64)
4514 : );
4515# else
4516 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
4517 "add %[uNew], %[uOld], %[uVal]\n\t"
4518 ,
4519 "add %[uNew], %[uOld], %[uVal]\n\t"
4520 "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
4521 [uVal] "r" (u64));
4522# endif
4523 return u64OldRet;
4524
4525# else
4526 uint64_t u64Old;
4527 for (;;)
4528 {
4529 uint64_t u64New;
4530 u64Old = ASMAtomicUoReadU64(pu64);
4531 u64New = u64Old + u64;
4532 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
4533 break;
4534 ASMNopPause();
4535 }
4536 return u64Old;
4537# endif
4538}
4539#endif
4540
4541
4542/**
4543 * Atomically exchanges and adds to a signed 64-bit value, ordered.
4544 *
4545 * @returns The old value.
4546 * @param pi64 Pointer to the value.
4547 * @param i64 Number to add.
4548 *
4549 * @remarks x86: Requires a Pentium or later.
4550 */
4551DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4552{
4553 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
4554}
4555
4556
4557/**
4558 * Atomically exchanges and adds to a size_t value, ordered.
4559 *
4560 * @returns The old value.
4561 * @param pcb Pointer to the size_t value.
4562 * @param cb Number to add.
4563 */
4564DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4565{
4566#if ARCH_BITS == 64
4567 AssertCompileSize(size_t, 8);
4568 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
4569#elif ARCH_BITS == 32
4570 AssertCompileSize(size_t, 4);
4571 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
4572#elif ARCH_BITS == 16
4573 AssertCompileSize(size_t, 2);
4574 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
4575#else
4576# error "Unsupported ARCH_BITS value"
4577#endif
4578}
4579
4580
/**
 * Atomically exchanges and adds a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Statement macro that dispatches on sizeof(*pu); only 4 and 8 byte operands
 * are handled, any other size triggers AssertMsgFailed (the size is cast to
 * int so that it matches the %d format specifier).
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to add to *pu.
 * @param   puOld   Where to store the old value.
 */
#define ASMAtomicAddSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", (int)sizeof(*(pu)))); break; \
        } \
    } while (0)
4597
4598
4599
4600/**
4601 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
4602 *
4603 * @returns The old value.
4604 * @param pu16 Pointer to the value.
4605 * @param u16 Number to subtract.
4606 *
4607 * @remarks x86: Requires a 486 or later.
4608 */
4609DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
4610{
4611 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
4612}
4613
4614
4615/**
4616 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
4617 *
4618 * @returns The old value.
4619 * @param pi16 Pointer to the value.
4620 * @param i16 Number to subtract.
4621 *
4622 * @remarks x86: Requires a 486 or later.
4623 */
4624DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4625{
4626 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
4627}
4628
4629
4630/**
4631 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
4632 *
4633 * @returns The old value.
4634 * @param pu32 Pointer to the value.
4635 * @param u32 Number to subtract.
4636 *
4637 * @remarks x86: Requires a 486 or later.
4638 */
4639DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4640{
4641 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
4642}
4643
4644
4645/**
4646 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
4647 *
4648 * @returns The old value.
4649 * @param pi32 Pointer to the value.
4650 * @param i32 Number to subtract.
4651 *
4652 * @remarks x86: Requires a 486 or later.
4653 */
4654DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4655{
4656 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
4657}
4658
4659
4660/**
4661 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
4662 *
4663 * @returns The old value.
4664 * @param pu64 Pointer to the value.
4665 * @param u64 Number to subtract.
4666 *
4667 * @remarks x86: Requires a Pentium or later.
4668 */
4669DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4670{
4671 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
4672}
4673
4674
4675/**
4676 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
4677 *
4678 * @returns The old value.
4679 * @param pi64 Pointer to the value.
4680 * @param i64 Number to subtract.
4681 *
4682 * @remarks x86: Requires a Pentium or later.
4683 */
4684DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4685{
4686 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
4687}
4688
4689
4690/**
4691 * Atomically exchanges and subtracts to a size_t value, ordered.
4692 *
4693 * @returns The old value.
4694 * @param pcb Pointer to the size_t value.
4695 * @param cb Number to subtract.
4696 *
4697 * @remarks x86: Requires a 486 or later.
4698 */
4699DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4700{
4701#if ARCH_BITS == 64
4702 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
4703#elif ARCH_BITS == 32
4704 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
4705#elif ARCH_BITS == 16
4706 AssertCompileSize(size_t, 2);
4707 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
4708#else
4709# error "Unsupported ARCH_BITS value"
4710#endif
4711}
4712
4713
/**
 * Atomically exchanges and subtracts a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Statement macro that dispatches on sizeof(*pu); only 4 and 8 byte operands
 * are handled, any other size triggers AssertMsgFailed (the size is cast to
 * int so that it matches the %d format specifier).
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to subtract from *pu.
 * @param   puOld   Where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicSubSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", (int)sizeof(*(pu)))); break; \
        } \
    } while (0)
4732
4733
4734
/**
 * Atomically increment a 16-bit value, ordered.
 *
 * Only the prototype is given here; no inline implementation follows it.
 *
 * @returns The new value.
 * @param   pu16        Pointer to the value to increment.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
4745
4746
4747/**
4748 * Atomically increment a 32-bit value, ordered.
4749 *
4750 * @returns The new value.
4751 * @param pu32 Pointer to the value to increment.
4752 *
4753 * @remarks x86: Requires a 486 or later.
4754 */
4755#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4756RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
4757#else
4758DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
4759{
4760# if RT_INLINE_ASM_USES_INTRIN
4761 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
4762
4763# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
4764# if RT_INLINE_ASM_GNU_STYLE
4765 uint32_t u32;
4766 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4767 : "=r" (u32)
4768 , "=m" (*pu32)
4769 : "0" (1)
4770 , "m" (*pu32)
4771 : "memory"
4772 , "cc");
4773 return u32+1;
4774# else
4775 __asm
4776 {
4777 mov eax, 1
4778# ifdef RT_ARCH_AMD64
4779 mov rdx, [pu32]
4780 lock xadd [rdx], eax
4781# else
4782 mov edx, [pu32]
4783 lock xadd [edx], eax
4784# endif
4785 mov u32, eax
4786 }
4787 return u32+1;
4788# endif
4789
4790# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4791 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
4792# if defined(RTASM_ARM64_USE_FEAT_LSE)
4793 uint32_t u32NewRet;
4794 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
4795# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4796 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4797# else
4798 RTASM_ARM_DMB_SY
4799 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4800# endif
4801 "add %w[uNewRet], %w[uNewRet], #1\n\t"
4802 : [pMem] "+Q" (*pu32)
4803 , [uNewRet] "=&r" (u32NewRet)
4804 : [uAddend] "r" ((uint32_t)1)
4805 : );
4806# else
4807 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
4808 "add %w[uNew], %w[uNew], #1\n\t",
4809 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
4810 "X" (0) /* dummy */);
4811# endif
4812 return u32NewRet;
4813
4814# else
4815 return ASMAtomicAddU32(pu32, 1) + 1;
4816# endif
4817}
4818#endif
4819
4820
4821/**
4822 * Atomically increment a signed 32-bit value, ordered.
4823 *
4824 * @returns The new value.
4825 * @param pi32 Pointer to the value to increment.
4826 *
4827 * @remarks x86: Requires a 486 or later.
4828 */
4829DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
4830{
4831 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
4832}
4833
4834
4835/**
4836 * Atomically increment a 64-bit value, ordered.
4837 *
4838 * @returns The new value.
4839 * @param pu64 Pointer to the value to increment.
4840 *
4841 * @remarks x86: Requires a Pentium or later.
4842 */
4843#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4844DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
4845#else
4846DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
4847{
4848# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
4849 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
4850
4851# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4852 uint64_t u64;
4853 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
4854 : "=r" (u64)
4855 , "=m" (*pu64)
4856 : "0" (1)
4857 , "m" (*pu64)
4858 : "memory"
4859 , "cc");
4860 return u64 + 1;
4861
4862# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4863# if defined(RTASM_ARM64_USE_FEAT_LSE)
4864 uint64_t u64NewRet;
4865 __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
4866# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4867 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
4868# else
4869 RTASM_ARM_DMB_SY
4870 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
4871# endif
4872 "add %[uNewRet], %[uNewRet], #1\n\t"
4873 : [pMem] "+Q" (*pu64)
4874 , [uNewRet] "=&r" (u64NewRet)
4875 : [uAddend] "r" ((uint64_t)1)
4876 : );
4877# else
4878 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
4879 "add %[uNew], %[uNew], #1\n\t"
4880 ,
4881 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
4882 "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
4883 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
4884# endif
4885 return u64NewRet;
4886
4887# else
4888 return ASMAtomicAddU64(pu64, 1) + 1;
4889# endif
4890}
4891#endif
4892
4893
4894/**
4895 * Atomically increment a signed 64-bit value, ordered.
4896 *
4897 * @returns The new value.
4898 * @param pi64 Pointer to the value to increment.
4899 *
4900 * @remarks x86: Requires a Pentium or later.
4901 */
4902DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
4903{
4904 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
4905}
4906
4907
4908/**
4909 * Atomically increment a size_t value, ordered.
4910 *
4911 * @returns The new value.
4912 * @param pcb Pointer to the value to increment.
4913 *
4914 * @remarks x86: Requires a 486 or later.
4915 */
4916DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
4917{
4918#if ARCH_BITS == 64
4919 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
4920#elif ARCH_BITS == 32
4921 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
4922#elif ARCH_BITS == 16
4923 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
4924#else
4925# error "Unsupported ARCH_BITS value"
4926#endif
4927}
4928
4929
4930
/**
 * Atomically decrement an unsigned 16-bit value, ordered.
 *
 * Only the prototype is given here; no inline implementation follows it.
 * Note that the prototype declares a uint32_t return type although the
 * operand is 16 bits wide.
 *
 * @returns The new value.
 * @param   pu16        Pointer to the value to decrement.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
4941
4942
4943/**
4944 * Atomically decrement an unsigned 32-bit value, ordered.
4945 *
4946 * @returns The new value.
4947 * @param pu32 Pointer to the value to decrement.
4948 *
4949 * @remarks x86: Requires a 486 or later.
4950 */
4951#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4952RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
4953#else
4954DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
4955{
4956# if RT_INLINE_ASM_USES_INTRIN
4957 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
4958
4959# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4960# if RT_INLINE_ASM_GNU_STYLE
4961 uint32_t u32;
4962 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4963 : "=r" (u32)
4964 , "=m" (*pu32)
4965 : "0" (-1)
4966 , "m" (*pu32)
4967 : "memory"
4968 , "cc");
4969 return u32-1;
4970# else
4971 uint32_t u32;
4972 __asm
4973 {
4974 mov eax, -1
4975# ifdef RT_ARCH_AMD64
4976 mov rdx, [pu32]
4977 lock xadd [rdx], eax
4978# else
4979 mov edx, [pu32]
4980 lock xadd [edx], eax
4981# endif
4982 mov u32, eax
4983 }
4984 return u32-1;
4985# endif
4986
4987# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4988 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
4989# if defined(RTASM_ARM64_USE_FEAT_LSE)
4990 uint32_t u32NewRet;
4991 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
4992# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4993 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4994# else
4995 RTASM_ARM_DMB_SY
4996 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4997# endif
4998 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
4999 : [pMem] "+Q" (*pu32)
5000 , [uNewRet] "=&r" (u32NewRet)
5001 : [uAddend] "r" (~(uint32_t)0)
5002 : );
5003# else
5004 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5005 "sub %w[uNew], %w[uNew], #1\n\t",
5006 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5007 "X" (0) /* dummy */);
5008# endif
5009 return u32NewRet;
5010
5011# else
5012 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5013# endif
5014}
5015#endif
5016
5017
5018/**
5019 * Atomically decrement a signed 32-bit value, ordered.
5020 *
5021 * @returns The new value.
5022 * @param pi32 Pointer to the value to decrement.
5023 *
5024 * @remarks x86: Requires a 486 or later.
5025 */
5026DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5027{
5028 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5029}
5030
5031
5032/**
5033 * Atomically decrement an unsigned 64-bit value, ordered.
5034 *
5035 * @returns The new value.
5036 * @param pu64 Pointer to the value to decrement.
5037 *
5038 * @remarks x86: Requires a Pentium or later.
5039 */
5040#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5041RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5042#else
5043DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5044{
5045# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
5046 return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);
5047
5048# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5049 uint64_t u64;
5050 __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
5051 : "=r" (u64)
5052 , "=m" (*pu64)
5053 : "0" (~(uint64_t)0)
5054 , "m" (*pu64)
5055 : "memory"
5056 , "cc");
5057 return u64-1;
5058
5059# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5060# if defined(RTASM_ARM64_USE_FEAT_LSE)
5061 uint64_t u64NewRet;
5062 __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
5063# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5064 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5065# else
5066 RTASM_ARM_DMB_SY
5067 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5068# endif
5069 "sub %[uNewRet], %[uNewRet], #1\n\t"
5070 : [pMem] "+Q" (*pu64)
5071 , [uNewRet] "=&r" (u64NewRet)
5072 : [uAddend] "r" (~(uint64_t)0)
5073 : );
5074# else
5075 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
5076 "sub %[uNew], %[uNew], #1\n\t"
5077 ,
5078 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5079 "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5080 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5081# endif
5082 return u64NewRet;
5083
5084# else
5085 return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
5086# endif
5087}
5088#endif
5089
5090
5091/**
5092 * Atomically decrement a signed 64-bit value, ordered.
5093 *
5094 * @returns The new value.
5095 * @param pi64 Pointer to the value to decrement.
5096 *
5097 * @remarks x86: Requires a Pentium or later.
5098 */
5099DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5100{
5101 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5102}
5103
5104
5105/**
5106 * Atomically decrement a size_t value, ordered.
5107 *
5108 * @returns The new value.
5109 * @param pcb Pointer to the value to decrement.
5110 *
5111 * @remarks x86: Requires a 486 or later.
5112 */
5113DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5114{
5115#if ARCH_BITS == 64
5116 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5117#elif ARCH_BITS == 32
5118 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5119#elif ARCH_BITS == 16
5120 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5121#else
5122# error "Unsupported ARCH_BITS value"
5123#endif
5124}
5125
5126
/**
 * Atomically OR an unsigned 32-bit value into a memory location, ordered.
 *
 * Nothing is returned; use ASMAtomicOrExU32 when the old value is needed.
 *
 * @param   pu32        Pointer to the variable to OR @a u32 into.
 * @param   u32         The value to OR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* Compiler intrinsic; its return value is discarded. */
    _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* x86/AMD64: a LOCK prefixed OR directly on the memory operand. */
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("lock; orl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock    or [rdx], eax
#   else
        mov     edx, [pu32]
        lock    or [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* LDSETAL variant: the loaded old value is unwanted and only lands in a
       scratch variable. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToSet] "r" (u32)
                         : );
#   else
    /* Barrier macro followed by the store-only STSET form (no result register). */
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %w[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToSet] "r" (u32)
                         : );
#   endif
#  else
    /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
    /* Non-LSE path: load/modify/store helper macro (defined elsewhere in this
       file) with an AArch64 and a 32-bit ARM flavour of the ORR instruction. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
                                           "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "orr %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5195
5196
5197/**
5198 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
5199 * fallback).
5200 *
5201 * @returns Old value.
5202 * @param pu32 Pointer to the variable to OR @a u32 with.
5203 * @param u32 The value to OR @a *pu32 with.
5204 */
5205DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5206{
5207#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5208# if defined(RTASM_ARM64_USE_FEAT_LSE)
5209 uint32_t u32OldRet;
5210 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5211# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5212 "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5213# else
5214 RTASM_ARM_DMB_SY
5215 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5216# endif
5217 : [pMem] "+Q" (*pu32)
5218 , [uOldRet] "=&r" (u32OldRet)
5219 : [fBitsToSet] "r" (u32)
5220 : );
5221# else
5222 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
5223 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5224 "orr %[uNew], %[uOld], %[uVal]\n\t",
5225 [uVal] "r" (u32));
5226# endif
5227 return u32OldRet;
5228
5229#else
5230 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5231 uint32_t u32New;
5232 do
5233 u32New = u32RetOld | u32;
5234 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5235 return u32RetOld;
5236#endif
5237}
5238
5239
5240/**
5241 * Atomically Or a signed 32-bit value, ordered.
5242 *
5243 * @param pi32 Pointer to the pointer variable to OR u32 with.
5244 * @param i32 The value to OR *pu32 with.
5245 *
5246 * @remarks x86: Requires a 386 or later.
5247 */
5248DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5249{
5250 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5251}
5252
5253
/**
 * Atomically OR an unsigned 64-bit value into a memory location, ordered.
 *
 * @param   pu64        Pointer to the variable to OR @a u64 into.
 * @param   u64         The value to OR @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    /* Compiler intrinsic; its return value is discarded. */
    _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* AMD64: a LOCK prefixed OR directly on the memory operand. */
    __asm__ __volatile__("lock; orq %1, %q0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* LDSETAL variant: the loaded old value is unwanted and only lands in a
       scratch variable. */
    uint64_t u64Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
                         "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uSpill] "=&r" (u64Spill)
                         : [fBitsToSet] "r" (u64)
                         : );
#   else
    /* Barrier macro followed by the store-only STSET form (no result register). */
    __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToSet] "r" (u64)
                         : );
#   endif
#  else
    /* Non-LSE path: load/modify/store helper macro (defined elsewhere in this
       file).  The 32-bit ARM flavour ORs the two halves separately (%H
       presumably selects the upper register of the pair - confirm). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Everything else (this is where 32-bit x86 ends up): read the value,
       compute the update and retry the compare-and-exchange until it succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old | u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5316
5317
5318/**
5319 * Atomically Or a signed 64-bit value, ordered.
5320 *
5321 * @param pi64 Pointer to the pointer variable to OR u64 with.
5322 * @param i64 The value to OR *pu64 with.
5323 *
5324 * @remarks x86: Requires a Pentium or later.
5325 */
5326DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5327{
5328 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5329}
5330
5331
/**
 * Atomically AND an unsigned 32-bit value into a memory location, ordered.
 *
 * Nothing is returned; use ASMAtomicAndExU32 when the old value is needed.
 *
 * @param   pu32        Pointer to the variable to AND @a u32 into.
 * @param   u32         The value to AND @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* Compiler intrinsic; its return value is discarded. */
    _InterlockedAnd((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* x86/AMD64: a LOCK prefixed AND directly on the memory operand. */
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("lock; andl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock and [rdx], eax
#   else
        mov     edx, [pu32]
        lock and [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The LSE instructions used here clear the bits that are set in their
       operand, so the AND mask is passed inverted (~u32). */
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stclr %w[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   endif
#  else
    /* Non-LSE path: load/modify/store helper macro (defined elsewhere in this
       file) with an AArch64 and a 32-bit ARM flavour of the AND instruction. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
                                           "and %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "and %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5399
5400
5401/**
5402 * Atomically AND an unsigned 32-bit value, ordered, extended version.
5403 *
5404 * @returns Old value.
5405 * @param pu32 Pointer to the variable to AND @a u32 with.
5406 * @param u32 The value to AND @a *pu32 with.
5407 */
5408DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5409{
5410#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5411# if defined(RTASM_ARM64_USE_FEAT_LSE)
5412 uint32_t u32OldRet;
5413 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5414# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5415 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5416# else
5417 RTASM_ARM_DMB_SY
5418 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5419# endif
5420 : [pMem] "+Q" (*pu32)
5421 , [uOldRet] "=&r" (u32OldRet)
5422 : [fBitsToClear] "r" (~u32)
5423 : );
5424# else
5425 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
5426 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5427 "and %[uNew], %[uOld], %[uVal]\n\t",
5428 [uVal] "r" (u32));
5429# endif
5430 return u32OldRet;
5431
5432#else
5433 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5434 uint32_t u32New;
5435 do
5436 u32New = u32RetOld & u32;
5437 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5438 return u32RetOld;
5439#endif
5440}
5441
5442
5443/**
5444 * Atomically And a signed 32-bit value, ordered.
5445 *
5446 * @param pi32 Pointer to the pointer variable to AND i32 with.
5447 * @param i32 The value to AND *pi32 with.
5448 *
5449 * @remarks x86: Requires a 386 or later.
5450 */
5451DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5452{
5453 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5454}
5455
5456
/**
 * Atomically AND an unsigned 64-bit value into a memory location, ordered.
 *
 * @param   pu64        Pointer to the variable to AND @a u64 into.
 * @param   u64         The value to AND @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    /* Compiler intrinsic; its return value is discarded. */
    _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* AMD64: a LOCK prefixed AND directly on the memory operand. */
    __asm__ __volatile__("lock; andq %1, %0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The LSE instructions used here clear the bits that are set in their
       operand, so the AND mask is passed inverted (~u64). */
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    uint64_t u64Spill;
    __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
                         "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uSpill] "=&r" (u64Spill)
                         : [fBitsToClear] "r" (~u64)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stclr %[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToClear] "r" (~u64)
                         : );
#   endif
#  else
    /* Non-LSE path: load/modify/store helper macro (defined elsewhere in this
       file).  The 32-bit ARM flavour ANDs the two halves separately (%H
       presumably selects the upper register of the pair - confirm). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           "and %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Everything else (this is where 32-bit x86 ends up): read the value,
       compute the update and retry the compare-and-exchange until it succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old & u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5519
5520
5521/**
5522 * Atomically And a signed 64-bit value, ordered.
5523 *
5524 * @param pi64 Pointer to the pointer variable to AND i64 with.
5525 * @param i64 The value to AND *pi64 with.
5526 *
5527 * @remarks x86: Requires a Pentium or later.
5528 */
5529DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5530{
5531 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5532}
5533
5534
/**
 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
 *
 * Nothing is returned; use ASMAtomicXorExU32 when the old value is needed.
 *
 * @param   pu32        Pointer to the variable to XOR @a u32 with.
 * @param   u32         The value to XOR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* Compiler intrinsic; its return value is discarded. */
    _InterlockedXor((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* x86/AMD64: a LOCK prefixed XOR directly on the memory operand. */
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("lock; xorl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock xor [rdx], eax
#   else
        mov     edx, [pu32]
        lock xor [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* LDEORAL variant: the loaded old value is unwanted and only lands in a
       scratch variable. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
                         "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitMask] "r" (u32)
                         : );
#   else
    /* Barrier macro followed by the store-only STEOR form (no result register). */
    __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "steor %w[fBitMask], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitMask] "r" (u32)
                         : );
#   endif
#  else
    /* Non-LSE path: load/modify/store helper macro (defined elsewhere in this
       file) with an AArch64 and a 32-bit ARM flavour of the EOR instruction. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
                                           "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "eor %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5602
5603
5604/**
5605 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
5606 * extended version (for bitmaps).
5607 *
5608 * @returns Old value.
5609 * @param pu32 Pointer to the variable to XOR @a u32 with.
5610 * @param u32 The value to XOR @a *pu32 with.
5611 */
5612DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5613{
5614#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5615# if defined(RTASM_ARM64_USE_FEAT_LSE)
5616 uint32_t u32OldRet;
5617 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
5618# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5619 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5620# else
5621 RTASM_ARM_DMB_SY
5622 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5623# endif
5624 : [pMem] "+Q" (*pu32)
5625 , [uOldRet] "=&r" (u32OldRet)
5626 : [fBitMask] "r" (u32)
5627 : );
5628# else
5629 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
5630 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
5631 "eor %[uNew], %[uOld], %[uVal]\n\t",
5632 [uVal] "r" (u32));
5633# endif
5634 return u32OldRet;
5635
5636#else
5637 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5638 uint32_t u32New;
5639 do
5640 u32New = u32RetOld ^ u32;
5641 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5642 return u32RetOld;
5643#endif
5644}
5645
5646
5647/**
5648 * Atomically XOR a signed 32-bit value, ordered.
5649 *
5650 * @param pi32 Pointer to the variable to XOR i32 with.
5651 * @param i32 The value to XOR *pi32 with.
5652 *
5653 * @remarks x86: Requires a 386 or later.
5654 */
5655DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5656{
5657 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5658}
5659
5660
/**
 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
 *
 * Compared with ASMAtomicOrU32, the x86 code below omits the LOCK prefix and
 * the ARM code omits the barrier.
 *
 * @param   pu32        Pointer to the variable to OR @a u32 into.
 * @param   u32         The value to OR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* A single read-modify-write OR instruction without the LOCK prefix. */
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("orl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        or      [rdx], eax
#   else
        mov     edx, [pu32]
        or      [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: stset=1974 vs non-lse=6271 */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Store-only STSET form, no barrier and no result register. */
    __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
                         "stset %w[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToSet] "r" (u32)
                         : );
#  else
    /* Load/modify/store helper macro (defined elsewhere in this file),
       invoked with NO_BARRIER. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
                                           "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "orr %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5715
5716
5717/**
5718 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
5719 * extended version (for bitmap fallback).
5720 *
5721 * @returns Old value.
5722 * @param pu32 Pointer to the variable to OR @a u32 with.
5723 * @param u32 The value to OR @a *pu32 with.
5724 */
5725DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5726{
5727#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5728# if defined(RTASM_ARM64_USE_FEAT_LSE)
5729 uint32_t u32OldRet;
5730 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5731 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5732 : [pMem] "+Q" (*pu32)
5733 , [uOldRet] "=&r" (u32OldRet)
5734 : [fBitsToSet] "r" (u32)
5735 : );
5736# else
5737 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
5738 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5739 "orr %[uNew], %[uOld], %[uVal]\n\t",
5740 [uVal] "r" (u32));
5741# endif
5742 return u32OldRet;
5743
5744#else
5745 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
5746#endif
5747}
5748
5749
5750/**
5751 * Atomically OR a signed 32-bit value, unordered.
5752 *
5753 * @param pi32 Pointer to the pointer variable to OR u32 with.
5754 * @param i32 The value to OR *pu32 with.
5755 *
5756 * @remarks x86: Requires a 386 or later.
5757 */
5758DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5759{
5760 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5761}
5762
5763
/**
 * Atomically OR an unsigned 64-bit value into a memory location, unordered.
 *
 * @param   pu64        Pointer to the variable to OR @a u64 into.
 * @param   u64         The value to OR @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* A single read-modify-write OR instruction without the LOCK prefix. */
    __asm__ __volatile__("orq %1, %q0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Store-only STSET form, no barrier and no result register. */
    __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
                         "stset %[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToSet] "r" (u64)
                         : );
#  else
    /* Load/modify/store helper macro (defined elsewhere in this file),
       invoked with NO_BARRIER; the 32-bit ARM flavour ORs both halves. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Everything else: read the value, compute the update and retry the
       compare-and-exchange until it succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old | u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5812
5813
5814/**
5815 * Atomically Or a signed 64-bit value, unordered.
5816 *
5817 * @param pi64 Pointer to the pointer variable to OR u64 with.
5818 * @param i64 The value to OR *pu64 with.
5819 *
5820 * @remarks x86: Requires a Pentium or later.
5821 */
5822DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5823{
5824 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5825}
5826
5827
/**
 * Atomically AND an unsigned 32-bit value into a memory location, unordered.
 *
 * Compared with ASMAtomicAndU32, the x86 code below omits the LOCK prefix and
 * the ARM code omits the barrier.
 *
 * @param   pu32        Pointer to the variable to AND @a u32 into.
 * @param   u32         The value to AND @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* A single read-modify-write AND instruction without the LOCK prefix. */
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("andl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        and     [rdx], eax
#   else
        mov     edx, [pu32]
        and     [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* STCLR clears the bits that are set in its operand, so the AND mask is
       passed inverted (~u32). */
    __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
                         "stclr %w[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToClear] "r" (~u32)
                         : );
#  else
    /* Load/modify/store helper macro (defined elsewhere in this file),
       invoked with NO_BARRIER. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
                                           "and %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "and %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5882
5883
5884/**
5885 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
5886 * bitmap fallback).
5887 *
5888 * @returns Old value.
5889 * @param pu32 Pointer to the pointer to AND @a u32 with.
5890 * @param u32 The value to AND @a *pu32 with.
5891 */
5892DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5893{
5894#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5895# if defined(RTASM_ARM64_USE_FEAT_LSE)
5896 uint32_t u32OldRet;
5897 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5898 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5899 : [pMem] "+Q" (*pu32)
5900 , [uOldRet] "=&r" (u32OldRet)
5901 : [fBitsToClear] "r" (~u32)
5902 : );
5903# else
5904 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
5905 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5906 "and %[uNew], %[uOld], %[uVal]\n\t",
5907 [uVal] "r" (u32));
5908# endif
5909 return u32OldRet;
5910
5911#else
5912 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
5913#endif
5914}
5915
5916
5917/**
5918 * Atomically And a signed 32-bit value, unordered.
5919 *
5920 * @param pi32 Pointer to the pointer variable to AND i32 with.
5921 * @param i32 The value to AND *pi32 with.
5922 *
5923 * @remarks x86: Requires a 386 or later.
5924 */
5925DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5926{
5927 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5928}
5929
5930
/**
 * Atomically AND an unsigned 64-bit value into a memory location, unordered.
 *
 * @param   pu64        Pointer to the variable to AND @a u64 into.
 * @param   u64         The value to AND @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* A single read-modify-write AND instruction without the LOCK prefix. */
    __asm__ __volatile__("andq %1, %0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* STCLR clears the bits that are set in its operand, so the AND mask is
       passed inverted (~u64). */
    __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
                         "stclr %[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToClear] "r" (~u64)
                         : );
#  else
    /* Load/modify/store helper macro (defined elsewhere in this file),
       invoked with NO_BARRIER; the 32-bit ARM flavour ANDs both halves. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           "and %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Everything else: read the value, compute the update and retry the
       compare-and-exchange until it succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old & u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5979
5980
5981/**
5982 * Atomically And a signed 64-bit value, unordered.
5983 *
5984 * @param pi64 Pointer to the pointer variable to AND i64 with.
5985 * @param i64 The value to AND *pi64 with.
5986 *
5987 * @remarks x86: Requires a Pentium or later.
5988 */
5989DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5990{
5991 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5992}
5993
5994
/**
 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe.
 *
 * Compared with ASMAtomicXorU32, the x86 code below omits the LOCK prefix and
 * the ARM code omits the barrier.
 *
 * @param   pu32        Pointer to the variable to XOR @a u32 with.
 * @param   u32         The value to XOR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* A single read-modify-write XOR instruction without the LOCK prefix. */
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("xorl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        xor     [rdx], eax
#   else
        mov     edx, [pu32]
        xor     [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Store-only STEOR form, no barrier and no result register. */
    __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
                         "steor %w[fBitMask], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitMask] "r" (u32)
                         : );
#  else
    /* Load/modify/store helper macro (defined elsewhere in this file),
       invoked with NO_BARRIER. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
                                           "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "eor %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
6048
6049
6050/**
6051 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe,
6052 * extended version (for bitmap fallback).
6053 *
6054 * @returns Old value.
6055 * @param pu32 Pointer to the variable to XOR @a u32 with.
6056 * @param u32 The value to OR @a *pu32 with.
6057 */
6058DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6059{
6060#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6061# if defined(RTASM_ARM64_USE_FEAT_LSE)
6062 uint32_t u32OldRet;
6063 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6064 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6065 : [pMem] "+Q" (*pu32)
6066 , [uOldRet] "=&r" (u32OldRet)
6067 : [fBitMask] "r" (u32)
6068 : );
6069# else
6070 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6071 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6072 "eor %[uNew], %[uOld], %[uVal]\n\t",
6073 [uVal] "r" (u32));
6074# endif
6075 return u32OldRet;
6076
6077#else
6078 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6079#endif
6080}
6081
6082
6083/**
6084 * Atomically XOR a signed 32-bit value, unordered.
6085 *
6086 * @param pi32 Pointer to the variable to XOR @a u32 with.
6087 * @param i32 The value to XOR @a *pu32 with.
6088 *
6089 * @remarks x86: Requires a 386 or later.
6090 */
6091DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6092{
6093 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6094}
6095
6096
6097/**
6098 * Atomically increment an unsigned 32-bit value, unordered.
6099 *
6100 * @returns the new value.
6101 * @param pu32 Pointer to the variable to increment.
6102 *
6103 * @remarks x86: Requires a 486 or later.
6104 */
6105#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6106RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6107#else
6108DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6109{
6110# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6111 uint32_t u32;
6112# if RT_INLINE_ASM_GNU_STYLE
6113 __asm__ __volatile__("xaddl %0, %1\n\t"
6114 : "=r" (u32)
6115 , "=m" (*pu32)
6116 : "0" (1)
6117 , "m" (*pu32)
6118 : "memory" /** @todo why 'memory'? */
6119 , "cc");
6120 return u32 + 1;
6121# else
6122 __asm
6123 {
6124 mov eax, 1
6125# ifdef RT_ARCH_AMD64
6126 mov rdx, [pu32]
6127 xadd [rdx], eax
6128# else
6129 mov edx, [pu32]
6130 xadd [edx], eax
6131# endif
6132 mov u32, eax
6133 }
6134 return u32 + 1;
6135# endif
6136
6137# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6138 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6139# if defined(RTASM_ARM64_USE_FEAT_LSE)
6140 uint32_t u32NewRet;
6141 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6142 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6143 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6144 : [pMem] "+Q" (*pu32)
6145 , [uNewRet] "=&r" (u32NewRet)
6146 : [uAddend] "r" ((uint32_t)1)
6147 : );
6148# else
6149 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6150 "add %w[uNew], %w[uNew], #1\n\t",
6151 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6152 "X" (0) /* dummy */);
6153# endif
6154 return u32NewRet;
6155
6156# else
6157# error "Port me"
6158# endif
6159}
6160#endif
6161
6162
6163/**
6164 * Atomically decrement an unsigned 32-bit value, unordered.
6165 *
6166 * @returns the new value.
6167 * @param pu32 Pointer to the variable to decrement.
6168 *
6169 * @remarks x86: Requires a 486 or later.
6170 */
6171#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6172RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6173#else
6174DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6175{
6176# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6177 uint32_t u32;
6178# if RT_INLINE_ASM_GNU_STYLE
6179 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6180 : "=r" (u32)
6181 , "=m" (*pu32)
6182 : "0" (-1)
6183 , "m" (*pu32)
6184 : "memory"
6185 , "cc");
6186 return u32 - 1;
6187# else
6188 __asm
6189 {
6190 mov eax, -1
6191# ifdef RT_ARCH_AMD64
6192 mov rdx, [pu32]
6193 xadd [rdx], eax
6194# else
6195 mov edx, [pu32]
6196 xadd [edx], eax
6197# endif
6198 mov u32, eax
6199 }
6200 return u32 - 1;
6201# endif
6202
6203# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6204 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6205# if defined(RTASM_ARM64_USE_FEAT_LSE)
6206 uint32_t u32NewRet;
6207 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6208 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6209 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6210 : [pMem] "+Q" (*pu32)
6211 , [uNewRet] "=&r" (u32NewRet)
6212 : [uAddend] "r" (~(uint32_t)0)
6213 : );
6214# else
6215 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6216 "sub %w[uNew], %w[uNew], #1\n\t",
6217 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6218 "X" (0) /* dummy */);
6219# endif
6220 return u32NewRet;
6221
6222# else
6223# error "Port me"
6224# endif
6225}
6226#endif
6227
/** @todo Move ASMByteSwapU16, ASMByteSwapU32 and ASMByteSwapU64 into their own
 *        header, as they are a common reason for including asm.h. */
6230
6231
6232/**
6233 * Reverse the byte order of the given 16-bit integer.
6234 *
6235 * @returns Revert
6236 * @param u16 16-bit integer value.
6237 */
6238#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6239RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6240#else
6241DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6242{
6243# if RT_INLINE_ASM_USES_INTRIN
6244 return _byteswap_ushort(u16);
6245
6246# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6247# if RT_INLINE_ASM_GNU_STYLE
6248 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6249# else
6250 _asm
6251 {
6252 mov ax, [u16]
6253 ror ax, 8
6254 mov [u16], ax
6255 }
6256# endif
6257 return u16;
6258
6259# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6260 uint32_t u32Ret;
6261 __asm__ __volatile__(
6262# if defined(RT_ARCH_ARM64)
6263 "rev16 %w[uRet], %w[uVal]\n\t"
6264# else
6265 "rev16 %[uRet], %[uVal]\n\t"
6266# endif
6267 : [uRet] "=r" (u32Ret)
6268 : [uVal] "r" (u16));
6269 return (uint16_t)u32Ret;
6270
6271# else
6272# error "Port me"
6273# endif
6274}
6275#endif
6276
6277
6278/**
6279 * Reverse the byte order of the given 32-bit integer.
6280 *
6281 * @returns Revert
6282 * @param u32 32-bit integer value.
6283 */
6284#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6285RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6286#else
6287DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6288{
6289# if RT_INLINE_ASM_USES_INTRIN
6290 return _byteswap_ulong(u32);
6291
6292# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6293# if RT_INLINE_ASM_GNU_STYLE
6294 __asm__ ("bswapl %0" : "=r" (u32) : "0" (u32));
6295# else
6296 _asm
6297 {
6298 mov eax, [u32]
6299 bswap eax
6300 mov [u32], eax
6301 }
6302# endif
6303 return u32;
6304
6305# elif defined(RT_ARCH_ARM64)
6306 uint64_t u64Ret;
6307 __asm__ __volatile__("rev32 %[uRet], %[uVal]\n\t"
6308 : [uRet] "=r" (u64Ret)
6309 : [uVal] "r" ((uint64_t)u32));
6310 return (uint32_t)u64Ret;
6311
6312# elif defined(RT_ARCH_ARM32)
6313 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6314 : [uRet] "=r" (u32)
6315 : [uVal] "[uRet]" (u32));
6316 return u32;
6317
6318# else
6319# error "Port me"
6320# endif
6321}
6322#endif
6323
6324
6325/**
6326 * Reverse the byte order of the given 64-bit integer.
6327 *
6328 * @returns Revert
6329 * @param u64 64-bit integer value.
6330 */
6331DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
6332{
6333#if defined(RT_ARCH_AMD64) && RT_INLINE_ASM_USES_INTRIN
6334 return _byteswap_uint64(u64);
6335
6336# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6337 __asm__ ("bswapq %0" : "=r" (u64) : "0" (u64));
6338 return u64;
6339
6340# elif defined(RT_ARCH_ARM64)
6341 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6342 : [uRet] "=r" (u64)
6343 : [uVal] "[uRet]" (u64));
6344 return u64;
6345
6346#else
6347 return (uint64_t)ASMByteSwapU32((uint32_t)u64) << 32
6348 | (uint64_t)ASMByteSwapU32((uint32_t)(u64 >> 32));
6349#endif
6350}
6351
6352
6353
6354/** @defgroup grp_inline_bits Bitmap Operations
6355 *
6356 * @todo Move these into a separate header, with standard IPRT prefix
6357 * (RTBitmapXxx). Move the more complex (searched) stuff into C source
6358 * files.
6359 *
6360 * @{
6361 */
6362
6363
6364/**
6365 * Sets a bit in a bitmap.
6366 *
6367 * @param pvBitmap Pointer to the bitmap (little endian). This should be
6368 * 32-bit aligned.
6369 * @param iBit The bit to set.
6370 *
6371 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6372 * However, doing so will yield better performance as well as avoiding
6373 * traps accessing the last bits in the bitmap.
6374 */
6375#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6376RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6377#else
6378DECLINLINE(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6379{
6380# if RT_INLINE_ASM_USES_INTRIN
6381 _bittestandset((long RT_FAR *)pvBitmap, iBit);
6382
6383# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6384# if RT_INLINE_ASM_GNU_STYLE
6385 __asm__ __volatile__("btsl %1, %0"
6386 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6387 : "Ir" (iBit)
6388 , "m" (*(volatile long RT_FAR *)pvBitmap)
6389 : "memory"
6390 , "cc");
6391# else
6392 __asm
6393 {
6394# ifdef RT_ARCH_AMD64
6395 mov rax, [pvBitmap]
6396 mov edx, [iBit]
6397 bts [rax], edx
6398# else
6399 mov eax, [pvBitmap]
6400 mov edx, [iBit]
6401 bts [eax], edx
6402# endif
6403 }
6404# endif
6405
6406# else
6407 int32_t offBitmap = iBit / 32;
6408 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6409 ASMAtomicUoOrU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6410# endif
6411}
6412#endif
6413
6414
6415/**
6416 * Atomically sets a bit in a bitmap, ordered.
6417 *
6418 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6419 * aligned, otherwise the memory access isn't atomic!
6420 * @param iBit The bit to set.
6421 *
6422 * @remarks x86: Requires a 386 or later.
6423 */
6424#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6425RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6426#else
6427DECLINLINE(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6428{
6429 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6430# if RT_INLINE_ASM_USES_INTRIN
6431 _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
6432# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6433# if RT_INLINE_ASM_GNU_STYLE
6434 __asm__ __volatile__("lock; btsl %1, %0"
6435 : "=m" (*(volatile long *)pvBitmap)
6436 : "Ir" (iBit)
6437 , "m" (*(volatile long *)pvBitmap)
6438 : "memory"
6439 , "cc");
6440# else
6441 __asm
6442 {
6443# ifdef RT_ARCH_AMD64
6444 mov rax, [pvBitmap]
6445 mov edx, [iBit]
6446 lock bts [rax], edx
6447# else
6448 mov eax, [pvBitmap]
6449 mov edx, [iBit]
6450 lock bts [eax], edx
6451# endif
6452 }
6453# endif
6454
6455# else
6456 ASMAtomicOrU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6457# endif
6458}
6459#endif
6460
6461
6462/**
6463 * Clears a bit in a bitmap.
6464 *
6465 * @param pvBitmap Pointer to the bitmap (little endian).
6466 * @param iBit The bit to clear.
6467 *
6468 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6469 * However, doing so will yield better performance as well as avoiding
6470 * traps accessing the last bits in the bitmap.
6471 */
6472#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6473RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6474#else
6475DECLINLINE(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6476{
6477# if RT_INLINE_ASM_USES_INTRIN
6478 _bittestandreset((long RT_FAR *)pvBitmap, iBit);
6479
6480# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6481# if RT_INLINE_ASM_GNU_STYLE
6482 __asm__ __volatile__("btrl %1, %0"
6483 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6484 : "Ir" (iBit)
6485 , "m" (*(volatile long RT_FAR *)pvBitmap)
6486 : "memory"
6487 , "cc");
6488# else
6489 __asm
6490 {
6491# ifdef RT_ARCH_AMD64
6492 mov rax, [pvBitmap]
6493 mov edx, [iBit]
6494 btr [rax], edx
6495# else
6496 mov eax, [pvBitmap]
6497 mov edx, [iBit]
6498 btr [eax], edx
6499# endif
6500 }
6501# endif
6502
6503# else
6504 int32_t offBitmap = iBit / 32;
6505 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6506 ASMAtomicUoAndU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
6507# endif
6508}
6509#endif
6510
6511
6512/**
6513 * Atomically clears a bit in a bitmap, ordered.
6514 *
6515 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6516 * aligned, otherwise the memory access isn't atomic!
6517 * @param iBit The bit to toggle set.
6518 *
6519 * @remarks No memory barrier, take care on smp.
6520 * @remarks x86: Requires a 386 or later.
6521 */
6522#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6523RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6524#else
6525DECLINLINE(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6526{
6527 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6528# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6529# if RT_INLINE_ASM_GNU_STYLE
6530 __asm__ __volatile__("lock; btrl %1, %0"
6531 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6532 : "Ir" (iBit)
6533 , "m" (*(volatile long RT_FAR *)pvBitmap)
6534 : "memory"
6535 , "cc");
6536# else
6537 __asm
6538 {
6539# ifdef RT_ARCH_AMD64
6540 mov rax, [pvBitmap]
6541 mov edx, [iBit]
6542 lock btr [rax], edx
6543# else
6544 mov eax, [pvBitmap]
6545 mov edx, [iBit]
6546 lock btr [eax], edx
6547# endif
6548 }
6549# endif
6550# else
6551 ASMAtomicAndU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
6552# endif
6553}
6554#endif
6555
6556
6557/**
6558 * Toggles a bit in a bitmap.
6559 *
6560 * @param pvBitmap Pointer to the bitmap (little endian).
6561 * @param iBit The bit to toggle.
6562 *
6563 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6564 * However, doing so will yield better performance as well as avoiding
6565 * traps accessing the last bits in the bitmap.
6566 */
6567#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6568RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6569#else
6570DECLINLINE(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6571{
6572# if RT_INLINE_ASM_USES_INTRIN
6573 _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
6574# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6575# if RT_INLINE_ASM_GNU_STYLE
6576 __asm__ __volatile__("btcl %1, %0"
6577 : "=m" (*(volatile long *)pvBitmap)
6578 : "Ir" (iBit)
6579 , "m" (*(volatile long *)pvBitmap)
6580 : "memory"
6581 , "cc");
6582# else
6583 __asm
6584 {
6585# ifdef RT_ARCH_AMD64
6586 mov rax, [pvBitmap]
6587 mov edx, [iBit]
6588 btc [rax], edx
6589# else
6590 mov eax, [pvBitmap]
6591 mov edx, [iBit]
6592 btc [eax], edx
6593# endif
6594 }
6595# endif
6596# else
6597 int32_t offBitmap = iBit / 32;
6598 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6599 ASMAtomicUoXorU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6600# endif
6601}
6602#endif
6603
6604
6605/**
6606 * Atomically toggles a bit in a bitmap, ordered.
6607 *
6608 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6609 * aligned, otherwise the memory access isn't atomic!
6610 * @param iBit The bit to test and set.
6611 *
6612 * @remarks x86: Requires a 386 or later.
6613 */
6614#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6615RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6616#else
6617DECLINLINE(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6618{
6619 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6620# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6621# if RT_INLINE_ASM_GNU_STYLE
6622 __asm__ __volatile__("lock; btcl %1, %0"
6623 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6624 : "Ir" (iBit)
6625 , "m" (*(volatile long RT_FAR *)pvBitmap)
6626 : "memory"
6627 , "cc");
6628# else
6629 __asm
6630 {
6631# ifdef RT_ARCH_AMD64
6632 mov rax, [pvBitmap]
6633 mov edx, [iBit]
6634 lock btc [rax], edx
6635# else
6636 mov eax, [pvBitmap]
6637 mov edx, [iBit]
6638 lock btc [eax], edx
6639# endif
6640 }
6641# endif
6642# else
6643 ASMAtomicXorU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6644# endif
6645}
6646#endif
6647
6648
6649/**
6650 * Tests and sets a bit in a bitmap.
6651 *
6652 * @returns true if the bit was set.
6653 * @returns false if the bit was clear.
6654 *
6655 * @param pvBitmap Pointer to the bitmap (little endian).
6656 * @param iBit The bit to test and set.
6657 *
6658 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6659 * However, doing so will yield better performance as well as avoiding
6660 * traps accessing the last bits in the bitmap.
6661 */
6662#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6663RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6664#else
6665DECLINLINE(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6666{
6667 union { bool f; uint32_t u32; uint8_t u8; } rc;
6668# if RT_INLINE_ASM_USES_INTRIN
6669 rc.u8 = _bittestandset((long RT_FAR *)pvBitmap, iBit);
6670
6671# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6672# if RT_INLINE_ASM_GNU_STYLE
6673 __asm__ __volatile__("btsl %2, %1\n\t"
6674 "setc %b0\n\t"
6675 "andl $1, %0\n\t"
6676 : "=q" (rc.u32)
6677 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6678 : "Ir" (iBit)
6679 , "m" (*(volatile long RT_FAR *)pvBitmap)
6680 : "memory"
6681 , "cc");
6682# else
6683 __asm
6684 {
6685 mov edx, [iBit]
6686# ifdef RT_ARCH_AMD64
6687 mov rax, [pvBitmap]
6688 bts [rax], edx
6689# else
6690 mov eax, [pvBitmap]
6691 bts [eax], edx
6692# endif
6693 setc al
6694 and eax, 1
6695 mov [rc.u32], eax
6696 }
6697# endif
6698
6699# else
6700 int32_t offBitmap = iBit / 32;
6701 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6702 rc.u32 = RT_LE2H_U32(ASMAtomicUoOrExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
6703 >> (iBit & 31);
6704 rc.u32 &= 1;
6705# endif
6706 return rc.f;
6707}
6708#endif
6709
6710
6711/**
6712 * Atomically tests and sets a bit in a bitmap, ordered.
6713 *
6714 * @returns true if the bit was set.
6715 * @returns false if the bit was clear.
6716 *
6717 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6718 * aligned, otherwise the memory access isn't atomic!
6719 * @param iBit The bit to set.
6720 *
6721 * @remarks x86: Requires a 386 or later.
6722 */
6723#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6724RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6725#else
6726DECLINLINE(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6727{
6728 union { bool f; uint32_t u32; uint8_t u8; } rc;
6729 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6730# if RT_INLINE_ASM_USES_INTRIN
6731 rc.u8 = _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
6732# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6733# if RT_INLINE_ASM_GNU_STYLE
6734 __asm__ __volatile__("lock; btsl %2, %1\n\t"
6735 "setc %b0\n\t"
6736 "andl $1, %0\n\t"
6737 : "=q" (rc.u32)
6738 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6739 : "Ir" (iBit)
6740 , "m" (*(volatile long RT_FAR *)pvBitmap)
6741 : "memory"
6742 , "cc");
6743# else
6744 __asm
6745 {
6746 mov edx, [iBit]
6747# ifdef RT_ARCH_AMD64
6748 mov rax, [pvBitmap]
6749 lock bts [rax], edx
6750# else
6751 mov eax, [pvBitmap]
6752 lock bts [eax], edx
6753# endif
6754 setc al
6755 and eax, 1
6756 mov [rc.u32], eax
6757 }
6758# endif
6759
6760# else
6761 rc.u32 = RT_LE2H_U32(ASMAtomicOrExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
6762 >> (iBit & 31);
6763 rc.u32 &= 1;
6764# endif
6765 return rc.f;
6766}
6767#endif
6768
6769
6770/**
6771 * Tests and clears a bit in a bitmap.
6772 *
6773 * @returns true if the bit was set.
6774 * @returns false if the bit was clear.
6775 *
6776 * @param pvBitmap Pointer to the bitmap (little endian).
6777 * @param iBit The bit to test and clear.
6778 *
6779 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6780 * However, doing so will yield better performance as well as avoiding
6781 * traps accessing the last bits in the bitmap.
6782 */
6783#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6784RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6785#else
6786DECLINLINE(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6787{
6788 union { bool f; uint32_t u32; uint8_t u8; } rc;
6789# if RT_INLINE_ASM_USES_INTRIN
6790 rc.u8 = _bittestandreset((long RT_FAR *)pvBitmap, iBit);
6791
6792# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6793# if RT_INLINE_ASM_GNU_STYLE
6794 __asm__ __volatile__("btrl %2, %1\n\t"
6795 "setc %b0\n\t"
6796 "andl $1, %0\n\t"
6797 : "=q" (rc.u32)
6798 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6799 : "Ir" (iBit)
6800 , "m" (*(volatile long RT_FAR *)pvBitmap)
6801 : "memory"
6802 , "cc");
6803# else
6804 __asm
6805 {
6806 mov edx, [iBit]
6807# ifdef RT_ARCH_AMD64
6808 mov rax, [pvBitmap]
6809 btr [rax], edx
6810# else
6811 mov eax, [pvBitmap]
6812 btr [eax], edx
6813# endif
6814 setc al
6815 and eax, 1
6816 mov [rc.u32], eax
6817 }
6818# endif
6819
6820# else
6821 int32_t offBitmap = iBit / 32;
6822 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6823 rc.u32 = RT_LE2H_U32(ASMAtomicUoAndExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
6824 >> (iBit & 31);
6825 rc.u32 &= 1;
6826# endif
6827 return rc.f;
6828}
6829#endif
6830
6831
6832/**
6833 * Atomically tests and clears a bit in a bitmap, ordered.
6834 *
6835 * @returns true if the bit was set.
6836 * @returns false if the bit was clear.
6837 *
6838 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6839 * aligned, otherwise the memory access isn't atomic!
6840 * @param iBit The bit to test and clear.
6841 *
6842 * @remarks No memory barrier, take care on smp.
6843 * @remarks x86: Requires a 386 or later.
6844 */
6845#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6846RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6847#else
6848DECLINLINE(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6849{
6850 union { bool f; uint32_t u32; uint8_t u8; } rc;
6851 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6852# if RT_INLINE_ASM_USES_INTRIN
6853 rc.u8 = _interlockedbittestandreset((long RT_FAR *)pvBitmap, iBit);
6854
6855# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6856# if RT_INLINE_ASM_GNU_STYLE
6857 __asm__ __volatile__("lock; btrl %2, %1\n\t"
6858 "setc %b0\n\t"
6859 "andl $1, %0\n\t"
6860 : "=q" (rc.u32)
6861 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6862 : "Ir" (iBit)
6863 , "m" (*(volatile long RT_FAR *)pvBitmap)
6864 : "memory"
6865 , "cc");
6866# else
6867 __asm
6868 {
6869 mov edx, [iBit]
6870# ifdef RT_ARCH_AMD64
6871 mov rax, [pvBitmap]
6872 lock btr [rax], edx
6873# else
6874 mov eax, [pvBitmap]
6875 lock btr [eax], edx
6876# endif
6877 setc al
6878 and eax, 1
6879 mov [rc.u32], eax
6880 }
6881# endif
6882
6883# else
6884 rc.u32 = RT_LE2H_U32(ASMAtomicAndExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
6885 >> (iBit & 31);
6886 rc.u32 &= 1;
6887# endif
6888 return rc.f;
6889}
6890#endif
6891
6892
6893/**
6894 * Tests and toggles a bit in a bitmap.
6895 *
6896 * @returns true if the bit was set.
6897 * @returns false if the bit was clear.
6898 *
6899 * @param pvBitmap Pointer to the bitmap (little endian).
6900 * @param iBit The bit to test and toggle.
6901 *
6902 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6903 * However, doing so will yield better performance as well as avoiding
6904 * traps accessing the last bits in the bitmap.
6905 */
6906#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6907RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6908#else
6909DECLINLINE(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6910{
6911 union { bool f; uint32_t u32; uint8_t u8; } rc;
6912# if RT_INLINE_ASM_USES_INTRIN
6913 rc.u8 = _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
6914
6915# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6916# if RT_INLINE_ASM_GNU_STYLE
6917 __asm__ __volatile__("btcl %2, %1\n\t"
6918 "setc %b0\n\t"
6919 "andl $1, %0\n\t"
6920 : "=q" (rc.u32)
6921 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6922 : "Ir" (iBit)
6923 , "m" (*(volatile long RT_FAR *)pvBitmap)
6924 : "memory"
6925 , "cc");
6926# else
6927 __asm
6928 {
6929 mov edx, [iBit]
6930# ifdef RT_ARCH_AMD64
6931 mov rax, [pvBitmap]
6932 btc [rax], edx
6933# else
6934 mov eax, [pvBitmap]
6935 btc [eax], edx
6936# endif
6937 setc al
6938 and eax, 1
6939 mov [rc.u32], eax
6940 }
6941# endif
6942
6943# else
6944 int32_t offBitmap = iBit / 32;
6945 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6946 rc.u32 = RT_LE2H_U32(ASMAtomicUoXorExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
6947 >> (iBit & 31);
6948 rc.u32 &= 1;
6949# endif
6950 return rc.f;
6951}
6952#endif
6953
6954
6955/**
6956 * Atomically tests and toggles a bit in a bitmap, ordered.
6957 *
6958 * @returns true if the bit was set.
6959 * @returns false if the bit was clear.
6960 *
6961 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6962 * aligned, otherwise the memory access isn't atomic!
6963 * @param iBit The bit to test and toggle.
6964 *
6965 * @remarks x86: Requires a 386 or later.
6966 */
6967#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6968RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6969#else
6970DECLINLINE(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6971{
6972 union { bool f; uint32_t u32; uint8_t u8; } rc;
6973 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6974# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6975# if RT_INLINE_ASM_GNU_STYLE
6976 __asm__ __volatile__("lock; btcl %2, %1\n\t"
6977 "setc %b0\n\t"
6978 "andl $1, %0\n\t"
6979 : "=q" (rc.u32)
6980 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6981 : "Ir" (iBit)
6982 , "m" (*(volatile long RT_FAR *)pvBitmap)
6983 : "memory"
6984 , "cc");
6985# else
6986 __asm
6987 {
6988 mov edx, [iBit]
6989# ifdef RT_ARCH_AMD64
6990 mov rax, [pvBitmap]
6991 lock btc [rax], edx
6992# else
6993 mov eax, [pvBitmap]
6994 lock btc [eax], edx
6995# endif
6996 setc al
6997 and eax, 1
6998 mov [rc.u32], eax
6999 }
7000# endif
7001
7002# else
7003 rc.u32 = RT_H2LE_U32(ASMAtomicXorExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_LE2H_U32(RT_BIT_32(iBit & 31))))
7004 >> (iBit & 31);
7005 rc.u32 &= 1;
7006# endif
7007 return rc.f;
7008}
7009#endif
7010
7011
7012/**
7013 * Tests if a bit in a bitmap is set.
7014 *
7015 * @returns true if the bit is set.
7016 * @returns false if the bit is clear.
7017 *
7018 * @param pvBitmap Pointer to the bitmap (little endian).
7019 * @param iBit The bit to test.
7020 *
7021 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7022 * However, doing so will yield better performance as well as avoiding
7023 * traps accessing the last bits in the bitmap.
7024 */
7025#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7026RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7027#else
7028DECLINLINE(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7029{
7030 union { bool f; uint32_t u32; uint8_t u8; } rc;
7031# if RT_INLINE_ASM_USES_INTRIN
7032 rc.u32 = _bittest((long *)pvBitmap, iBit);
7033
7034# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7035# if RT_INLINE_ASM_GNU_STYLE
7036
7037 __asm__ __volatile__("btl %2, %1\n\t"
7038 "setc %b0\n\t"
7039 "andl $1, %0\n\t"
7040 : "=q" (rc.u32)
7041 : "m" (*(const volatile long RT_FAR *)pvBitmap)
7042 , "Ir" (iBit)
7043 : "memory"
7044 , "cc");
7045# else
7046 __asm
7047 {
7048 mov edx, [iBit]
7049# ifdef RT_ARCH_AMD64
7050 mov rax, [pvBitmap]
7051 bt [rax], edx
7052# else
7053 mov eax, [pvBitmap]
7054 bt [eax], edx
7055# endif
7056 setc al
7057 and eax, 1
7058 mov [rc.u32], eax
7059 }
7060# endif
7061
7062# else
7063 int32_t offBitmap = iBit / 32;
7064 AssertRelease(!((uintptr_t)pvBitmap & (sizeof(uint32_t) - 1)));
7065 rc.u32 = RT_LE2H_U32(ASMAtomicUoReadU32(&((uint32_t volatile *)pvBitmap)[offBitmap])) >> (iBit & 31);
7066 rc.u32 &= 1;
7067# endif
7068 return rc.f;
7069}
7070#endif
7071
7072
7073#ifdef IPRT_INCLUDED_asm_mem_h
7074
7075/**
7076 * Clears a bit range within a bitmap.
7077 *
7078 * @param pvBitmap Pointer to the bitmap (little endian).
7079 * @param iBitStart The First bit to clear.
7080 * @param iBitEnd The first bit not to clear.
7081 */
7082DECLINLINE(void) ASMBitClearRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7083{
7084 if (iBitStart < iBitEnd)
7085 {
7086 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7087 size_t iStart = iBitStart & ~(size_t)31;
7088 size_t iEnd = iBitEnd & ~(size_t)31;
7089 if (iStart == iEnd)
7090 *pu32 &= RT_H2LE_U32(((UINT32_C(1) << (iBitStart & 31)) - 1) | ~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7091 else
7092 {
7093 /* bits in first dword. */
7094 if (iBitStart & 31)
7095 {
7096 *pu32 &= RT_H2LE_U32((UINT32_C(1) << (iBitStart & 31)) - 1);
7097 pu32++;
7098 iBitStart = iStart + 32;
7099 }
7100
7101 /* whole dwords. */
7102 if (iBitStart != iEnd)
7103 ASMMemZero32(pu32, (iEnd - iBitStart) >> 3);
7104
7105 /* bits in last dword. */
7106 if (iBitEnd & 31)
7107 {
7108 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7109 *pu32 &= RT_H2LE_U32(~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7110 }
7111 }
7112 }
7113}
7114
7115
7116/**
7117 * Sets a bit range within a bitmap.
7118 *
7119 * @param pvBitmap Pointer to the bitmap (little endian).
7120 * @param iBitStart The First bit to set.
7121 * @param iBitEnd The first bit not to set.
7122 */
7123DECLINLINE(void) ASMBitSetRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7124{
7125 if (iBitStart < iBitEnd)
7126 {
7127 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7128 size_t iStart = iBitStart & ~(size_t)31;
7129 size_t iEnd = iBitEnd & ~(size_t)31;
7130 if (iStart == iEnd)
7131 *pu32 |= RT_H2LE_U32(((UINT32_C(1) << (iBitEnd - iBitStart)) - 1) << (iBitStart & 31));
7132 else
7133 {
7134 /* bits in first dword. */
7135 if (iBitStart & 31)
7136 {
7137 *pu32 |= RT_H2LE_U32(~((UINT32_C(1) << (iBitStart & 31)) - 1));
7138 pu32++;
7139 iBitStart = iStart + 32;
7140 }
7141
7142 /* whole dword. */
7143 if (iBitStart != iEnd)
7144 ASMMemFill32(pu32, (iEnd - iBitStart) >> 3, ~UINT32_C(0));
7145
7146 /* bits in last dword. */
7147 if (iBitEnd & 31)
7148 {
7149 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7150 *pu32 |= RT_H2LE_U32((UINT32_C(1) << (iBitEnd & 31)) - 1);
7151 }
7152 }
7153 }
7154}
7155
7156#endif /* IPRT_INCLUDED_asm_mem_h */
7157
7158/**
7159 * Finds the first clear bit in a bitmap.
7160 *
7161 * @returns Index of the first zero bit.
7162 * @returns -1 if no clear bit was found.
7163 * @param pvBitmap Pointer to the bitmap (little endian).
7164 * @param cBits The number of bits in the bitmap. Multiple of 32.
7165 */
7166#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7167DECLASM(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7168#else
7169DECLINLINE(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7170{
7171 if (cBits)
7172 {
7173 int32_t iBit;
7174# if RT_INLINE_ASM_GNU_STYLE
7175 RTCCUINTREG uEAX, uECX, uEDI;
7176 cBits = RT_ALIGN_32(cBits, 32);
7177 __asm__ __volatile__("repe; scasl\n\t"
7178 "je 1f\n\t"
7179# ifdef RT_ARCH_AMD64
7180 "lea -4(%%rdi), %%rdi\n\t"
7181 "xorl (%%rdi), %%eax\n\t"
7182 "subq %5, %%rdi\n\t"
7183# else
7184 "lea -4(%%edi), %%edi\n\t"
7185 "xorl (%%edi), %%eax\n\t"
7186 "subl %5, %%edi\n\t"
7187# endif
7188 "shll $3, %%edi\n\t"
7189 "bsfl %%eax, %%edx\n\t"
7190 "addl %%edi, %%edx\n\t"
7191 "1:\t\n"
7192 : "=d" (iBit)
7193 , "=&c" (uECX)
7194 , "=&D" (uEDI)
7195 , "=&a" (uEAX)
7196 : "0" (0xffffffff)
7197 , "mr" (pvBitmap)
7198 , "1" (cBits >> 5)
7199 , "2" (pvBitmap)
7200 , "3" (0xffffffff)
7201 : "cc");
7202# else
7203 cBits = RT_ALIGN_32(cBits, 32);
7204 __asm
7205 {
7206# ifdef RT_ARCH_AMD64
7207 mov rdi, [pvBitmap]
7208 mov rbx, rdi
7209# else
7210 mov edi, [pvBitmap]
7211 mov ebx, edi
7212# endif
7213 mov edx, 0ffffffffh
7214 mov eax, edx
7215 mov ecx, [cBits]
7216 shr ecx, 5
7217 repe scasd
7218 je done
7219
7220# ifdef RT_ARCH_AMD64
7221 lea rdi, [rdi - 4]
7222 xor eax, [rdi]
7223 sub rdi, rbx
7224# else
7225 lea edi, [edi - 4]
7226 xor eax, [edi]
7227 sub edi, ebx
7228# endif
7229 shl edi, 3
7230 bsf edx, eax
7231 add edx, edi
7232 done:
7233 mov [iBit], edx
7234 }
7235# endif
7236 return iBit;
7237 }
7238 return -1;
7239}
7240#endif
7241
7242
7243/**
7244 * Finds the next clear bit in a bitmap.
7245 *
7246 * @returns Index of the next clear bit.
7247 * @returns -1 if no clear bit was found.
7248 * @param pvBitmap Pointer to the bitmap (little endian).
7249 * @param cBits The number of bits in the bitmap. Multiple of 32.
7250 * @param iBitPrev The bit returned from the last search.
7251 * The search will start at iBitPrev + 1.
7252 */
7253#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7254DECLASM(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7255#else
7256DECLINLINE(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7257{
7258 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7259 int iBit = ++iBitPrev & 31;
7260 if (iBit)
7261 {
7262 /*
7263 * Inspect the 32-bit word containing the unaligned bit.
7264 */
7265 uint32_t u32 = ~pau32Bitmap[iBitPrev / 32] >> iBit;
7266
7267# if RT_INLINE_ASM_USES_INTRIN
7268 unsigned long ulBit = 0;
7269 if (_BitScanForward(&ulBit, u32))
7270 return ulBit + iBitPrev;
7271# else
7272# if RT_INLINE_ASM_GNU_STYLE
7273 __asm__ __volatile__("bsf %1, %0\n\t"
7274 "jnz 1f\n\t"
7275 "movl $-1, %0\n\t" /** @todo use conditional move for 64-bit? */
7276 "1:\n\t"
7277 : "=r" (iBit)
7278 : "r" (u32)
7279 : "cc");
7280# else
7281 __asm
7282 {
7283 mov edx, [u32]
7284 bsf eax, edx
7285 jnz done
7286 mov eax, 0ffffffffh
7287 done:
7288 mov [iBit], eax
7289 }
7290# endif
7291 if (iBit >= 0)
7292 return iBit + (int)iBitPrev;
7293# endif
7294
7295 /*
7296 * Skip ahead and see if there is anything left to search.
7297 */
7298 iBitPrev |= 31;
7299 iBitPrev++;
7300 if (cBits <= (uint32_t)iBitPrev)
7301 return -1;
7302 }
7303
7304 /*
7305 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7306 */
7307 iBit = ASMBitFirstClear(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7308 if (iBit >= 0)
7309 iBit += iBitPrev;
7310 return iBit;
7311}
7312#endif
7313
7314
7315/**
7316 * Finds the first set bit in a bitmap.
7317 *
7318 * @returns Index of the first set bit.
7319 * @returns -1 if no set bit was found.
7320 * @param pvBitmap Pointer to the bitmap (little endian).
7321 * @param cBits The number of bits in the bitmap. Multiple of 32.
7322 */
7323#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7324DECLASM(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7325#else
7326DECLINLINE(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7327{
7328 if (cBits)
7329 {
7330 int32_t iBit;
7331# if RT_INLINE_ASM_GNU_STYLE
7332 RTCCUINTREG uEAX, uECX, uEDI;
7333 cBits = RT_ALIGN_32(cBits, 32);
7334 __asm__ __volatile__("repe; scasl\n\t"
7335 "je 1f\n\t"
7336# ifdef RT_ARCH_AMD64
7337 "lea -4(%%rdi), %%rdi\n\t"
7338 "movl (%%rdi), %%eax\n\t"
7339 "subq %5, %%rdi\n\t"
7340# else
7341 "lea -4(%%edi), %%edi\n\t"
7342 "movl (%%edi), %%eax\n\t"
7343 "subl %5, %%edi\n\t"
7344# endif
7345 "shll $3, %%edi\n\t"
7346 "bsfl %%eax, %%edx\n\t"
7347 "addl %%edi, %%edx\n\t"
7348 "1:\t\n"
7349 : "=d" (iBit)
7350 , "=&c" (uECX)
7351 , "=&D" (uEDI)
7352 , "=&a" (uEAX)
7353 : "0" (0xffffffff)
7354 , "mr" (pvBitmap)
7355 , "1" (cBits >> 5)
7356 , "2" (pvBitmap)
7357 , "3" (0)
7358 : "cc");
7359# else
7360 cBits = RT_ALIGN_32(cBits, 32);
7361 __asm
7362 {
7363# ifdef RT_ARCH_AMD64
7364 mov rdi, [pvBitmap]
7365 mov rbx, rdi
7366# else
7367 mov edi, [pvBitmap]
7368 mov ebx, edi
7369# endif
7370 mov edx, 0ffffffffh
7371 xor eax, eax
7372 mov ecx, [cBits]
7373 shr ecx, 5
7374 repe scasd
7375 je done
7376# ifdef RT_ARCH_AMD64
7377 lea rdi, [rdi - 4]
7378 mov eax, [rdi]
7379 sub rdi, rbx
7380# else
7381 lea edi, [edi - 4]
7382 mov eax, [edi]
7383 sub edi, ebx
7384# endif
7385 shl edi, 3
7386 bsf edx, eax
7387 add edx, edi
7388 done:
7389 mov [iBit], edx
7390 }
7391# endif
7392 return iBit;
7393 }
7394 return -1;
7395}
7396#endif
7397
7398
7399/**
7400 * Finds the next set bit in a bitmap.
7401 *
7402 * @returns Index of the next set bit.
7403 * @returns -1 if no set bit was found.
7404 * @param pvBitmap Pointer to the bitmap (little endian).
7405 * @param cBits The number of bits in the bitmap. Multiple of 32.
7406 * @param iBitPrev The bit returned from the last search.
7407 * The search will start at iBitPrev + 1.
7408 */
7409#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7410DECLASM(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7411#else
7412DECLINLINE(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7413{
7414 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7415 int iBit = ++iBitPrev & 31;
7416 if (iBit)
7417 {
7418 /*
7419 * Inspect the 32-bit word containing the unaligned bit.
7420 */
7421 uint32_t u32 = pau32Bitmap[iBitPrev / 32] >> iBit;
7422
7423# if RT_INLINE_ASM_USES_INTRIN
7424 unsigned long ulBit = 0;
7425 if (_BitScanForward(&ulBit, u32))
7426 return ulBit + iBitPrev;
7427# else
7428# if RT_INLINE_ASM_GNU_STYLE
7429 __asm__ __volatile__("bsf %1, %0\n\t"
7430 "jnz 1f\n\t" /** @todo use conditional move for 64-bit? */
7431 "movl $-1, %0\n\t"
7432 "1:\n\t"
7433 : "=r" (iBit)
7434 : "r" (u32)
7435 : "cc");
7436# else
7437 __asm
7438 {
7439 mov edx, [u32]
7440 bsf eax, edx
7441 jnz done
7442 mov eax, 0ffffffffh
7443 done:
7444 mov [iBit], eax
7445 }
7446# endif
7447 if (iBit >= 0)
7448 return iBit + (int)iBitPrev;
7449# endif
7450
7451 /*
7452 * Skip ahead and see if there is anything left to search.
7453 */
7454 iBitPrev |= 31;
7455 iBitPrev++;
7456 if (cBits <= (uint32_t)iBitPrev)
7457 return -1;
7458 }
7459
7460 /*
7461 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7462 */
7463 iBit = ASMBitFirstSet(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7464 if (iBit >= 0)
7465 iBit += iBitPrev;
7466 return iBit;
7467}
7468#endif
7469
7470/** @} */
7471
7472
7473/** @defgroup grp_inline_bits Bitmap Operations
7474 * @{
7475 */
7476
7477/**
7478 * Finds the first bit which is set in the given 32-bit integer.
7479 * Bits are numbered from 1 (least significant) to 32.
7480 *
7481 * @returns index [1..32] of the first set bit.
7482 * @returns 0 if all bits are cleared.
7483 * @param u32 Integer to search for set bits.
7484 * @remarks Similar to ffs() in BSD.
7485 */
7486#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7487RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_PROTO;
7488#else
7489DECLINLINE(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_DEF
7490{
7491# if RT_INLINE_ASM_USES_INTRIN
7492 unsigned long iBit;
7493 if (_BitScanForward(&iBit, u32))
7494 iBit++;
7495 else
7496 iBit = 0;
7497
7498# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7499# if RT_INLINE_ASM_GNU_STYLE
7500 uint32_t iBit;
7501 __asm__ __volatile__("bsf %1, %0\n\t"
7502 "jnz 1f\n\t"
7503 "xorl %0, %0\n\t"
7504 "jmp 2f\n"
7505 "1:\n\t"
7506 "incl %0\n"
7507 "2:\n\t"
7508 : "=r" (iBit)
7509 : "rm" (u32)
7510 : "cc");
7511# else
7512 uint32_t iBit;
7513 _asm
7514 {
7515 bsf eax, [u32]
7516 jnz found
7517 xor eax, eax
7518 jmp done
7519 found:
7520 inc eax
7521 done:
7522 mov [iBit], eax
7523 }
7524# endif
7525
7526# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7527 /*
7528 * Using the "count leading zeros (clz)" instruction here because there
7529 * is no dedicated instruction to get the first set bit.
7530 * Need to reverse the bits in the value with "rbit" first because
7531 * "clz" starts counting from the most significant bit.
7532 */
7533 uint32_t iBit;
7534 __asm__ __volatile__(
7535# if defined(RT_ARCH_ARM64)
7536 "rbit %w[uVal], %w[uVal]\n\t"
7537 "clz %w[iBit], %w[uVal]\n\t"
7538# else
7539 "rbit %[uVal], %[uVal]\n\t"
7540 "clz %[iBit], %[uVal]\n\t"
7541# endif
7542 : [uVal] "=r" (u32)
7543 , [iBit] "=r" (iBit)
7544 : "[uVal]" (u32));
7545 if (iBit != 32)
7546 iBit++;
7547 else
7548 iBit = 0; /* No bit set. */
7549
7550# else
7551# error "Port me"
7552# endif
7553 return iBit;
7554}
7555#endif
7556
7557
7558/**
7559 * Finds the first bit which is set in the given 32-bit integer.
7560 * Bits are numbered from 1 (least significant) to 32.
7561 *
7562 * @returns index [1..32] of the first set bit.
7563 * @returns 0 if all bits are cleared.
7564 * @param i32 Integer to search for set bits.
7565 * @remark Similar to ffs() in BSD.
7566 */
7567DECLINLINE(unsigned) ASMBitFirstSetS32(int32_t i32) RT_NOTHROW_DEF
7568{
7569 return ASMBitFirstSetU32((uint32_t)i32);
7570}
7571
7572
7573/**
7574 * Finds the first bit which is set in the given 64-bit integer.
7575 *
7576 * Bits are numbered from 1 (least significant) to 64.
7577 *
7578 * @returns index [1..64] of the first set bit.
7579 * @returns 0 if all bits are cleared.
7580 * @param u64 Integer to search for set bits.
7581 * @remarks Similar to ffs() in BSD.
7582 */
7583#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7584RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_PROTO;
7585#else
7586DECLINLINE(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_DEF
7587{
7588# if RT_INLINE_ASM_USES_INTRIN
7589 unsigned long iBit;
7590# if ARCH_BITS == 64
7591 if (_BitScanForward64(&iBit, u64))
7592 iBit++;
7593 else
7594 iBit = 0;
7595# else
7596 if (_BitScanForward(&iBit, (uint32_t)u64))
7597 iBit++;
7598 else if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
7599 iBit += 33;
7600 else
7601 iBit = 0;
7602# endif
7603
7604# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7605 uint64_t iBit;
7606 __asm__ __volatile__("bsfq %1, %0\n\t"
7607 "jnz 1f\n\t"
7608 "xorl %k0, %k0\n\t"
7609 "jmp 2f\n"
7610 "1:\n\t"
7611 "incl %k0\n"
7612 "2:\n\t"
7613 : "=r" (iBit)
7614 : "rm" (u64)
7615 : "cc");
7616
7617# elif defined(RT_ARCH_ARM64)
7618 uint64_t iBit;
7619 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
7620 "clz %[iBit], %[uVal]\n\t"
7621 : [uVal] "=r" (u64)
7622 , [iBit] "=r" (iBit)
7623 : "[uVal]" (u64));
7624 if (iBit != 64)
7625 iBit++;
7626 else
7627 iBit = 0; /* No bit set. */
7628
7629# else
7630 unsigned iBit = ASMBitFirstSetU32((uint32_t)u64);
7631 if (!iBit)
7632 {
7633 iBit = ASMBitFirstSetU32((uint32_t)(u64 >> 32));
7634 if (iBit)
7635 iBit += 32;
7636 }
7637# endif
7638 return (unsigned)iBit;
7639}
7640#endif
7641
7642
7643/**
7644 * Finds the first bit which is set in the given 16-bit integer.
7645 *
7646 * Bits are numbered from 1 (least significant) to 16.
7647 *
7648 * @returns index [1..16] of the first set bit.
7649 * @returns 0 if all bits are cleared.
7650 * @param u16 Integer to search for set bits.
7651 * @remarks For 16-bit bs3kit code.
7652 */
7653#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7654RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_PROTO;
7655#else
7656DECLINLINE(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_DEF
7657{
7658 return ASMBitFirstSetU32((uint32_t)u16);
7659}
7660#endif
7661
7662
7663/**
7664 * Finds the last bit which is set in the given 32-bit integer.
7665 * Bits are numbered from 1 (least significant) to 32.
7666 *
7667 * @returns index [1..32] of the last set bit.
7668 * @returns 0 if all bits are cleared.
7669 * @param u32 Integer to search for set bits.
7670 * @remark Similar to fls() in BSD.
7671 */
7672#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7673RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_PROTO;
7674#else
7675DECLINLINE(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_DEF
7676{
7677# if RT_INLINE_ASM_USES_INTRIN
7678 unsigned long iBit;
7679 if (_BitScanReverse(&iBit, u32))
7680 iBit++;
7681 else
7682 iBit = 0;
7683
7684# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7685# if RT_INLINE_ASM_GNU_STYLE
7686 uint32_t iBit;
7687 __asm__ __volatile__("bsrl %1, %0\n\t"
7688 "jnz 1f\n\t"
7689 "xorl %0, %0\n\t"
7690 "jmp 2f\n"
7691 "1:\n\t"
7692 "incl %0\n"
7693 "2:\n\t"
7694 : "=r" (iBit)
7695 : "rm" (u32)
7696 : "cc");
7697# else
7698 uint32_t iBit;
7699 _asm
7700 {
7701 bsr eax, [u32]
7702 jnz found
7703 xor eax, eax
7704 jmp done
7705 found:
7706 inc eax
7707 done:
7708 mov [iBit], eax
7709 }
7710# endif
7711
7712# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7713 uint32_t iBit;
7714 __asm__ __volatile__(
7715# if defined(RT_ARCH_ARM64)
7716 "clz %w[iBit], %w[uVal]\n\t"
7717# else
7718 "clz %[iBit], %[uVal]\n\t"
7719# endif
7720 : [iBit] "=r" (iBit)
7721 : [uVal] "r" (u32));
7722 iBit = 32 - iBit;
7723
7724# else
7725# error "Port me"
7726# endif
7727 return iBit;
7728}
7729#endif
7730
7731
7732/**
7733 * Finds the last bit which is set in the given 32-bit integer.
7734 * Bits are numbered from 1 (least significant) to 32.
7735 *
7736 * @returns index [1..32] of the last set bit.
7737 * @returns 0 if all bits are cleared.
7738 * @param i32 Integer to search for set bits.
7739 * @remark Similar to fls() in BSD.
7740 */
7741DECLINLINE(unsigned) ASMBitLastSetS32(int32_t i32) RT_NOTHROW_DEF
7742{
7743 return ASMBitLastSetU32((uint32_t)i32);
7744}
7745
7746
7747/**
7748 * Finds the last bit which is set in the given 64-bit integer.
7749 *
7750 * Bits are numbered from 1 (least significant) to 64.
7751 *
7752 * @returns index [1..64] of the last set bit.
7753 * @returns 0 if all bits are cleared.
7754 * @param u64 Integer to search for set bits.
7755 * @remark Similar to fls() in BSD.
7756 */
7757#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7758RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_PROTO;
7759#else
7760DECLINLINE(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_DEF
7761{
7762# if RT_INLINE_ASM_USES_INTRIN
7763 unsigned long iBit;
7764# if ARCH_BITS == 64
7765 if (_BitScanReverse64(&iBit, u64))
7766 iBit++;
7767 else
7768 iBit = 0;
7769# else
7770 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
7771 iBit += 33;
7772 else if (_BitScanReverse(&iBit, (uint32_t)u64))
7773 iBit++;
7774 else
7775 iBit = 0;
7776# endif
7777
7778# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7779 uint64_t iBit;
7780 __asm__ __volatile__("bsrq %1, %0\n\t"
7781 "jnz 1f\n\t"
7782 "xorl %k0, %k0\n\t"
7783 "jmp 2f\n"
7784 "1:\n\t"
7785 "incl %k0\n"
7786 "2:\n\t"
7787 : "=r" (iBit)
7788 : "rm" (u64)
7789 : "cc");
7790
7791# elif defined(RT_ARCH_ARM64)
7792 uint64_t iBit;
7793 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
7794 : [iBit] "=r" (iBit)
7795 : [uVal] "r" (u64));
7796 iBit = 64 - iBit;
7797
7798# else
7799 unsigned iBit = ASMBitLastSetU32((uint32_t)(u64 >> 32));
7800 if (iBit)
7801 iBit += 32;
7802 else
7803 iBit = ASMBitLastSetU32((uint32_t)u64);
7804# endif
7805 return (unsigned)iBit;
7806}
7807#endif
7808
7809
7810/**
7811 * Finds the last bit which is set in the given 16-bit integer.
7812 *
7813 * Bits are numbered from 1 (least significant) to 16.
7814 *
7815 * @returns index [1..16] of the last set bit.
7816 * @returns 0 if all bits are cleared.
7817 * @param u16 Integer to search for set bits.
7818 * @remarks For 16-bit bs3kit code.
7819 */
7820#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7821RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_PROTO;
7822#else
7823DECLINLINE(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_DEF
7824{
7825 return ASMBitLastSetU32((uint32_t)u16);
7826}
7827#endif
7828
7829
7830/**
7831 * Count the number of leading zero bits in the given 32-bit integer.
7832 *
7833 * The counting starts with the most significant bit.
7834 *
7835 * @returns Number of most significant zero bits.
7836 * @returns 32 if all bits are cleared.
7837 * @param u32 Integer to consider.
7838 * @remarks Similar to __builtin_clz() in gcc, except defined zero input result.
7839 */
7840#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7841RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
7842#else
7843DECLINLINE(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_DEF
7844{
7845# if RT_INLINE_ASM_USES_INTRIN
7846 unsigned long iBit;
7847 if (!_BitScanReverse(&iBit, u32))
7848 return 32;
7849 return 31 - (unsigned)iBit;
7850
7851# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7852 uint32_t iBit;
7853# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 929 vs 237 ps/call */
7854 __asm__ __volatile__("bsrl %1, %0\n\t"
7855 "cmovzl %2, %0\n\t"
7856 : "=&r" (iBit)
7857 : "rm" (u32)
7858 , "rm" ((int32_t)-1)
7859 : "cc");
7860# elif RT_INLINE_ASM_GNU_STYLE
7861 __asm__ __volatile__("bsr %1, %0\n\t"
7862 "jnz 1f\n\t"
7863 "mov $-1, %0\n\t"
7864 "1:\n\t"
7865 : "=r" (iBit)
7866 : "rm" (u32)
7867 : "cc");
7868# else
7869 _asm
7870 {
7871 bsr eax, [u32]
7872 jnz found
7873 mov eax, -1
7874 found:
7875 mov [iBit], eax
7876 }
7877# endif
7878 return 31 - iBit;
7879
7880# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7881 uint32_t iBit;
7882 __asm__ __volatile__(
7883# if defined(RT_ARCH_ARM64)
7884 "clz %w[iBit], %w[uVal]\n\t"
7885# else
7886 "clz %[iBit], %[uVal]\n\t"
7887# endif
7888 : [uVal] "=r" (u32)
7889 , [iBit] "=r" (iBit)
7890 : "[uVal]" (u32));
7891 return iBit;
7892
7893# elif defined(__GNUC__)
7894 AssertCompile(sizeof(u32) == sizeof(unsigned int));
7895 return u32 ? __builtin_clz(u32) : 32;
7896
7897# else
7898# error "Port me"
7899# endif
7900}
7901#endif
7902
7903
7904/**
7905 * Count the number of leading zero bits in the given 64-bit integer.
7906 *
7907 * The counting starts with the most significant bit.
7908 *
7909 * @returns Number of most significant zero bits.
7910 * @returns 64 if all bits are cleared.
7911 * @param u64 Integer to consider.
7912 * @remarks Similar to __builtin_clzl() in gcc, except defined zero input
7913 * result.
7914 */
7915#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7916RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
7917#else
7918DECLINLINE(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_DEF
7919{
7920# if RT_INLINE_ASM_USES_INTRIN
7921 unsigned long iBit;
7922# if ARCH_BITS == 64
7923 if (_BitScanReverse64(&iBit, u64))
7924 return 63 - (unsigned)iBit;
7925# else
7926 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
7927 return 31 - (unsigned)iBit;
7928 if (_BitScanReverse(&iBit, (uint32_t)u64))
7929 return 63 - (unsigned)iBit;
7930# endif
7931 return 64;
7932
7933# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7934 uint64_t iBit;
7935# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
7936 __asm__ __volatile__("bsrq %1, %0\n\t"
7937 "cmovzq %2, %0\n\t"
7938 : "=&r" (iBit)
7939 : "rm" (u64)
7940 , "rm" ((int64_t)-1)
7941 : "cc");
7942# else /* 10980xe benchmark: 262 ps/call */
7943 __asm__ __volatile__("bsrq %1, %0\n\t"
7944 "jnz 1f\n\t"
7945 "mov $-1, %0\n\t"
7946 "1:\n\t"
7947 : "=&r" (iBit)
7948 : "rm" (u64)
7949 : "cc");
7950# endif
7951 return 63 - (unsigned)iBit;
7952
7953# elif defined(RT_ARCH_ARM64)
7954 uint64_t iBit;
7955 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
7956 : [uVal] "=r" (u64)
7957 , [iBit] "=r" (iBit)
7958 : "[uVal]" (u64));
7959 return (unsigned)iBit;
7960
7961# elif defined(__GNUC__) && ARCH_BITS == 64
7962 AssertCompile(sizeof(u64) == sizeof(unsigned long));
7963 return u64 ? __builtin_clzl(u64) : 64;
7964
7965# else
7966 unsigned iBit = ASMCountLeadingZerosU32((uint32_t)(u64 >> 32));
7967 if (iBit == 32)
7968 iBit = ASMCountLeadingZerosU32((uint32_t)u64) + 32;
7969 return iBit;
7970# endif
7971}
7972#endif
7973
7974
7975/**
7976 * Count the number of leading zero bits in the given 16-bit integer.
7977 *
7978 * The counting starts with the most significant bit.
7979 *
7980 * @returns Number of most significant zero bits.
7981 * @returns 16 if all bits are cleared.
7982 * @param u16 Integer to consider.
7983 */
7984#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7985RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
7986#else
7987DECLINLINE(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_DEF
7988{
7989# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 987 vs 292 ps/call) */
7990 uint16_t iBit;
7991 __asm__ __volatile__("bsrw %1, %0\n\t"
7992 "jnz 1f\n\t"
7993 "mov $-1, %0\n\t"
7994 "1:\n\t"
7995 : "=r" (iBit)
7996 : "rm" (u16)
7997 : "cc");
7998 return 15 - (int16_t)iBit;
7999# else
8000 return ASMCountLeadingZerosU32((uint32_t)u16) - 16;
8001# endif
8002}
8003#endif
8004
8005
8006/**
8007 * Count the number of trailing zero bits in the given 32-bit integer.
8008 *
8009 * The counting starts with the least significant bit, i.e. bit zero.
8010 *
8011 * @returns Number of least significant zero bits.
8012 * @returns 32 if all bits are cleared.
8013 * @param u32 Integer to consider.
8014 * @remarks Similar to __builtin_ctz() in gcc, except defined zero input result.
8015 */
8016#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8017RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8018#else
8019DECLINLINE(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8020{
8021# if RT_INLINE_ASM_USES_INTRIN
8022 unsigned long iBit;
8023 if (!_BitScanForward(&iBit, u32))
8024 return 32;
8025 return (unsigned)iBit;
8026
8027# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8028 uint32_t iBit;
8029# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 932 vs 240 ps/call */
8030 __asm__ __volatile__("bsfl %1, %0\n\t"
8031 "cmovzl %2, %0\n\t"
8032 : "=&r" (iBit)
8033 : "rm" (u32)
8034 , "rm" ((int32_t)32)
8035 : "cc");
8036# elif RT_INLINE_ASM_GNU_STYLE
8037 __asm__ __volatile__("bsfl %1, %0\n\t"
8038 "jnz 1f\n\t"
8039 "mov $32, %0\n\t"
8040 "1:\n\t"
8041 : "=r" (iBit)
8042 : "rm" (u32)
8043 : "cc");
8044# else
8045 _asm
8046 {
8047 bsf eax, [u32]
8048 jnz found
8049 mov eax, 32
8050 found:
8051 mov [iBit], eax
8052 }
8053# endif
8054 return iBit;
8055
8056# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8057 /* Invert the bits and use clz. */
8058 uint32_t iBit;
8059 __asm__ __volatile__(
8060# if defined(RT_ARCH_ARM64)
8061 "rbit %w[uVal], %w[uVal]\n\t"
8062 "clz %w[iBit], %w[uVal]\n\t"
8063# else
8064 "rbit %[uVal], %[uVal]\n\t"
8065 "clz %[iBit], %[uVal]\n\t"
8066# endif
8067 : [uVal] "=r" (u32)
8068 , [iBit] "=r" (iBit)
8069 : "[uVal]" (u32));
8070 return iBit;
8071
8072# elif defined(__GNUC__)
8073 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8074 return u32 ? __builtin_ctz(u32) : 32;
8075
8076# else
8077# error "Port me"
8078# endif
8079}
8080#endif
8081
8082
8083/**
8084 * Count the number of trailing zero bits in the given 64-bit integer.
8085 *
8086 * The counting starts with the least significant bit.
8087 *
8088 * @returns Number of least significant zero bits.
8089 * @returns 64 if all bits are cleared.
8090 * @param u64 Integer to consider.
8091 * @remarks Similar to __builtin_ctzl() in gcc, except defined zero input
8092 * result.
8093 */
8094#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8095RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8096#else
8097DECLINLINE(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8098{
8099# if RT_INLINE_ASM_USES_INTRIN
8100 unsigned long iBit;
8101# if ARCH_BITS == 64
8102 if (_BitScanForward64(&iBit, u64))
8103 return (unsigned)iBit;
8104# else
8105 if (_BitScanForward(&iBit, (uint32_t)u64))
8106 return (unsigned)iBit;
8107 if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8108 return (unsigned)iBit + 32;
8109# endif
8110 return 64;
8111
8112# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8113 uint64_t iBit;
8114# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8115 __asm__ __volatile__("bsfq %1, %0\n\t"
8116 "cmovzq %2, %0\n\t"
8117 : "=&r" (iBit)
8118 : "rm" (u64)
8119 , "rm" ((int64_t)64)
8120 : "cc");
8121# else /* 10980xe benchmark: 262 ps/call */
8122 __asm__ __volatile__("bsfq %1, %0\n\t"
8123 "jnz 1f\n\t"
8124 "mov $64, %0\n\t"
8125 "1:\n\t"
8126 : "=&r" (iBit)
8127 : "rm" (u64)
8128 : "cc");
8129# endif
8130 return (unsigned)iBit;
8131
8132# elif defined(RT_ARCH_ARM64)
8133 /* Invert the bits and use clz. */
8134 uint64_t iBit;
8135 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8136 "clz %[iBit], %[uVal]\n\t"
8137 : [uVal] "=r" (u64)
8138 , [iBit] "=r" (iBit)
8139 : "[uVal]" (u64));
8140 return (unsigned)iBit;
8141
8142# elif defined(__GNUC__) && ARCH_BITS == 64
8143 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8144 return u64 ? __builtin_ctzl(u64) : 64;
8145
8146# else
8147 unsigned iBit = ASMCountTrailingZerosU32((uint32_t)u64);
8148 if (iBit == 32)
8149 iBit = ASMCountTrailingZerosU32((uint32_t)(u64 >> 32)) + 32;
8150 return iBit;
8151# endif
8152}
8153#endif
8154
8155
8156/**
8157 * Count the number of trailing zero bits in the given 16-bit integer.
8158 *
8159 * The counting starts with the least significant bit.
8160 *
8161 * @returns Number of least significant zero bits.
8162 * @returns 16 if all bits are cleared.
8163 * @param u16 Integer to consider.
8164 */
8165#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8166RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8167#else
8168DECLINLINE(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8169{
8170# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 992 vs 349 ps/call) */
8171 uint16_t iBit;
8172 __asm__ __volatile__("bsfw %1, %0\n\t"
8173 "jnz 1f\n\t"
8174 "mov $16, %0\n\t"
8175 "1:\n\t"
8176 : "=r" (iBit)
8177 : "rm" (u16)
8178 : "cc");
8179 return iBit;
8180# else
8181 return ASMCountTrailingZerosU32((uint32_t)u16 | UINT32_C(0x10000));
8182#endif
8183}
8184#endif
8185
8186
8187/**
8188 * Rotate 32-bit unsigned value to the left by @a cShift.
8189 *
8190 * @returns Rotated value.
8191 * @param u32 The value to rotate.
8192 * @param cShift How many bits to rotate by.
8193 */
8194#ifdef __WATCOMC__
8195RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateLeftU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8196#else
8197DECLINLINE(uint32_t) ASMRotateLeftU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8198{
8199# if RT_INLINE_ASM_USES_INTRIN
8200 return _rotl(u32, cShift);
8201
8202# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8203 __asm__ __volatile__("roll %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8204 return u32;
8205
8206# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8207 __asm__ __volatile__(
8208# if defined(RT_ARCH_ARM64)
8209 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8210# else
8211 "ror %[uRet], %[uVal], %[cShift]\n\t"
8212# endif
8213 : [uRet] "=r" (u32)
8214 : [uVal] "[uRet]" (u32)
8215 , [cShift] "r" (32 - (cShift & 31))); /** @todo there is an immediate form here */
8216 return u32;
8217
8218# else
8219 cShift &= 31;
8220 return (u32 << cShift) | (u32 >> (32 - cShift));
8221# endif
8222}
8223#endif
8224
8225
8226/**
8227 * Rotate 32-bit unsigned value to the right by @a cShift.
8228 *
8229 * @returns Rotated value.
8230 * @param u32 The value to rotate.
8231 * @param cShift How many bits to rotate by.
8232 */
8233#ifdef __WATCOMC__
8234RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateRightU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8235#else
8236DECLINLINE(uint32_t) ASMRotateRightU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8237{
8238# if RT_INLINE_ASM_USES_INTRIN
8239 return _rotr(u32, cShift);
8240
8241# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8242 __asm__ __volatile__("rorl %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8243 return u32;
8244
8245# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8246 __asm__ __volatile__(
8247# if defined(RT_ARCH_ARM64)
8248 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8249# else
8250 "ror %[uRet], %[uVal], %[cShift]\n\t"
8251# endif
8252 : [uRet] "=r" (u32)
8253 : [uVal] "[uRet]" (u32)
8254 , [cShift] "r" (cShift & 31)); /** @todo there is an immediate form here */
8255 return u32;
8256
8257# else
8258 cShift &= 31;
8259 return (u32 >> cShift) | (u32 << (32 - cShift));
8260# endif
8261}
8262#endif
8263
8264
8265/**
8266 * Rotate 64-bit unsigned value to the left by @a cShift.
8267 *
8268 * @returns Rotated value.
8269 * @param u64 The value to rotate.
8270 * @param cShift How many bits to rotate by.
8271 */
8272DECLINLINE(uint64_t) ASMRotateLeftU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8273{
8274#if RT_INLINE_ASM_USES_INTRIN
8275 return _rotl64(u64, cShift);
8276
8277#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8278 __asm__ __volatile__("rolq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8279 return u64;
8280
8281#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8282 uint32_t uSpill;
8283 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8284 "jz 1f\n\t"
8285 "xchgl %%eax, %%edx\n\t"
8286 "1:\n\t"
8287 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8288 "jz 2f\n\t"
8289 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8290 "shldl %%cl,%%eax,%%edx\n\t" /* shift the hi value left, feeding MSBits from the low value. */
8291 "shldl %%cl,%2,%%eax\n\t" /* shift the lo value left, feeding MSBits from the saved hi value. */
8292 "2:\n\t" /* } */
8293 : "=A" (u64)
8294 , "=c" (cShift)
8295 , "=r" (uSpill)
8296 : "0" (u64)
8297 , "1" (cShift)
8298 : "cc");
8299 return u64;
8300
8301# elif defined(RT_ARCH_ARM64)
8302 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8303 : [uRet] "=r" (u64)
8304 : [uVal] "[uRet]" (u64)
8305 , [cShift] "r" ((uint64_t)(64 - (cShift & 63)))); /** @todo there is an immediate form here */
8306 return u64;
8307
8308#else
8309 cShift &= 63;
8310 return (u64 << cShift) | (u64 >> (64 - cShift));
8311#endif
8312}
8313
8314
8315/**
8316 * Rotate 64-bit unsigned value to the right by @a cShift.
8317 *
8318 * @returns Rotated value.
8319 * @param u64 The value to rotate.
8320 * @param cShift How many bits to rotate by.
8321 */
8322DECLINLINE(uint64_t) ASMRotateRightU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8323{
8324#if RT_INLINE_ASM_USES_INTRIN
8325 return _rotr64(u64, cShift);
8326
8327#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8328 __asm__ __volatile__("rorq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8329 return u64;
8330
8331#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8332 uint32_t uSpill;
8333 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8334 "jz 1f\n\t"
8335 "xchgl %%eax, %%edx\n\t"
8336 "1:\n\t"
8337 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8338 "jz 2f\n\t"
8339 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8340 "shrdl %%cl,%%eax,%%edx\n\t" /* shift the hi value right, feeding LSBits from the low value. */
8341 "shrdl %%cl,%2,%%eax\n\t" /* shift the lo value right, feeding LSBits from the saved hi value. */
8342 "2:\n\t" /* } */
8343 : "=A" (u64)
8344 , "=c" (cShift)
8345 , "=r" (uSpill)
8346 : "0" (u64)
8347 , "1" (cShift)
8348 : "cc");
8349 return u64;
8350
8351# elif defined(RT_ARCH_ARM64)
8352 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8353 : [uRet] "=r" (u64)
8354 : [uVal] "[uRet]" (u64)
8355 , [cShift] "r" ((uint64_t)(cShift & 63))); /** @todo there is an immediate form here */
8356 return u64;
8357
8358#else
8359 cShift &= 63;
8360 return (u64 >> cShift) | (u64 << (64 - cShift));
8361#endif
8362}
8363
8364/** @} */
8365
8366
8367/** @} */
8368
/*
 * Watcom C/C++ on x86 only: pull in the '#pragma aux' definitions matching
 * the target bitness (16 or 32).
 */
#if defined(__WATCOMC__) && defined(RT_ARCH_X86)
# if ARCH_BITS == 16
#  define IPRT_ASM_WATCOM_X86_16_WITH_PRAGMAS
#  undef IPRT_INCLUDED_asm_watcom_x86_16_h /* presumably that header's include guard - TODO confirm */
#  include "asm-watcom-x86-16.h"
# elif ARCH_BITS == 32
#  define IPRT_ASM_WATCOM_X86_32_WITH_PRAGMAS
#  undef IPRT_INCLUDED_asm_watcom_x86_32_h /* presumably that header's include guard - TODO confirm */
#  include "asm-watcom-x86-32.h"
# endif
#endif
8381
8382#endif /* !IPRT_INCLUDED_asm_h */
8383
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
ContactPrivacy policyTerms of Use