VirtualBox

source: vbox/trunk/include/iprt/asm.h@ 103224

Last change on this file since 103224 was 103082, checked in by vboxsync, 9 months ago

fix doxygen complain

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 277.6 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
45/** @def RT_INLINE_ASM_USES_INTRIN
46 * Defined as 1 if we're using a _MSC_VER 1400.
47 * Otherwise defined as 0.
48 */
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# pragma intrinsic(__cpuid)
60# pragma intrinsic(__stosd)
61# pragma intrinsic(__stosw)
62# pragma intrinsic(__stosb)
63# pragma intrinsic(_BitScanForward)
64# pragma intrinsic(_BitScanReverse)
65# pragma intrinsic(_bittest)
66# pragma intrinsic(_bittestandset)
67# pragma intrinsic(_bittestandreset)
68# pragma intrinsic(_bittestandcomplement)
69# pragma intrinsic(_byteswap_ushort)
70# pragma intrinsic(_byteswap_ulong)
71# pragma intrinsic(_interlockedbittestandset)
72# pragma intrinsic(_interlockedbittestandreset)
73# pragma intrinsic(_InterlockedAnd)
74# pragma intrinsic(_InterlockedOr)
75# pragma intrinsic(_InterlockedXor)
76# pragma intrinsic(_InterlockedIncrement)
77# pragma intrinsic(_InterlockedDecrement)
78# pragma intrinsic(_InterlockedExchange)
79# pragma intrinsic(_InterlockedExchangeAdd)
80# pragma intrinsic(_InterlockedCompareExchange)
81# pragma intrinsic(_InterlockedCompareExchange8)
82# pragma intrinsic(_InterlockedCompareExchange16)
83# pragma intrinsic(_InterlockedCompareExchange64)
84# pragma intrinsic(_rotl)
85# pragma intrinsic(_rotr)
86# pragma intrinsic(_rotl64)
87# pragma intrinsic(_rotr64)
88# ifdef RT_ARCH_AMD64
89# pragma intrinsic(__stosq)
90# pragma intrinsic(_byteswap_uint64)
91# pragma intrinsic(_InterlockedCompareExchange128)
92# pragma intrinsic(_InterlockedExchange64)
93# pragma intrinsic(_InterlockedExchangeAdd64)
94# pragma intrinsic(_InterlockedAnd64)
95# pragma intrinsic(_InterlockedOr64)
96# pragma intrinsic(_InterlockedIncrement64)
97# pragma intrinsic(_InterlockedDecrement64)
98# endif
99#endif
100
101#if (defined(RT_ARCH_ARM64) && defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING)
102/** @def RTASM_ARM64_USE_FEAT_LSE
103 * Use instructions from the FEAT_LSE set to implement atomic operations,
104 * assuming that the host CPU always supports these. */
105# define RTASM_ARM64_USE_FEAT_LSE 1
106/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
107 * Set to use LSE w/o a DMB barrier in most places and rely on the acquire-release
108 * aspects to do the serializing. The assumption is that the tstRTInline
109 * benchmark may be skewing the results by testing an unusual scenario. */
110# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB 1
111#endif
112
113
114/*
115 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
116 */
117#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
118# include "asm-watcom-x86-16.h"
119#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
120# include "asm-watcom-x86-32.h"
121#endif
122
123
124/** @defgroup grp_rt_asm ASM - Assembly Routines
125 * @ingroup grp_rt
126 *
127 * @remarks The difference between ordered and unordered atomic operations is
128 * that the former will complete outstanding reads and writes before
129 * continuing while the latter doesn't make any promises about the
130 * order. Ordered operations don't, it seems, make any 100% promise
131 * wrt whether the operation will complete before any subsequent
132 * memory access. (Please correct if wrong.)
133 *
134 * ASMAtomicSomething operations are all ordered, while
135 * ASMAtomicUoSomething are unordered (note the Uo).
136 *
137 * Please note that ordered operations do not necessarily imply a
138 * compiler (memory) barrier. The user has to use the
139 * ASMCompilerBarrier() macro when that is deemed necessary.
140 *
141 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
142 * to reorder or even optimize assembler instructions away. For
143 * instance, in the following code the second rdmsr instruction is
144 * optimized away because gcc treats that instruction as deterministic:
145 *
146 * @code
147 * static inline uint64_t rdmsr_low(int idx)
148 * {
149 * uint32_t low;
150 * __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
 * return low;
151 * }
152 * ...
153 * uint32_t msr1 = rdmsr_low(1);
154 * foo(msr1);
155 * msr1 = rdmsr_low(1);
156 * bar(msr1);
157 * @endcode
158 *
159 * The input parameter of rdmsr_low is the same for both calls and
160 * therefore gcc will use the result of the first call as input
161 * parameter for bar() as well. For rdmsr this is not acceptable as
162 * this instruction is _not_ deterministic. This applies to reading
163 * machine status information in general.
164 *
165 * @{
166 */
167
168
169/** @def RT_INLINE_ASM_GCC_4_3_X_X86
170 * Used to work around some 4.3.x register allocation issues in this version of
171 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
172 * definitely not for 5.x */
173#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
174# define RT_INLINE_ASM_GCC_4_3_X_X86 1
175#else
176# define RT_INLINE_ASM_GCC_4_3_X_X86 0
177#endif
178
179/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
180 * i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493) screws up
181 * RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release builds. PIC
182 * mode, x86.
183 *
184 * Some gcc 4.3.x versions may have register allocation issues with cmpxchg8b
185 * when in PIC mode on x86.
186 */
187#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
188# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
189# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
190# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
191# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
192# elif ( (defined(PIC) || defined(__PIC__)) \
193 && defined(RT_ARCH_X86) \
194 && ( RT_INLINE_ASM_GCC_4_3_X_X86 \
195 || defined(RT_OS_DARWIN)) )
196# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
197# else
198# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
199# endif
200#endif
201
202
203/*
204 * ARM is great fun.
205 */
206#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
207
208# define RTASM_ARM_NO_BARRIER
209# ifdef RT_ARCH_ARM64
210# define RTASM_ARM_NO_BARRIER_IN_REG
211# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
212# define RTASM_ARM_DSB_SY "dsb sy\n\t"
213# define RTASM_ARM_DSB_SY_IN_REG
214# define RTASM_ARM_DSB_SY_COMMA_IN_REG
215# define RTASM_ARM_DMB_SY "dmb sy\n\t"
216# define RTASM_ARM_DMB_SY_IN_REG
217# define RTASM_ARM_DMB_SY_COMMA_IN_REG
218# define RTASM_ARM_DMB_ST "dmb st\n\t"
219# define RTASM_ARM_DMB_ST_IN_REG
220# define RTASM_ARM_DMB_ST_COMMA_IN_REG
221# define RTASM_ARM_DMB_LD "dmb ld\n\t"
222# define RTASM_ARM_DMB_LD_IN_REG
223# define RTASM_ARM_DMB_LD_COMMA_IN_REG
224# define RTASM_ARM_PICK_6432(expr64, expr32) expr64
/*
 * ARM64 load / modify / store retry-loop helpers.
 *
 * Each macro declares its working variables in the invoking scope (rcSpill
 * plus u32NewRet, u32OldRet + u32NewSpill, u64NewRet or u64OldRet +
 * u64NewSpill) and then expands to a single __asm__ statement WITHOUT a
 * trailing semicolon.  The loop is: label, barrier, ldaxr, the caller's
 * modify64 instructions, stlxr, and a cbnz back to the label while the
 * status in rc is non-zero.
 *
 *  - name          Pasted into the local retry label.
 *  - a_pu32Mem /
 *    a_pu64Mem     Pointer to the memory operand (used as "+Q").
 *  - barrier_type  Suffix appended to RTASM_ARM_ to select the barrier
 *                  string emitted right after the label.
 *  - modify64      Instruction string(s) placed between load and store.
 *  - modify32      Not used by these ARM64 variants.
 *  - in_reg        The input operand list of the asm statement.
 */
225# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
226 uint32_t rcSpill; \
227 uint32_t u32NewRet; \
228 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
229 RTASM_ARM_##barrier_type /* before label? */ \
230 "ldaxr %w[uNew], %[pMem]\n\t" \
231 modify64 \
232 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
233 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
234 : [pMem] "+Q" (*a_pu32Mem) \
235 , [uNew] "=&r" (u32NewRet) \
236 , [rc] "=&r" (rcSpill) \
237 : in_reg \
238 : "cc")
239# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
240 uint32_t rcSpill; \
241 uint32_t u32OldRet; \
242 uint32_t u32NewSpill; \
243 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
244 RTASM_ARM_##barrier_type /* before label? */ \
245 "ldaxr %w[uOld], %[pMem]\n\t" \
246 modify64 \
247 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
248 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
249 : [pMem] "+Q" (*a_pu32Mem) \
250 , [uOld] "=&r" (u32OldRet) \
251 , [uNew] "=&r" (u32NewSpill) \
252 , [rc] "=&r" (rcSpill) \
253 : in_reg \
254 : "cc")
255# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
256 uint32_t rcSpill; \
257 uint64_t u64NewRet; \
258 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
259 RTASM_ARM_##barrier_type /* before label? */ \
260 "ldaxr %[uNew], %[pMem]\n\t" \
261 modify64 \
262 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
263 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
264 : [pMem] "+Q" (*a_pu64Mem) \
265 , [uNew] "=&r" (u64NewRet) \
266 , [rc] "=&r" (rcSpill) \
267 : in_reg \
268 : "cc")
269# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
270 uint32_t rcSpill; \
271 uint64_t u64OldRet; \
272 uint64_t u64NewSpill; \
273 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
274 RTASM_ARM_##barrier_type /* before label? */ \
275 "ldaxr %[uOld], %[pMem]\n\t" \
276 modify64 \
277 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
278 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
279 : [pMem] "+Q" (*a_pu64Mem) \
280 , [uOld] "=&r" (u64OldRet) \
281 , [uNew] "=&r" (u64NewSpill) \
282 , [rc] "=&r" (rcSpill) \
283 : in_reg \
284 : "cc")
285
286# else /* RT_ARCH_ARM32 */
287# define RTASM_ARM_PICK_6432(expr64, expr32) expr32
288# if RT_ARCH_ARM32 >= 7
289# warning armv7
290# define RTASM_ARM_NO_BARRIER_IN_REG
291# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
292# define RTASM_ARM_DSB_SY "dsb sy\n\t"
293# define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
294# define RTASM_ARM_DMB_SY "dmb sy\n\t"
295# define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
296# define RTASM_ARM_DMB_ST "dmb st\n\t"
297# define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
298# define RTASM_ARM_DMB_LD "dmb ld\n\t"
299# define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)
300
301# elif RT_ARCH_ARM32 >= 6
302# warning armv6
303# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
304# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
305# define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
306# define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
307# define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
308# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
309# define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
310# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG
311
312# elif RT_ARCH_ARM32 >= 4
313# warning armv5 or older
314# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
315# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
316# define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
317# define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
318# define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
319# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
320# define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
321# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
322# else
323# error "huh? Odd RT_ARCH_ARM32 value!"
324# endif
325# define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
326# define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
327# define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
328# define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
/*
 * ARM32 load / modify / store retry-loop helpers.
 *
 * Same parameter list as the ARM64 flavour, but built on ldrex/strex
 * (ldrexd/strexd for 64-bit), with an explicit cmp + bne for the retry.
 * Each macro declares its working variables in the invoking scope and
 * expands to a single __asm__ statement WITHOUT a trailing semicolon.
 *
 *  - name          Pasted into the local retry label.
 *  - a_pu32Mem /
 *    a_pu64Mem     Pointer to the memory operand (used as "+m").
 *  - barrier_type  Concatenated onto RTASM_ARM_ to select the barrier
 *                  string; its matching _IN_REG definition is placed first
 *                  in the input operand list.
 *  - modify64      Not used by these ARM32 variants.
 *  - modify32      Instruction string(s) placed between load and store.
 *  - in_reg        Further input operand(s), appended after the barrier's
 *                  _IN_REG operand.
 */
329# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
330 uint32_t rcSpill; \
331 uint32_t u32NewRet; \
332 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
333 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
334 "ldrex %[uNew], %[pMem]\n\t" \
335 modify32 \
336 "strex %[rc], %[uNew], %[pMem]\n\t" \
337 "cmp %[rc], #0\n\t" \
338 "bne Ltry_again_" #name "_%=\n\t" \
339 : [pMem] "+m" (*a_pu32Mem) \
340 , [uNew] "=&r" (u32NewRet) \
341 , [rc] "=&r" (rcSpill) \
342 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
343 , in_reg \
344 : "cc")
345# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
346 uint32_t rcSpill; \
347 uint32_t u32OldRet; \
348 uint32_t u32NewSpill; \
349 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
350 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
351 "ldrex %[uOld], %[pMem]\n\t" \
352 modify32 \
353 "strex %[rc], %[uNew], %[pMem]\n\t" \
354 "cmp %[rc], #0\n\t" \
355 "bne Ltry_again_" #name "_%=\n\t" \
356 : [pMem] "+m" (*a_pu32Mem) \
357 , [uOld] "=&r" (u32OldRet) \
358 , [uNew] "=&r" (u32NewSpill) \
359 , [rc] "=&r" (rcSpill) \
360 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
361 , in_reg \
362 : "cc")
363# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
364 uint32_t rcSpill; \
365 uint64_t u64NewRet; \
366 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
367 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
368 "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
369 modify32 \
370 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
371 "cmp %[rc], #0\n\t" \
372 "bne Ltry_again_" #name "_%=\n\t" \
373 : [pMem] "+m" (*a_pu64Mem), \
374 [uNew] "=&r" (u64NewRet), \
375 [rc] "=&r" (rcSpill) \
376 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
377 , in_reg \
378 : "cc")
379# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
380 uint32_t rcSpill; \
381 uint64_t u64OldRet; \
382 uint64_t u64NewSpill; \
383 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
384 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
385 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
386 modify32 \
387 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
388 "cmp %[rc], #0\n\t" \
389 "bne Ltry_again_" #name "_%=\n\t" \
390 : [pMem] "+m" (*a_pu64Mem), \
391 [uOld] "=&r" (u64OldRet), \
392 [uNew] "=&r" (u64NewSpill), \
393 [rc] "=&r" (rcSpill) \
394 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
395 , in_reg \
396 : "cc")
397# endif /* RT_ARCH_ARM32 */
398#endif
399
400
401/** @def ASMReturnAddress
402 * Gets the return address of the current (or calling if you like) function or method.
403 */
404#ifdef _MSC_VER
405# ifdef __cplusplus
406extern "C"
407# endif
408void * _ReturnAddress(void);
409# pragma intrinsic(_ReturnAddress)
410# define ASMReturnAddress() _ReturnAddress()
411#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
412# define ASMReturnAddress() __builtin_return_address(0)
413#elif defined(__WATCOMC__)
414# define ASMReturnAddress() Watcom_does_not_appear_to_have_intrinsic_return_address_function()
415#else
416# error "Unsupported compiler."
417#endif
418
419
420/**
421 * Compiler memory barrier.
422 *
423 * Ensure that the compiler does not use any cached (register/tmp stack) memory
424 * values or any outstanding writes when returning from this function.
425 *
426 * This function must be used if non-volatile data is modified by a
427 * device or the VMM. Typical cases are port access, MMIO access,
428 * trapping instruction, etc.
429 */
430#if RT_INLINE_ASM_GNU_STYLE
431# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
432#elif RT_INLINE_ASM_USES_INTRIN
433# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
434#elif defined(__WATCOMC__)
435void ASMCompilerBarrier(void);
436#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
437DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
438{
439 __asm
440 {
441 }
442}
443#endif
444
445
446/** @def ASMBreakpoint
447 * Debugger Breakpoint.
448 * @deprecated Use RT_BREAKPOINT instead.
449 * @internal
450 */
451#define ASMBreakpoint() RT_BREAKPOINT()
452
453
454/**
455 * Spinloop hint for platforms that have these, empty function on the other
456 * platforms.
457 *
458 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detect
459 * spin locks.
 *
 * ARM32 & ARM64: The YIELD instruction.
460 */
461#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
/* Prototype only; the implementation is not part of this inline definition. */
462RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
463#else
464DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
465{
466# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* The instruction is emitted as the raw byte pair F3 90 in both asm dialects. */
467# if RT_INLINE_ASM_GNU_STYLE
468 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
469# else
470 __asm {
471 _emit 0f3h
472 _emit 090h
473 }
474# endif
475
476# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
477 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
478
479# else
480 /* dummy - no hint instruction is emitted for other architectures. */
481# endif
482}
483#endif
484
485
486/**
487 * Atomically Exchange an unsigned 8-bit value, ordered.
488 *
489 * @returns The value *pu8 held before the exchange.
490 * @param pu8 Pointer to the 8-bit variable to update.
491 * @param u8 The 8-bit value to assign to *pu8.
492 */
493#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
494RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
495#else
496DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
497{
498# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* x86/AMD64: XCHG swaps u8 with *pu8, so afterwards u8 holds the previous
   memory value, which is what gets returned. */
499# if RT_INLINE_ASM_GNU_STYLE
500 __asm__ __volatile__("xchgb %0, %1\n\t"
501 : "=m" (*pu8)
502 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
503 : "1" (u8)
504 , "m" (*pu8));
505# else
506 __asm
507 {
508# ifdef RT_ARCH_AMD64
509 mov rdx, [pu8]
510 mov al, [u8]
511 xchg [rdx], al
512 mov [u8], al
513# else
514 mov edx, [pu8]
515 mov al, [u8]
516 xchg [edx], al
517 mov [u8], al
518# endif
519 }
520# endif
521 return u8;
522
523# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
/* ARM: uOld receives the previous memory value, either from a single SWP
   (FEAT_LSE) or from an exclusive load/store loop that retries while the
   store status in rc is non-zero. */
524 uint32_t uOld;
525# if defined(RTASM_ARM64_USE_FEAT_LSE)
526 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
527 have the barrier we shouldn't need that, right? Ordering should be taken
528 care of by the DMB. The SWPB is rather cheap (~70% faster). */
529 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
530# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
531 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
532# else
533 RTASM_ARM_DMB_SY
534 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
535# endif
536 : [pMem] "+Q" (*pu8)
537 , [uOld] "=&r" (uOld)
538 : [uNew] "r" ((uint32_t)u8)
539 : );
540# else
541 uint32_t rcSpill;
542 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
543 RTASM_ARM_DMB_SY
544# if defined(RT_ARCH_ARM64)
545 "ldaxrb %w[uOld], %[pMem]\n\t"
546 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
547 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
548# else
549 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
550 "strexb %[rc], %[uNew], %[pMem]\n\t"
551 "cmp %[rc], #0\n\t"
552 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
553# endif
554 : [pMem] "+Q" (*pu8)
555 , [uOld] "=&r" (uOld)
556 , [rc] "=&r" (rcSpill)
557 : [uNew] "r" ((uint32_t)u8)
558 RTASM_ARM_DMB_SY_COMMA_IN_REG
559 : "cc");
560# endif
561 return (uint8_t)uOld;
562
563# else
564# error "Port me"
565# endif
566}
567#endif
568
569
570/**
571 * Atomically Exchange a signed 8-bit value, ordered.
572 *
573 * @returns Current *pu8 value
574 * @param pi8 Pointer to the 8-bit variable to update.
575 * @param i8 The 8-bit value to assign to *pi8.
576 */
577DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
578{
579 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
580}
581
582
583/**
584 * Atomically Exchange a bool value, ordered.
585 *
586 * @returns Current *pf value
587 * @param pf Pointer to the 8-bit variable to update.
588 * @param f The 8-bit value to assign to *pi8.
589 */
590DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
591{
592#ifdef _MSC_VER
593 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
594#else
595 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
596#endif
597}
598
599
600/**
601 * Atomically Exchange an unsigned 16-bit value, ordered.
602 *
603 * @returns The value *pu16 held before the exchange.
604 * @param pu16 Pointer to the 16-bit variable to update.
605 * @param u16 The 16-bit value to assign to *pu16.
606 */
607#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
608RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
609#else
610DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
611{
612# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* x86/AMD64: XCHG swaps u16 with *pu16, so afterwards u16 holds the previous
   memory value, which is what gets returned. */
613# if RT_INLINE_ASM_GNU_STYLE
614 __asm__ __volatile__("xchgw %0, %1\n\t"
615 : "=m" (*pu16)
616 , "=r" (u16)
617 : "1" (u16)
618 , "m" (*pu16));
619# else
620 __asm
621 {
622# ifdef RT_ARCH_AMD64
623 mov rdx, [pu16]
624 mov ax, [u16]
625 xchg [rdx], ax
626 mov [u16], ax
627# else
628 mov edx, [pu16]
629 mov ax, [u16]
630 xchg [edx], ax
631 mov [u16], ax
632# endif
633 }
634# endif
635 return u16;
636
637# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
/* ARM: uOld receives the previous memory value, either from a single SWP
   (FEAT_LSE) or from an exclusive load/store loop that retries while the
   store status in rc is non-zero. */
638 uint32_t uOld;
639# if defined(RTASM_ARM64_USE_FEAT_LSE)
640 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
641 slower if we remove the barrier. But since we have the barrier we
642 shouldn't need that, right? Ordering should be taken care of by the DMB.
643 The SWPH is rather cheap (~70% faster). */
644 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
645# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
646 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
647# else
648 RTASM_ARM_DMB_SY
649 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
650# endif
651 : [pMem] "+Q" (*pu16)
652 , [uOld] "=&r" (uOld)
653 : [uNew] "r" ((uint32_t)u16)
654 : );
655# else
656 uint32_t rcSpill;
657 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
658 RTASM_ARM_DMB_SY
659# if defined(RT_ARCH_ARM64)
660 "ldaxrh %w[uOld], %[pMem]\n\t"
661 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
662 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
663# else
664 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
665 "strexh %[rc], %[uNew], %[pMem]\n\t"
666 "cmp %[rc], #0\n\t"
667 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
668# endif
669 : [pMem] "+Q" (*pu16)
670 , [uOld] "=&r" (uOld)
671 , [rc] "=&r" (rcSpill)
672 : [uNew] "r" ((uint32_t)u16)
673 RTASM_ARM_DMB_SY_COMMA_IN_REG
674 : "cc");
675# endif
676 return (uint16_t)uOld;
677
678# else
679# error "Port me"
680# endif
681}
682#endif
683
684
685/**
686 * Atomically Exchange a signed 16-bit value, ordered.
687 *
688 * @returns Current *pu16 value
689 * @param pi16 Pointer to the 16-bit variable to update.
690 * @param i16 The 16-bit value to assign to *pi16.
691 */
692DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
693{
694 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
695}
696
697
698/**
699 * Atomically Exchange an unsigned 32-bit value, ordered.
700 *
701 * @returns The value *pu32 held before the exchange.
702 * @param pu32 Pointer to the 32-bit variable to update.
703 * @param u32 The 32-bit value to assign to *pu32.
704 *
705 * @remarks Does not work on 286 and earlier.
706 */
707#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
708RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
709#else
710DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
711{
712# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* x86/AMD64: all three variants (GNU asm, _InterlockedExchange intrinsic,
   MSC-style asm) leave the previous memory value in u32, which is returned. */
713# if RT_INLINE_ASM_GNU_STYLE
714 __asm__ __volatile__("xchgl %0, %1\n\t"
715 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
716 , "=r" (u32)
717 : "1" (u32)
718 , "m" (*pu32));
719
720# elif RT_INLINE_ASM_USES_INTRIN
721 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
722
723# else
724 __asm
725 {
726# ifdef RT_ARCH_AMD64
727 mov rdx, [pu32]
728 mov eax, u32
729 xchg [rdx], eax
730 mov [u32], eax
731# else
732 mov edx, [pu32]
733 mov eax, u32
734 xchg [edx], eax
735 mov [u32], eax
736# endif
737 }
738# endif
739 return u32;
740
741# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
/* ARM: uOld receives the previous memory value, either from a single SWP
   (FEAT_LSE) or from an exclusive load/store loop that retries while the
   store status in rc is non-zero. */
742 uint32_t uOld;
743# if defined(RTASM_ARM64_USE_FEAT_LSE)
744 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
745 slower if we remove the barrier. But since we have the barrier we
746 shouldn't need that, right? Ordering should be taken care of by the DMB.
747 The SWP is rather cheap (~70% faster). */
748 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
749# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
750 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
751# else
752 RTASM_ARM_DMB_SY
753 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
754# endif
755 : [pMem] "+Q" (*pu32)
756 , [uOld] "=&r" (uOld)
757 : [uNew] "r" (u32)
758 : );
759# else
760 uint32_t rcSpill;
761 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
762 RTASM_ARM_DMB_SY
763# if defined(RT_ARCH_ARM64)
764 "ldaxr %w[uOld], %[pMem]\n\t"
765 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
766 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
767# else
768 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
769 "strex %[rc], %[uNew], %[pMem]\n\t"
770 "cmp %[rc], #0\n\t"
771 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
772# endif
773 : [pMem] "+Q" (*pu32)
774 , [uOld] "=&r" (uOld)
775 , [rc] "=&r" (rcSpill)
776 : [uNew] "r" (u32)
777 RTASM_ARM_DMB_SY_COMMA_IN_REG
778 : "cc");
779# endif
780 return uOld;
781
782# else
783# error "Port me"
784# endif
785}
786#endif
787
788
789/**
790 * Atomically Exchange a signed 32-bit value, ordered.
791 *
792 * @returns Current *pu32 value
793 * @param pi32 Pointer to the 32-bit variable to update.
794 * @param i32 The 32-bit value to assign to *pi32.
795 */
796DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
797{
798 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
799}
800
801
802/**
803 * Atomically Exchange an unsigned 64-bit value, ordered.
804 *
805 * @returns The value *pu64 held before the exchange.
806 * @param pu64 Pointer to the 64-bit variable to update.
807 * @param u64 The 64-bit value to assign to *pu64.
808 *
809 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
810 */
811#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
812 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
813RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
814#else
815DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
816{
817# if defined(RT_ARCH_AMD64)
/* AMD64: a single 64-bit exchange (intrinsic or XCHG); the previous memory
   value is returned. */
818# if RT_INLINE_ASM_USES_INTRIN
819 return _InterlockedExchange64((__int64 *)pu64, u64);
820
821# elif RT_INLINE_ASM_GNU_STYLE
822 __asm__ __volatile__("xchgq %0, %1\n\t"
823 : "=m" (*pu64)
824 , "=r" (u64)
825 : "1" (u64)
826 , "m" (*pu64));
827 return u64;
828# else
829 __asm
830 {
831 mov rdx, [pu64]
832 mov rax, [u64]
833 xchg [rdx], rax
834 mov [u64], rax
835 }
836 return u64;
837# endif
838
839# elif defined(RT_ARCH_X86)
/* 32-bit x86: implemented as a LOCK CMPXCHG8B loop that is repeated (jnz)
   until the compare-exchange succeeds.  EDX:EAX starts out as a read of
   *pu64 and ends up as the value returned in u64; ECX:EBX carries the new
   value. */
840# if RT_INLINE_ASM_GNU_STYLE
841# if defined(PIC) || defined(__PIC__)
/* PIC: EBX is not passed as an operand here (presumably because it is
   reserved as the PIC register - confirm).  The low half of the new value is
   parked in u32EBX and swapped into EBX around the loop; the final movl puts
   the original EBX back. */
842 uint32_t u32EBX = (uint32_t)u64;
843 __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
844 "xchgl %%ebx, %3\n\t"
845 "1:\n\t"
846 "lock; cmpxchg8b (%5)\n\t"
847 "jnz 1b\n\t"
848 "movl %3, %%ebx\n\t"
849 /*"xchgl %%esi, %5\n\t"*/
850 : "=A" (u64)
851 , "=m" (*pu64)
852 : "0" (*pu64)
853 , "m" ( u32EBX )
854 , "c" ( (uint32_t)(u64 >> 32) )
855 , "S" (pu64)
856 : "cc");
857# else /* !PIC */
858 __asm__ __volatile__("1:\n\t"
859 "lock; cmpxchg8b %1\n\t"
860 "jnz 1b\n\t"
861 : "=A" (u64)
862 , "=m" (*pu64)
863 : "0" (*pu64)
864 , "b" ( (uint32_t)u64 )
865 , "c" ( (uint32_t)(u64 >> 32) )
866 : "cc");
867# endif
868# else
869 __asm
870 {
871 mov ebx, dword ptr [u64]
872 mov ecx, dword ptr [u64 + 4]
873 mov edi, pu64
874 mov eax, dword ptr [edi]
875 mov edx, dword ptr [edi + 4]
876 retry:
877 lock cmpxchg8b [edi]
878 jnz retry
879 mov dword ptr [u64], eax
880 mov dword ptr [u64 + 4], edx
881 }
882# endif
883 return u64;
884
885# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
/* ARM: uOld receives the previous memory value, either from a single SWP
   (FEAT_LSE) or from an exclusive load/store loop that retries while the
   store status in rc is non-zero. */
886 uint64_t uOld;
887# if defined(RTASM_ARM64_USE_FEAT_LSE)
888 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
889 slower if we remove the barrier. But since we have the barrier we
890 shouldn't need that, right? Ordering should be taken care of by the DMB.
891 The SWP is rather cheap (~70% faster). */
892 __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
893# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
894 "swpal %[uNew], %[uOld], %[pMem]\n\t"
895# else
896 RTASM_ARM_DMB_SY
897 "swp %[uNew], %[uOld], %[pMem]\n\t"
898# endif
899 : [pMem] "+Q" (*pu64)
900 , [uOld] "=&r" (uOld)
901 : [uNew] "r" (u64)
902 : );
903# else
904 uint32_t rcSpill;
905 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
906 RTASM_ARM_DMB_SY
907# if defined(RT_ARCH_ARM64)
908 "ldaxr %[uOld], %[pMem]\n\t"
909 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
910 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
911# else
912 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
913 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
914 "cmp %[rc], #0\n\t"
915 "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
916# endif
917 : [pMem] "+Q" (*pu64)
918 , [uOld] "=&r" (uOld)
919 , [rc] "=&r" (rcSpill)
920 : [uNew] "r" (u64)
921 RTASM_ARM_DMB_SY_COMMA_IN_REG
922 : "cc");
923# endif
924 return uOld;
925
926# else
927# error "Port me"
928# endif
929}
930#endif
931
932
933/**
934 * Atomically Exchange an signed 64-bit value, ordered.
935 *
936 * @returns Current *pi64 value
937 * @param pi64 Pointer to the 64-bit variable to update.
938 * @param i64 The 64-bit value to assign to *pi64.
939 */
940DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
941{
942 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
943}
944
945
946/**
947 * Atomically Exchange a size_t value, ordered.
948 *
949 * @returns Current *ppv value
950 * @param puDst Pointer to the size_t variable to update.
951 * @param uNew The new value to assign to *puDst.
952 */
953DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
954{
955#if ARCH_BITS == 16
956 AssertCompile(sizeof(size_t) == 2);
957 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
958#elif ARCH_BITS == 32
959 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
960#elif ARCH_BITS == 64
961 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
962#else
963# error "ARCH_BITS is bogus"
964#endif
965}
966
967
968/**
969 * Atomically Exchange a pointer value, ordered.
970 *
971 * @returns Current *ppv value
972 * @param ppv Pointer to the pointer variable to update.
973 * @param pv The pointer value to assign to *ppv.
974 */
975DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
976{
977#if ARCH_BITS == 32 || ARCH_BITS == 16
978 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
979#elif ARCH_BITS == 64
980 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
981#else
982# error "ARCH_BITS is bogus"
983#endif
984}
985
986
/**
 * Typed front-end for ASMAtomicXchgPtr that spares the caller the casts.
 *
 * The GNU C variant assigns the arguments to typed temporaries first, so the
 * compiler gets a chance to check them; the generic variant only casts.
 *
 * @returns Current *ppv value
 * @param   ppv     Pointer to the pointer variable to update.
 * @param   pv      The pointer value to assign to *ppv.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type const                          pvTypeChecked  = (pv); \
        Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, \
                                                                      (void *)pvTypeChecked); \
        pvTypeCheckedRet; \
     })
#else
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    (Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv))
#endif
1008
1009
1010/**
1011 * Atomically Exchange a raw-mode context pointer value, ordered.
1012 *
1013 * @returns Current *ppv value
1014 * @param ppvRC Pointer to the pointer variable to update.
1015 * @param pvRC The pointer value to assign to *ppv.
1016 */
1017DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1018{
1019 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1020}
1021
1022
1023/**
1024 * Atomically Exchange a ring-0 pointer value, ordered.
1025 *
1026 * @returns Current *ppv value
1027 * @param ppvR0 Pointer to the pointer variable to update.
1028 * @param pvR0 The pointer value to assign to *ppv.
1029 */
1030DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1031{
1032#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1033 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1034#elif R0_ARCH_BITS == 64
1035 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1036#else
1037# error "R0_ARCH_BITS is bogus"
1038#endif
1039}
1040
1041
1042/**
1043 * Atomically Exchange a ring-3 pointer value, ordered.
1044 *
1045 * @returns Current *ppv value
1046 * @param ppvR3 Pointer to the pointer variable to update.
1047 * @param pvR3 The pointer value to assign to *ppv.
1048 */
1049DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1050{
1051#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1052 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1053#elif R3_ARCH_BITS == 64
1054 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1055#else
1056# error "R3_ARCH_BITS is bogus"
1057#endif
1058}
1059
1060
1061/** @def ASMAtomicXchgHandle
1062 * Atomically Exchange a typical IPRT handle value, ordered.
1063 *
1064 * @param ph Pointer to the value to update.
1065 * @param hNew The new value to assigned to *pu.
1066 * @param phRes Where to store the current *ph value.
1067 *
1068 * @remarks This doesn't currently work for all handles (like RTFILE).
1069 */
1070#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1071# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1072 do { \
1073 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1074 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
1075 *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
1076 } while (0)
1077#elif HC_ARCH_BITS == 64
1078# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1079 do { \
1080 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1081 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
1082 *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
1083 } while (0)
1084#else
1085# error HC_ARCH_BITS
1086#endif
1087
1088
/**
 * Atomically Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the 8/16/32/64-bit exchange worker and
 * discards the worker's return value.  Any other size triggers an assertion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @todo    This is busted as its missing the result argument.
 */
#define ASMAtomicXchgSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew));  break; \
            case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast it so the argument matches the %u conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSize: size %u is not supported\n", (unsigned)sizeof(*(pu)))); \
                break; \
        } \
    } while (0)
1107
/**
 * Atomically Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the 8/16/32/64-bit exchange worker and
 * stores the worker's return value through @a puRes using the same width.
 * Any other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @param   puRes   Where to store the current *pu value.
 */
#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew));  break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast it so the argument matches the %u conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSizeCorrect: size %u is not supported\n", (unsigned)sizeof(*(pu)))); \
                break; \
        } \
    } while (0)
1126
1127
1128
1129/**
1130 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
1131 *
1132 * @returns true if xchg was done.
1133 * @returns false if xchg wasn't done.
1134 *
1135 * @param pu8 Pointer to the value to update.
1136 * @param u8New The new value to assigned to *pu8.
1137 * @param u8Old The old value to *pu8 compare with.
1138 *
1139 * @remarks x86: Requires a 486 or later.
1140 * @todo Rename ASMAtomicCmpWriteU8
1141 */
1142#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || !RT_INLINE_ASM_GNU_STYLE
1143RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
1144#else
1145DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
1146{
1147# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1148 uint8_t u8Ret;
1149 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1150 "setz %1\n\t"
1151 : "=m" (*pu8)
1152 , "=qm" (u8Ret)
1153 , "=a" (u8Old)
1154 : "q" (u8New)
1155 , "2" (u8Old)
1156 , "m" (*pu8)
1157 : "cc");
1158 return (bool)u8Ret;
1159
1160# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1161 union { uint32_t u; bool f; } fXchg;
1162 uint32_t u32Spill;
1163# if defined(RTASM_ARM64_USE_FEAT_LSE)
1164 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
1165# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
1166 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1167# else
1168 RTASM_ARM_DMB_SY
1169 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1170# endif
1171 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1172 "cset %w[fXchg], eq\n\t"
1173 : [pMem] "+Q" (*pu8)
1174 , [uOldActual] "=&r" (u32Spill)
1175 , [fXchg] "=&r" (fXchg.u)
1176 : [uNew] "r" ((uint32_t)u8New)
1177 , [uOldOrg] "r" ((uint32_t)u8Old)
1178 , "[uOldActual]" ((uint32_t)u8Old)
1179 : "cc");
1180# else
1181 uint32_t rcSpill;
1182 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
1183 RTASM_ARM_DMB_SY
1184# if defined(RT_ARCH_ARM64)
1185 "ldaxrb %w[uOld], %[pMem]\n\t"
1186 "cmp %w[uOld], %w[uCmp]\n\t"
1187 "bne 1f\n\t" /* stop here if not equal */
1188 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1189 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1190 "mov %w[fXchg], #1\n\t"
1191 "1:\n\t"
1192 "clrex\n\t"
1193# else
1194 "ldrexb %[uOld], %[pMem]\n\t"
1195 "teq %[uOld], %[uCmp]\n\t"
1196 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1197 "bne 1f\n\t" /* stop here if not equal */
1198 "cmp %[rc], #0\n\t"
1199 "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1200 "mov %[fXchg], #1\n\t"
1201 "1:\n\t"
1202 /** @todo clrexne on armv7? */
1203# endif
1204 : [pMem] "+Q" (*pu8)
1205 , [uOld] "=&r" (u32Spill)
1206 , [rc] "=&r" (rcSpill)
1207 , [fXchg] "=&r" (fXchg.u)
1208 : [uCmp] "r" ((uint32_t)u8Old)
1209 , [uNew] "r" ((uint32_t)u8New)
1210 , "[fXchg]" (0)
1211 RTASM_ARM_DMB_SY_COMMA_IN_REG
1212 : "cc");
1213# endif
1214 return fXchg.f;
1215
1216# else
1217# error "Port me"
1218# endif
1219}
1220#endif
1221
1222
1223/**
1224 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1225 *
1226 * @returns true if xchg was done.
1227 * @returns false if xchg wasn't done.
1228 *
1229 * @param pi8 Pointer to the value to update.
1230 * @param i8New The new value to assigned to *pi8.
1231 * @param i8Old The old value to *pi8 compare with.
1232 *
1233 * @remarks x86: Requires a 486 or later.
1234 * @todo Rename ASMAtomicCmpWriteS8
1235 */
1236DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1237{
1238 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1239}
1240
1241
1242/**
1243 * Atomically Compare and Exchange a bool value, ordered.
1244 *
1245 * @returns true if xchg was done.
1246 * @returns false if xchg wasn't done.
1247 *
1248 * @param pf Pointer to the value to update.
1249 * @param fNew The new value to assigned to *pf.
1250 * @param fOld The old value to *pf compare with.
1251 *
1252 * @remarks x86: Requires a 486 or later.
1253 * @todo Rename ASMAtomicCmpWriteBool
1254 */
1255DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1256{
1257 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1258}
1259
1260
1261/**
1262 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
1263 *
1264 * @returns true if xchg was done.
1265 * @returns false if xchg wasn't done.
1266 *
1267 * @param pu32 Pointer to the value to update.
1268 * @param u32New The new value to assigned to *pu32.
1269 * @param u32Old The old value to *pu32 compare with.
1270 *
1271 * @remarks x86: Requires a 486 or later.
1272 * @todo Rename ASMAtomicCmpWriteU32
1273 */
1274#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1275RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
1276#else
1277DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
1278{
1279# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1280# if RT_INLINE_ASM_GNU_STYLE
1281 uint8_t u8Ret;
1282 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
1283 "setz %1\n\t"
1284 : "=m" (*pu32)
1285 , "=qm" (u8Ret)
1286 , "=a" (u32Old)
1287 : "r" (u32New)
1288 , "2" (u32Old)
1289 , "m" (*pu32)
1290 : "cc");
1291 return (bool)u8Ret;
1292
1293# elif RT_INLINE_ASM_USES_INTRIN
1294 return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;
1295
1296# else
1297 uint32_t u32Ret;
1298 __asm
1299 {
1300# ifdef RT_ARCH_AMD64
1301 mov rdx, [pu32]
1302# else
1303 mov edx, [pu32]
1304# endif
1305 mov eax, [u32Old]
1306 mov ecx, [u32New]
1307# ifdef RT_ARCH_AMD64
1308 lock cmpxchg [rdx], ecx
1309# else
1310 lock cmpxchg [edx], ecx
1311# endif
1312 setz al
1313 movzx eax, al
1314 mov [u32Ret], eax
1315 }
1316 return !!u32Ret;
1317# endif
1318
1319# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1320 union { uint32_t u; bool f; } fXchg;
1321 uint32_t u32Spill;
1322 /* M1 bench: match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
1323 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
1324# if defined(RTASM_ARM64_USE_FEAT_LSE)
1325 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
1326# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1327 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
1328# else
1329 RTASM_ARM_DMB_SY
1330 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
1331# endif
1332 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1333 "cset %w[fXchg], eq\n\t"
1334 : [pMem] "+Q" (*pu32)
1335 , [uOldActual] "=&r" (u32Spill)
1336 , [fXchg] "=&r" (fXchg.u)
1337 : [uNew] "r" (u32New)
1338 , [uOldOrg] "r" (u32Old)
1339 , "[uOldActual]" (u32Old)
1340 : "cc");
1341# else
1342 uint32_t rcSpill;
1343 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
1344 RTASM_ARM_DMB_SY
1345# if defined(RT_ARCH_ARM64)
1346 "ldaxr %w[uOld], %[pMem]\n\t"
1347 "cmp %w[uOld], %w[uCmp]\n\t"
1348 "bne 1f\n\t" /* stop here if not equal */
1349 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
1350 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1351 "mov %w[fXchg], #1\n\t"
1352 "1:\n\t"
1353 "clrex\n\t"
1354# else
1355 "ldrex %[uOld], %[pMem]\n\t"
1356 "teq %[uOld], %[uCmp]\n\t"
1357 "strexeq %[rc], %[uNew], %[pMem]\n\t"
1358 "bne 1f\n\t" /* stop here if not equal */
1359 "cmp %[rc], #0\n\t"
1360 "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1361 "mov %[fXchg], #1\n\t"
1362 "1:\n\t"
1363 /** @todo clrexne on armv7? */
1364# endif
1365 : [pMem] "+Q" (*pu32)
1366 , [uOld] "=&r" (u32Spill)
1367 , [rc] "=&r" (rcSpill)
1368 , [fXchg] "=&r" (fXchg.u)
1369 : [uCmp] "r" (u32Old)
1370 , [uNew] "r" (u32New)
1371 , "[fXchg]" (0)
1372 RTASM_ARM_DMB_SY_COMMA_IN_REG
1373 : "cc");
1374# endif
1375 return fXchg.f;
1376
1377# else
1378# error "Port me"
1379# endif
1380}
1381#endif
1382
1383
1384/**
1385 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1386 *
1387 * @returns true if xchg was done.
1388 * @returns false if xchg wasn't done.
1389 *
1390 * @param pi32 Pointer to the value to update.
1391 * @param i32New The new value to assigned to *pi32.
1392 * @param i32Old The old value to *pi32 compare with.
1393 *
1394 * @remarks x86: Requires a 486 or later.
1395 * @todo Rename ASMAtomicCmpWriteS32
1396 */
1397DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1398{
1399 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1400}
1401
1402
1403/**
1404 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1405 *
1406 * @returns true if xchg was done.
1407 * @returns false if xchg wasn't done.
1408 *
1409 * @param pu64 Pointer to the 64-bit variable to update.
1410 * @param u64New The 64-bit value to assign to *pu64.
1411 * @param u64Old The value to compare with.
1412 *
1413 * @remarks x86: Requires a Pentium or later.
1414 * @todo Rename ASMAtomicCmpWriteU64
1415 */
1416#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1417 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1418RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1419#else
1420DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1421{
1422# if RT_INLINE_ASM_USES_INTRIN
1423 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1424
1425# elif defined(RT_ARCH_AMD64)
1426# if RT_INLINE_ASM_GNU_STYLE
1427 uint8_t u8Ret;
1428 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1429 "setz %1\n\t"
1430 : "=m" (*pu64)
1431 , "=qm" (u8Ret)
1432 , "=a" (u64Old)
1433 : "r" (u64New)
1434 , "2" (u64Old)
1435 , "m" (*pu64)
1436 : "cc");
1437 return (bool)u8Ret;
1438# else
1439 bool fRet;
1440 __asm
1441 {
1442 mov rdx, [pu32]
1443 mov rax, [u64Old]
1444 mov rcx, [u64New]
1445 lock cmpxchg [rdx], rcx
1446 setz al
1447 mov [fRet], al
1448 }
1449 return fRet;
1450# endif
1451
1452# elif defined(RT_ARCH_X86)
1453 uint32_t u32Ret;
1454# if RT_INLINE_ASM_GNU_STYLE
1455# if defined(PIC) || defined(__PIC__)
1456 uint32_t u32EBX = (uint32_t)u64New;
1457 uint32_t u32Spill;
1458 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1459 "lock; cmpxchg8b (%6)\n\t"
1460 "setz %%al\n\t"
1461 "movl %4, %%ebx\n\t"
1462 "movzbl %%al, %%eax\n\t"
1463 : "=a" (u32Ret)
1464 , "=d" (u32Spill)
1465# if RT_GNUC_PREREQ(4, 3)
1466 , "+m" (*pu64)
1467# else
1468 , "=m" (*pu64)
1469# endif
1470 : "A" (u64Old)
1471 , "m" ( u32EBX )
1472 , "c" ( (uint32_t)(u64New >> 32) )
1473 , "S" (pu64)
1474 : "cc");
1475# else /* !PIC */
1476 uint32_t u32Spill;
1477 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1478 "setz %%al\n\t"
1479 "movzbl %%al, %%eax\n\t"
1480 : "=a" (u32Ret)
1481 , "=d" (u32Spill)
1482 , "+m" (*pu64)
1483 : "A" (u64Old)
1484 , "b" ( (uint32_t)u64New )
1485 , "c" ( (uint32_t)(u64New >> 32) )
1486 : "cc");
1487# endif
1488 return (bool)u32Ret;
1489# else
1490 __asm
1491 {
1492 mov ebx, dword ptr [u64New]
1493 mov ecx, dword ptr [u64New + 4]
1494 mov edi, [pu64]
1495 mov eax, dword ptr [u64Old]
1496 mov edx, dword ptr [u64Old + 4]
1497 lock cmpxchg8b [edi]
1498 setz al
1499 movzx eax, al
1500 mov dword ptr [u32Ret], eax
1501 }
1502 return !!u32Ret;
1503# endif
1504
1505# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1506 union { uint32_t u; bool f; } fXchg;
1507 uint64_t u64Spill;
1508 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1509 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1510# if defined(RTASM_ARM64_USE_FEAT_LSE)
1511 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1512# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1513 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1514# else
1515 RTASM_ARM_DMB_SY
1516 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1517# endif
1518 "cmp %[uOldActual], %[uOldOrg]\n\t"
1519 "cset %w[fXchg], eq\n\t"
1520 : [pMem] "+Q" (*pu64)
1521 , [uOldActual] "=&r" (u64Spill)
1522 , [fXchg] "=&r" (fXchg.u)
1523 : [uNew] "r" (u64New)
1524 , [uOldOrg] "r" (u64Old)
1525 , "[uOldActual]" (u64Old)
1526 : "cc");
1527# else
1528 uint32_t rcSpill;
1529 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1530 RTASM_ARM_DMB_SY
1531# if defined(RT_ARCH_ARM64)
1532 "ldaxr %[uOld], %[pMem]\n\t"
1533 "cmp %[uOld], %[uCmp]\n\t"
1534 "bne 1f\n\t" /* stop here if not equal */
1535 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1536 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1537 "mov %w[fXchg], #1\n\t"
1538 "1:\n\t"
1539 "clrex\n\t"
1540# else
1541 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1542 "teq %[uOld], %[uCmp]\n\t"
1543 "teqeq %H[uOld], %H[uCmp]\n\t"
1544 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1545 "bne 1f\n\t" /* stop here if not equal */
1546 "cmp %[rc], #0\n\t"
1547 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1548 "mov %[fXchg], #1\n\t"
1549 "1:\n\t"
1550 /** @todo clrexne on armv7? */
1551# endif
1552 : [pMem] "+Q" (*pu64)
1553 , [uOld] "=&r" (u64Spill)
1554 , [rc] "=&r" (rcSpill)
1555 , [fXchg] "=&r" (fXchg.u)
1556 : [uCmp] "r" (u64Old)
1557 , [uNew] "r" (u64New)
1558 , "[fXchg]" (0)
1559 RTASM_ARM_DMB_SY_COMMA_IN_REG
1560 : "cc");
1561# endif
1562 return fXchg.f;
1563
1564# else
1565# error "Port me"
1566# endif
1567}
1568#endif
1569
1570
1571/**
1572 * Atomically Compare and exchange a signed 64-bit value, ordered.
1573 *
1574 * @returns true if xchg was done.
1575 * @returns false if xchg wasn't done.
1576 *
1577 * @param pi64 Pointer to the 64-bit variable to update.
1578 * @param i64 The 64-bit value to assign to *pu64.
1579 * @param i64Old The value to compare with.
1580 *
1581 * @remarks x86: Requires a Pentium or later.
1582 * @todo Rename ASMAtomicCmpWriteS64
1583 */
1584DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1585{
1586 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1587}
1588
1589#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1590
1591/** @def RTASM_HAVE_CMP_WRITE_U128
1592 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
1593 * and ASMAtomicCmpWriteExU128() available. */
1594# define RTASM_HAVE_CMP_WRITE_U128 1
1595
1596
1597/**
1598 * Atomically compare and write an unsigned 128-bit value, ordered.
1599 *
1600 * @returns true if write was done.
1601 * @returns false if write wasn't done.
1602 *
1603 * @param pu128 Pointer to the 128-bit variable to update.
1604 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
1605 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
1606 * @param u64OldHi The high 64-bit of the value to compare with.
1607 * @param u64OldLo The low 64-bit of the value to compare with.
1608 *
1609 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1610 */
1611# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
1612DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1613 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
1614# else
1615DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1616 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
1617{
1618# if RT_INLINE_ASM_USES_INTRIN
1619 __int64 ai64Cmp[2];
1620 ai64Cmp[0] = u64OldLo;
1621 ai64Cmp[1] = u64OldHi;
1622 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;
1623
1624# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1625 return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);
1626
1627# elif defined(RT_ARCH_AMD64)
1628# if RT_INLINE_ASM_GNU_STYLE
1629 uint64_t u64Ret;
1630 uint64_t u64Spill;
1631 __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
1632 "setz %%al\n\t"
1633 "movzbl %%al, %%eax\n\t"
1634 : "=a" (u64Ret)
1635 , "=d" (u64Spill)
1636 , "+m" (*pu128)
1637 : "a" (u64OldLo)
1638 , "d" (u64OldHi)
1639 , "b" (u64NewLo)
1640 , "c" (u64NewHi)
1641 : "cc");
1642
1643 return (bool)u64Ret;
1644# else
1645# error "Port me"
1646# endif
1647# else
1648# error "Port me"
1649# endif
1650}
1651# endif
1652
1653
1654/**
1655 * Atomically compare and write an unsigned 128-bit value, ordered.
1656 *
1657 * @returns true if write was done.
1658 * @returns false if write wasn't done.
1659 *
1660 * @param pu128 Pointer to the 128-bit variable to update.
1661 * @param u128New The 128-bit value to assign to *pu128.
1662 * @param u128Old The value to compare with.
1663 *
1664 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1665 */
1666DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1667{
1668# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1669# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1670 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1671# else
1672 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1673 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1674# endif
1675# else
1676 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1677# endif
1678}
1679
1680
1681/**
1682 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1683 */
1684DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1685 const RTUINT128U u128Old) RT_NOTHROW_DEF
1686{
1687# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1688 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1689# else
1690 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1691# endif
1692}
1693
1694#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1695
1696/**
1697 * Atomically Compare and Exchange a pointer value, ordered.
1698 *
1699 * @returns true if xchg was done.
1700 * @returns false if xchg wasn't done.
1701 *
1702 * @param ppv Pointer to the value to update.
1703 * @param pvNew The new value to assigned to *ppv.
1704 * @param pvOld The old value to *ppv compare with.
1705 *
1706 * @remarks x86: Requires a 486 or later.
1707 * @todo Rename ASMAtomicCmpWritePtrVoid
1708 */
1709DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1710{
1711#if ARCH_BITS == 32 || ARCH_BITS == 16
1712 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1713#elif ARCH_BITS == 64
1714 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1715#else
1716# error "ARCH_BITS is bogus"
1717#endif
1718}
1719
1720
/**
 * Atomically compares and exchanges a pointer value, ordered.
 *
 * Typed front-end for ASMAtomicCmpXchgPtrVoid.  The GNU C variant assigns
 * the arguments to temporaries typed after *ppv before casting to void
 * pointers; the generic variant only casts.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv     Pointer to the value to update.
 * @param   pvNew   The new value to assign to *ppv.
 * @param   pvOld   The old value to compare *ppv with.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWritePtr
 */
#ifdef __GNUC__
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked   = (ppv); \
        __typeof__(*(ppv)) const            pvNewTypeChecked = (pvNew); \
        __typeof__(*(ppv)) const            pvOldTypeChecked = (pvOld); \
        bool fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
                                                 (void *)pvNewTypeChecked, \
                                                 (void *)pvOldTypeChecked); \
        fMacroRet; \
     })
#else
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
#endif
1750
1751
1752/** @def ASMAtomicCmpXchgHandle
1753 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
1754 *
1755 * @param ph Pointer to the value to update.
1756 * @param hNew The new value to assigned to *pu.
1757 * @param hOld The old value to *pu compare with.
1758 * @param fRc Where to store the result.
1759 *
1760 * @remarks This doesn't currently work for all handles (like RTFILE).
1761 * @remarks x86: Requires a 486 or later.
1762 * @todo Rename ASMAtomicCmpWriteHandle
1763 */
1764#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1765# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1766 do { \
1767 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1768 (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
1769 } while (0)
1770#elif HC_ARCH_BITS == 64
1771# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1772 do { \
1773 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1774 (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
1775 } while (0)
1776#else
1777# error HC_ARCH_BITS
1778#endif
1779
1780
/** @def ASMAtomicCmpXchgSize
 * Atomically Compare and Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Only 4 and 8 byte objects are handled; any other size triggers an
 * assertion and sets @a fRc to false.
 *
 * @param   pu      Pointer to the value to update.
 * @param   uNew    The new value to assign to *pu.
 * @param   uOld    The old value to compare *pu with.
 * @param   fRc     Where to store the result.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteSize
 */
#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
                break; \
            /* sizeof yields size_t; cast it so the argument matches the %u conversion. */ \
            default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %u is not supported\n", (unsigned)sizeof(*(pu)))); \
                (fRc) = false; \
                break; \
        } \
    } while (0)
1805
1806
1807/**
1808 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1809 * back old value, ordered.
1810 *
1811 * @returns true if xchg was done.
1812 * @returns false if xchg wasn't done.
1813 *
1814 * @param pu8 Pointer to the value to update.
1815 * @param u8New The new value to assigned to *pu32.
1816 * @param u8Old The old value to *pu8 compare with.
1817 * @param pu8Old Pointer store the old value at.
1818 *
1819 * @remarks x86: Requires a 486 or later.
1820 */
1821#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1822RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1823#else
1824DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1825{
    /* Stores u8New in *pu8 if *pu8 equals u8Old.  The value found in *pu8 is
       passed back through pu8Old on every path and the function returns
       true when the store took place.  One implementation per architecture
       and compiler flavour follows. */
1826# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1827# if RT_INLINE_ASM_GNU_STYLE
    /* The "a" register carries u8Old in and *pu8Old out; setz converts the
       zero flag left by cmpxchgb into the return value. */
1828 uint8_t u8Ret;
1829 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1830 "setz %1\n\t"
1831 : "=m" (*pu8)
1832 , "=qm" (u8Ret)
1833 , "=a" (*pu8Old)
1834# if defined(RT_ARCH_X86)
1835 : "q" (u8New)
1836# else
1837 : "r" (u8New)
1838# endif
1839 , "a" (u8Old)
1840 , "m" (*pu8)
1841 : "cc");
1842 return (bool)u8Ret;
1843
1844# elif RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic returns the value it found; equality with u8Old means the exchange happened. */
1845 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1846
1847# else
    /* Plain inline assembly.  Byte sized registers (al/cl) are used for the
       comparand, the new value and all stores, so that exactly one byte is
       compared and replaced at *pu8 and exactly one byte is written to
       *pu8Old and to the one byte local u8Ret.  (32-bit registers here
       would touch three neighbouring bytes of each of those objects.) */
1848 uint8_t u8Ret;
1849 __asm
1850 {
1851# ifdef RT_ARCH_AMD64
1852 mov rdx, [pu8]
1853# else
1854 mov edx, [pu8]
1855# endif
1856 mov al, [u8Old]
1857 mov cl, [u8New]
1858# ifdef RT_ARCH_AMD64
1859 lock cmpxchg [rdx], cl
1860 mov rdx, [pu8Old]
1861 mov [rdx], al
1862# else
1863 lock cmpxchg [edx], cl
1864 mov edx, [pu8Old]
1865 mov [edx], al
1866# endif
1867 setz al
1868 movzx eax, al
1869 mov [u8Ret], al
1870 }
1871 return !!u8Ret;
1872# endif
1873
1874# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1875 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
1876 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
1877# if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Single compare-and-swap instruction: uOldActual is preloaded with
       u8Old (matching input constraint) and holds the value found in memory
       afterwards; cmp + cset turn that into the boolean result. */
1878 union { uint32_t u; bool f; } fXchg;
1879 uint32_t u32Actual;
1880 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
1881# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1882 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1883# else
1884 RTASM_ARM_DMB_SY
1885 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1886# endif
1887 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1888 "cset %w[fXchg], eq\n\t"
1889 : [pMem] "+Q" (*pu8)
1890 , [uOldActual] "=&r" (u32Actual)
1891 , [fXchg] "=&r" (fXchg.u)
1892 : [uNew] "r" ((uint32_t)u8New)
1893 , [uOldOrg] "r" ((uint32_t)u8Old)
1894 , "[uOldActual]" ((uint32_t)u8Old)
1895 : "cc");
1896 *pu8Old = (uint8_t)u32Actual;
1897# else
    /* Exclusive load / exclusive store retry loop.  fXchg starts out as 0
       (matching input constraint) and is only set to 1 once the exclusive
       store has succeeded; a failed exclusive store restarts the loop. */
1898 union { uint8_t u; bool f; } fXchg;
1899 uint8_t u8ActualOld;
1900 uint8_t rcSpill;
1901 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
1902 RTASM_ARM_DMB_SY
1903# if defined(RT_ARCH_ARM64)
1904 "ldaxrb %w[uOld], %[pMem]\n\t"
1905 "cmp %w[uOld], %w[uCmp]\n\t"
1906 "bne 1f\n\t" /* stop here if not equal */
1907 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1908 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1909 "mov %w[fXchg], #1\n\t"
1910 "1:\n\t"
1911 "clrex\n\t"
1912# else
1913 "ldrexb %[uOld], %[pMem]\n\t"
1914 "teq %[uOld], %[uCmp]\n\t"
1915 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1916 "bne 1f\n\t" /* stop here if not equal */
1917 "cmp %[rc], #0\n\t"
1918 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1919 "mov %[fXchg], #1\n\t"
1920 "1:\n\t"
1921 /** @todo clrexne on armv7? */
1922# endif
1923 : [pMem] "+Q" (*pu8)
1924 , [uOld] "=&r" (u8ActualOld)
1925 , [rc] "=&r" (rcSpill)
1926 , [fXchg] "=&r" (fXchg.u)
1927 : [uCmp] "r" (u8Old)
1928 , [uNew] "r" (u8New)
1929 , "[fXchg]" (0)
1930 RTASM_ARM_DMB_SY_COMMA_IN_REG
1931 : "cc");
1932 *pu8Old = u8ActualOld;
1933# endif
1934 return fXchg.f;
1935
1936# else
1937# error "Port me"
1938# endif
1939}
1940#endif
1941
1942
1943/**
1944 * Atomically Compare and Exchange a signed 8-bit value, additionally
1945 * passes back old value, ordered.
1946 *
1947 * @returns true if xchg was done.
1948 * @returns false if xchg wasn't done.
1949 *
1950 * @param pi8 Pointer to the value to update.
1951 * @param i8New The new value to assigned to *pi8.
1952 * @param i8Old The old value to *pi8 compare with.
1953 * @param pi8Old Pointer store the old value at.
1954 *
1955 * @remarks x86: Requires a 486 or later.
1956 */
1957DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
1958{
    /* Signed flavour: reinterpret all operands as unsigned 8-bit and hand
       them to the unsigned worker, which does the actual work. */
    volatile uint8_t RT_FAR * const pu8Target = (volatile uint8_t RT_FAR *)pi8;
    uint8_t RT_FAR * const pu8Seen = (uint8_t RT_FAR *)pi8Old;
    uint8_t const u8Desired = (uint8_t)i8New;
    uint8_t const u8Expected = (uint8_t)i8Old;
1959 return ASMAtomicCmpXchgExU8(pu8Target, u8Desired, u8Expected, pu8Seen);
1960}
1961
1962
1963/**
1964 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
1965 * back old value, ordered.
1966 *
1967 * @returns true if xchg was done.
1968 * @returns false if xchg wasn't done.
1969 *
1970 * @param pu16 Pointer to the value to update.
1971 * @param u16New The new value to assigned to *pu16.
1972 * @param u16Old The old value to compare *pu16 with.
1973 * @param pu16Old Pointer store the old value at.
1974 *
1975 * @remarks x86: Requires a 486 or later.
1976 */
1977#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1978RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
1979#else
1980DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
1981{
    /* Stores u16New in *pu16 if *pu16 equals u16Old.  The value found in
       *pu16 is passed back through pu16Old on every path and the function
       returns true when the store took place. */
1982# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1983# if RT_INLINE_ASM_GNU_STYLE
    /* The "a" register carries u16Old in and *pu16Old out; setz converts
       the zero flag left by cmpxchgw into the return value. */
1984 uint8_t u8Ret;
1985 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
1986 "setz %1\n\t"
1987 : "=m" (*pu16)
1988 , "=qm" (u8Ret)
1989 , "=a" (*pu16Old)
1990 : "r" (u16New)
1991 , "a" (u16Old)
1992 , "m" (*pu16)
1993 : "cc");
1994 return (bool)u8Ret;
1995
1996# elif RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic returns the value it found; equality with u16Old means the exchange happened. */
1997 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
1998
1999# else
    /* Plain inline assembly.  Word sized registers (ax/cx) are used for the
       comparand, the new value and all stores, so that exactly two bytes
       are compared and replaced at *pu16 and exactly two bytes are written
       to *pu16Old and to the 16-bit local u16Ret.  (32-bit registers here
       would touch two neighbouring bytes of each of those objects.) */
2000 uint16_t u16Ret;
2001 __asm
2002 {
2003# ifdef RT_ARCH_AMD64
2004 mov rdx, [pu16]
2005# else
2006 mov edx, [pu16]
2007# endif
2008 mov ax, [u16Old]
2009 mov cx, [u16New]
2010# ifdef RT_ARCH_AMD64
2011 lock cmpxchg [rdx], cx
2012 mov rdx, [pu16Old]
2013 mov [rdx], ax
2014# else
2015 lock cmpxchg [edx], cx
2016 mov edx, [pu16Old]
2017 mov [edx], ax
2018# endif
2019 setz al
2020 movzx eax, al
2021 mov [u16Ret], ax
2022 }
2023 return !!u16Ret;
2024# endif
2025
2026# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2027 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2028 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2029# if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Single compare-and-swap instruction: uOldActual is preloaded with
       u16Old (matching input constraint) and holds the value found in
       memory afterwards; cmp + cset turn that into the boolean result. */
2030 union { uint32_t u; bool f; } fXchg;
2031 uint32_t u32Actual;
2032 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2033# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2034 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2035# else
2036 RTASM_ARM_DMB_SY
2037 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2038# endif
2039 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2040 "cset %w[fXchg], eq\n\t"
2041 : [pMem] "+Q" (*pu16)
2042 , [uOldActual] "=&r" (u32Actual)
2043 , [fXchg] "=&r" (fXchg.u)
2044 : [uNew] "r" ((uint32_t)u16New)
2045 , [uOldOrg] "r" ((uint32_t)u16Old)
2046 , "[uOldActual]" ((uint32_t)u16Old)
2047 : "cc");
2048 *pu16Old = (uint16_t)u32Actual;
2049# else
    /* Exclusive load / exclusive store retry loop.  fXchg starts out as 0
       (matching input constraint) and is only set to 1 once the exclusive
       store has succeeded; a failed exclusive store restarts the loop. */
2050 union { uint16_t u; bool f; } fXchg;
2051 uint16_t u16ActualOld;
2052 uint16_t rcSpill;
2053 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2054 RTASM_ARM_DMB_SY
2055# if defined(RT_ARCH_ARM64)
2056 "ldaxrh %w[uOld], %[pMem]\n\t"
2057 "cmp %w[uOld], %w[uCmp]\n\t"
2058 "bne 1f\n\t" /* stop here if not equal */
2059 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2060 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2061 "mov %w[fXchg], #1\n\t"
2062 "1:\n\t"
2063 "clrex\n\t"
2064# else
2065 "ldrexh %[uOld], %[pMem]\n\t"
2066 "teq %[uOld], %[uCmp]\n\t"
2067 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2068 "bne 1f\n\t" /* stop here if not equal */
2069 "cmp %[rc], #0\n\t"
2070 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2071 "mov %[fXchg], #1\n\t"
2072 "1:\n\t"
2073 /** @todo clrexne on armv7? */
2074# endif
2075 : [pMem] "+Q" (*pu16)
2076 , [uOld] "=&r" (u16ActualOld)
2077 , [rc] "=&r" (rcSpill)
2078 , [fXchg] "=&r" (fXchg.u)
2079 : [uCmp] "r" (u16Old)
2080 , [uNew] "r" (u16New)
2081 , "[fXchg]" (0)
2082 RTASM_ARM_DMB_SY_COMMA_IN_REG
2083 : "cc");
2084 *pu16Old = u16ActualOld;
2085# endif
2086 return fXchg.f;
2087
2088# else
2089# error "Port me"
2090# endif
2091}
2092#endif
2093
2094
2095/**
2096 * Atomically Compare and Exchange a signed 16-bit value, additionally
2097 * passes back old value, ordered.
2098 *
2099 * @returns true if xchg was done.
2100 * @returns false if xchg wasn't done.
2101 *
2102 * @param pi16 Pointer to the value to update.
2103 * @param i16New The new value to assigned to *pi16.
2104 * @param i16Old The old value to *pi16 compare with.
2105 * @param pi16Old Pointer store the old value at.
2106 *
2107 * @remarks x86: Requires a 486 or later.
2108 */
2109DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2110{
    /* Signed flavour: reinterpret all operands as unsigned 16-bit and hand
       them to the unsigned worker, which does the actual work. */
    volatile uint16_t RT_FAR * const pu16Target = (volatile uint16_t RT_FAR *)pi16;
    uint16_t RT_FAR * const pu16Seen = (uint16_t RT_FAR *)pi16Old;
    uint16_t const u16Desired = (uint16_t)i16New;
    uint16_t const u16Expected = (uint16_t)i16Old;
2111 return ASMAtomicCmpXchgExU16(pu16Target, u16Desired, u16Expected, pu16Seen);
2112}
2113
2114
2115/**
2116 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2117 * passes back old value, ordered.
2118 *
2119 * @returns true if xchg was done.
2120 * @returns false if xchg wasn't done.
2121 *
2122 * @param pu32 Pointer to the value to update.
2123 * @param u32New The new value to assigned to *pu32.
2124 * @param u32Old The old value to *pu32 compare with.
2125 * @param pu32Old Pointer store the old value at.
2126 *
2127 * @remarks x86: Requires a 486 or later.
2128 */
2129#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2130RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2131#else
2132DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2133{
    /* Stores u32New in *pu32 if *pu32 equals u32Old.  The value found in
       *pu32 is passed back through pu32Old on every path and the function
       returns true when the store took place. */
2134# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2135# if RT_INLINE_ASM_GNU_STYLE
    /* The "a" register carries u32Old in and *pu32Old out; setz converts
       the zero flag left by cmpxchgl into the return value. */
2136 uint8_t u8Ret;
2137 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2138 "setz %1\n\t"
2139 : "=m" (*pu32)
2140 , "=qm" (u8Ret)
2141 , "=a" (*pu32Old)
2142 : "r" (u32New)
2143 , "a" (u32Old)
2144 , "m" (*pu32)
2145 : "cc");
2146 return (bool)u8Ret;
2147
2148# elif RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic returns the value it found; equality with u32Old means the exchange happened. */
2149 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2150
2151# else
    /* Plain inline assembly: eax = comparand, ecx = new value, edx/rdx =
       target address.  eax is written to *pu32Old before setz reads the
       zero flag; the mov instructions in between do not alter flags. */
2152 uint32_t u32Ret;
2153 __asm
2154 {
2155# ifdef RT_ARCH_AMD64
2156 mov rdx, [pu32]
2157# else
2158 mov edx, [pu32]
2159# endif
2160 mov eax, [u32Old]
2161 mov ecx, [u32New]
2162# ifdef RT_ARCH_AMD64
2163 lock cmpxchg [rdx], ecx
2164 mov rdx, [pu32Old]
2165 mov [rdx], eax
2166# else
2167 lock cmpxchg [edx], ecx
2168 mov edx, [pu32Old]
2169 mov [edx], eax
2170# endif
2171 setz al
2172 movzx eax, al
2173 mov [u32Ret], eax
2174 }
2175 return !!u32Ret;
2176# endif
2177
2178# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* The asm writes the 32-bit member; the bool member is what gets returned. */
2179 union { uint32_t u; bool f; } fXchg;
2180 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2181 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2182# if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Single compare-and-swap instruction operating directly on *pu32Old:
       it is preloaded with u32Old (matching input constraint) and holds the
       value found in memory afterwards. */
2183 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2184# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2185 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2186# else
2187 RTASM_ARM_DMB_SY
2188 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2189# endif
2190 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2191 "cset %w[fXchg], eq\n\t"
2192 : [pMem] "+Q" (*pu32)
2193 , [uOldActual] "=&r" (*pu32Old)
2194 , [fXchg] "=&r" (fXchg.u)
2195 : [uNew] "r" (u32New)
2196 , [uOldOrg] "r" (u32Old)
2197 , "[uOldActual]" (u32Old)
2198 : "cc");
2199# else
    /* Exclusive load / exclusive store retry loop.  fXchg starts out as 0
       (matching input constraint) and is only set to 1 once the exclusive
       store has succeeded; a failed exclusive store restarts the loop. */
2200 uint32_t u32ActualOld;
2201 uint32_t rcSpill;
2202 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2203 RTASM_ARM_DMB_SY
2204# if defined(RT_ARCH_ARM64)
2205 "ldaxr %w[uOld], %[pMem]\n\t"
2206 "cmp %w[uOld], %w[uCmp]\n\t"
2207 "bne 1f\n\t" /* stop here if not equal */
2208 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2209 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2210 "mov %w[fXchg], #1\n\t"
2211 "1:\n\t"
2212 "clrex\n\t"
2213# else
2214 "ldrex %[uOld], %[pMem]\n\t"
2215 "teq %[uOld], %[uCmp]\n\t"
2216 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2217 "bne 1f\n\t" /* stop here if not equal */
2218 "cmp %[rc], #0\n\t"
2219 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2220 "mov %[fXchg], #1\n\t"
2221 "1:\n\t"
2222 /** @todo clrexne on armv7? */
2223# endif
2224 : [pMem] "+Q" (*pu32)
2225 , [uOld] "=&r" (u32ActualOld)
2226 , [rc] "=&r" (rcSpill)
2227 , [fXchg] "=&r" (fXchg.u)
2228 : [uCmp] "r" (u32Old)
2229 , [uNew] "r" (u32New)
2230 , "[fXchg]" (0)
2231 RTASM_ARM_DMB_SY_COMMA_IN_REG
2232 : "cc");
2233 *pu32Old = u32ActualOld;
2234# endif
2235 return fXchg.f;
2236
2237# else
2238# error "Port me"
2239# endif
2240}
2241#endif
2242
2243
2244/**
2245 * Atomically Compare and Exchange a signed 32-bit value, additionally
2246 * passes back old value, ordered.
2247 *
2248 * @returns true if xchg was done.
2249 * @returns false if xchg wasn't done.
2250 *
2251 * @param pi32 Pointer to the value to update.
2252 * @param i32New The new value to assigned to *pi32.
2253 * @param i32Old The old value to *pi32 compare with.
2254 * @param pi32Old Pointer store the old value at.
2255 *
2256 * @remarks x86: Requires a 486 or later.
2257 */
2258DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2259{
    /* Signed flavour: reinterpret all operands as unsigned 32-bit and hand
       them to the unsigned worker, which does the actual work. */
    volatile uint32_t RT_FAR * const pu32Target = (volatile uint32_t RT_FAR *)pi32;
    uint32_t RT_FAR * const pu32Seen = (uint32_t RT_FAR *)pi32Old;
    uint32_t const u32Desired = (uint32_t)i32New;
    uint32_t const u32Expected = (uint32_t)i32Old;
2260 return ASMAtomicCmpXchgExU32(pu32Target, u32Desired, u32Expected, pu32Seen);
2261}
2262
2263
2264/**
2265 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2266 * passing back old value, ordered.
2267 *
2268 * @returns true if xchg was done.
2269 * @returns false if xchg wasn't done.
2270 *
2271 * @param pu64 Pointer to the 64-bit variable to update.
2272 * @param u64New The 64-bit value to assign to *pu64.
2273 * @param u64Old The value to compare with.
2274 * @param pu64Old Pointer store the old value at.
2275 *
2276 * @remarks x86: Requires a Pentium or later.
2277 */
2278#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2279 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2280RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2281#else
2282DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2283{
    /* Stores u64New in *pu64 if *pu64 equals u64Old.  The value found in
       *pu64 is passed back through pu64Old on every path and the function
       returns true when the store took place. */
2284# if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic returns the value it found; equality with u64Old means the exchange happened. */
2285 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2286
2287# elif defined(RT_ARCH_AMD64)
2288# if RT_INLINE_ASM_GNU_STYLE
    /* The "a" register carries u64Old in and *pu64Old out; setz converts
       the zero flag left by cmpxchgq into the return value. */
2289 uint8_t u8Ret;
2290 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2291 "setz %1\n\t"
2292 : "=m" (*pu64)
2293 , "=qm" (u8Ret)
2294 , "=a" (*pu64Old)
2295 : "r" (u64New)
2296 , "a" (u64Old)
2297 , "m" (*pu64)
2298 : "cc");
2299 return (bool)u8Ret;
2300# else
    /* Plain inline assembly: rax = comparand, rcx = new value, rdx = target
       address taken from the pu64 parameter. */
2301 bool fRet;
2302 __asm
2303 {
2304 mov rdx, [pu64]
2305 mov rax, [u64Old]
2306 mov rcx, [u64New]
2307 lock cmpxchg [rdx], rcx
2308 mov rdx, [pu64Old]
2309 mov [rdx], rax
2310 setz al
2311 mov [fRet], al
2312 }
2313 return fRet;
2314# endif
2315
2316# elif defined(RT_ARCH_X86)
2317# if RT_INLINE_ASM_GNU_STYLE
    /* cmpxchg8b: edx:eax ("A") holds the comparand going in and the value
       found coming out; ecx:ebx holds the new value.  Success is derived
       afterwards by comparing the returned value with u64Old. */
2318 uint64_t u64Ret;
2319# if defined(PIC) || defined(__PIC__)
2320 /* Note #1: This code uses a memory clobber description, because the clean
2321 solution with an output value for *pu64 makes gcc run out of
2322 registers. This will cause suboptimal code, and anyone with a
2323 better solution is welcome to improve this.
2324
2325 Note #2: We must prevent gcc from encoding the memory access, as it
2326 may go via the GOT if we're working on a global variable (like
2327 in the testcase). Thus we request a register (%3) and
2328 dereference it ourselves. */
2329 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2330 "lock; cmpxchg8b (%3)\n\t"
2331 "xchgl %%ebx, %1\n\t"
2332 : "=A" (u64Ret)
2333 : "DS" ((uint32_t)u64New)
2334 , "c" ((uint32_t)(u64New >> 32))
2335 , "r" (pu64) /* Do not use "m" here*/
2336 , "0" (u64Old)
2337 : "memory"
2338 , "cc" );
2339# else /* !PIC */
2340 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2341 : "=A" (u64Ret)
2342 , "=m" (*pu64)
2343 : "b" ((uint32_t)u64New)
2344 , "c" ((uint32_t)(u64New >> 32))
2345 , "m" (*pu64)
2346 , "0" (u64Old)
2347 : "cc");
2348# endif
2349 *pu64Old = u64Ret;
2350 return u64Ret == u64Old;
2351# else
    /* Plain inline assembly: ecx:ebx = new value, edx:eax = comparand,
       edi = target address.  The low dword is stored and setz executed
       before the add, which would otherwise change the zero flag. */
2352 uint32_t u32Ret;
2353 __asm
2354 {
2355 mov ebx, dword ptr [u64New]
2356 mov ecx, dword ptr [u64New + 4]
2357 mov edi, [pu64]
2358 mov eax, dword ptr [u64Old]
2359 mov edx, dword ptr [u64Old + 4]
2360 lock cmpxchg8b [edi]
2361 mov ebx, [pu64Old]
2362 mov [ebx], eax
2363 setz al
2364 movzx eax, al
2365 add ebx, 4
2366 mov [ebx], edx
2367 mov dword ptr [u32Ret], eax
2368 }
2369 return !!u32Ret;
2370# endif
2371
2372# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* The asm writes the 32-bit member; the bool member is what gets returned. */
2373 union { uint32_t u; bool f; } fXchg;
2374 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2375 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2376# if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Single compare-and-swap instruction operating directly on *pu64Old:
       it is preloaded with u64Old (matching input constraint) and holds the
       value found in memory afterwards. */
2377 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU64_%=:\n\t"
2378# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2379 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2380# else
2381 RTASM_ARM_DMB_SY
2382 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2383# endif
2384 "cmp %[uOldActual], %[uOldOrg]\n\t"
2385 "cset %w[fXchg], eq\n\t"
2386 : [pMem] "+Q" (*pu64)
2387 , [uOldActual] "=&r" (*pu64Old)
2388 , [fXchg] "=&r" (fXchg.u)
2389 : [uNew] "r" (u64New)
2390 , [uOldOrg] "r" (u64Old)
2391 , "[uOldActual]" (u64Old)
2392 : "cc");
2393# else
    /* Exclusive load / exclusive store retry loop.  fXchg starts out as 0
       (matching input constraint) and is only set to 1 once the exclusive
       store has succeeded; a failed exclusive store restarts the loop. */
2394 uint64_t u64ActualOld;
2395 uint32_t rcSpill;
2396 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2397 RTASM_ARM_DMB_SY
2398# if defined(RT_ARCH_ARM64)
2399 "ldaxr %[uOld], %[pMem]\n\t"
2400 "cmp %[uOld], %[uCmp]\n\t"
2401 "bne 1f\n\t" /* stop here if not equal */
2402 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2403 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2404 "mov %w[fXchg], #1\n\t"
2405 "1:\n\t"
2406 "clrex\n\t"
2407# else
2408 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2409 "teq %[uOld], %[uCmp]\n\t"
2410 "teqeq %H[uOld], %H[uCmp]\n\t"
2411 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2412 "bne 1f\n\t" /* stop here if not equal */
2413 "cmp %[rc], #0\n\t"
2414 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2415 "mov %[fXchg], #1\n\t"
2416 "1:\n\t"
2417 /** @todo clrexne on armv7? */
2418# endif
2419 : [pMem] "+Q" (*pu64)
2420 , [uOld] "=&r" (u64ActualOld)
2421 , [rc] "=&r" (rcSpill)
2422 , [fXchg] "=&r" (fXchg.u)
2423 : [uCmp] "r" (u64Old)
2424 , [uNew] "r" (u64New)
2425 , "[fXchg]" (0)
2426 RTASM_ARM_DMB_SY_COMMA_IN_REG
2427 : "cc");
2428 *pu64Old = u64ActualOld;
2429# endif
2430 return fXchg.f;
2431
2432# else
2433# error "Port me"
2434# endif
2435}
2436#endif
2437
2438
2439/**
2440 * Atomically Compare and exchange a signed 64-bit value, additionally
2441 * passing back old value, ordered.
2442 *
2443 * @returns true if xchg was done.
2444 * @returns false if xchg wasn't done.
2445 *
2446 * @param pi64 Pointer to the 64-bit variable to update.
2447 * @param i64 The 64-bit value to assign to *pi64.
2448 * @param i64Old The value to compare with.
2449 * @param pi64Old Pointer store the old value at.
2450 *
2451 * @remarks x86: Requires a Pentium or later.
2452 */
2453DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2454{
    /* Signed flavour: reinterpret all operands as unsigned 64-bit and hand
       them to the unsigned worker, which does the actual work. */
    volatile uint64_t RT_FAR * const pu64Target = (volatile uint64_t RT_FAR *)pi64;
    uint64_t RT_FAR * const pu64Seen = (uint64_t RT_FAR *)pi64Old;
    uint64_t const u64Desired = (uint64_t)i64;
    uint64_t const u64Expected = (uint64_t)i64Old;
2455 return ASMAtomicCmpXchgExU64(pu64Target, u64Desired, u64Expected, pu64Seen);
2456}
2457
2458#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2459
2460/** @def RTASM_HAVE_CMP_XCHG_U128
2461 * Indicates that we've got ASMAtomicCmpXchgU128(), ASMAtomicCmpXchgU128v2()
2462 * and ASMAtomicCmpXchgU128U() available. */
2463# define RTASM_HAVE_CMP_XCHG_U128 1
2464
2465
2466/**
2467 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2468 *
2469 * @returns true if exchange was done.
2470 * @returns false if exchange wasn't done.
2471 *
2472 * @param pu128 Pointer to the 128-bit variable to update.
2473 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2474 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2475 * @param u64OldHi The high 64-bit of the value to compare with.
2476 * @param u64OldLo The low 64-bit of the value to compare with.
2477 * @param pu128Old Where to return the old value.
2478 *
2479 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2480 */
2481# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2482DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2483 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2484# else
2485DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2486 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2487{
2488# if RT_INLINE_ASM_USES_INTRIN
    /* The comparand is handed to the intrinsic through memory, so it is
       first written to *pu128Old and a pointer to its low half is passed as
       the last argument.  The compile time assertion pins Lo at offset 0 so
       that this pointer addresses the whole 128-bit value.
       NOTE(review): the intrinsic is expected to leave the value it found
       in that buffer - confirm against the compiler documentation. */
2489 pu128Old->Hi = u64OldHi;
2490 pu128Old->Lo = u64OldLo;
2491 AssertCompileMemberOffset(uint128_t, Lo, 0);
2492 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2493
2494# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
    /* Assemble the 128-bit comparand and new value from their halves and
       use the compiler builtin; success is the returned value matching the
       comparand. */
2495 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2496 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2497 *pu128Old = uOld;
2498 return uCmp == uOld;
2499
2500# elif defined(RT_ARCH_AMD64)
2501# if RT_INLINE_ASM_GNU_STYLE
    /* cmpxchg16b: rdx:rax carries the comparand in and the value found
       out, rcx:rbx carries the new value; setz captures the zero flag as
       the return value. */
2502 uint8_t bRet;
2503 uint64_t u64RetHi, u64RetLo;
2504 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2505 "setz %b0\n\t"
2506 : "=r" (bRet)
2507 , "=a" (u64RetLo)
2508 , "=d" (u64RetHi)
2509 , "+m" (*pu128)
2510 : "a" (u64OldLo)
2511 , "d" (u64OldHi)
2512 , "b" (u64NewLo)
2513 , "c" (u64NewHi)
2514 : "cc");
2515 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2516 return (bool)bRet;
2517# else
2518# error "Port me"
2519# endif
2520# else
2521# error "Port me"
2522# endif
2523}
2524# endif
2525
2526
2527/**
2528 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2529 *
2530 * @returns true if exchange was done.
2531 * @returns false if exchange wasn't done.
2532 *
2533 * @param pu128 Pointer to the 128-bit variable to update.
2534 * @param u128New The 128-bit value to assign to *pu128.
2535 * @param u128Old The value to compare with.
2536 * @param pu128Old Where to return the old value.
2537 *
2538 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2539 */
2540DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2541 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2542{
2543# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2544# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
    /* Native 128-bit integers plus the compiler builtin: do the exchange
       directly and report success when the value seen equals the comparand. */
2545 uint128_t const uSeen = __sync_val_compare_and_swap(pu128, u128Old, u128New);
    bool const fExchanged = uSeen == u128Old;
2546 *pu128Old = uSeen;
2547 return fExchanged;
2548# else
    /* Native 128-bit integers: split both values into halves for the v2 worker. */
    uint64_t const u64NewHi = (uint64_t)(u128New >> 64);
    uint64_t const u64NewLo = (uint64_t)u128New;
    uint64_t const u64OldHi = (uint64_t)(u128Old >> 64);
    uint64_t const u64OldLo = (uint64_t)u128Old;
2549 return ASMAtomicCmpXchgU128v2(pu128, u64NewHi, u64NewLo,
2550 u64OldHi, u64OldLo, pu128Old);
2551# endif
2552# else
    /* uint128_t is a structure here: pass its Hi/Lo members to the v2 worker. */
2553 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2554# endif
2555}
2556
2557
2558/**
2559 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2560 */
2561DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2562 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2563{
    /* Both workers operate on the 'u' member of the unions. */
    volatile uint128_t * const pu128Target = &pu128->u;
    uint128_t * const pu128Seen = &pu128Old->u;
2564# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2565 return ASMAtomicCmpXchgU128(pu128Target, u128New.u, u128Old.u, pu128Seen);
2566# else
2567 return ASMAtomicCmpXchgU128v2(pu128Target, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, pu128Seen);
2568# endif
2569}
2570
2571#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2572
2573
2574
2575/** @def ASMAtomicCmpXchgExHandle
2576 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2577 *
2578 * @param ph Pointer to the value to update.
2579 * @param hNew The new value to assign to *ph.
2580 * @param hOld The old value to compare *ph with.
2581 * @param fRc Where to store the result.
2582 * @param phOldVal Pointer to where to store the old value.
2583 *
2584 * @remarks This doesn't currently work for all handles (like RTFILE).
2585 */
2586#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
/* Handles must be exactly 32 bits wide here (checked at compile time) and
   are exchanged with the U32 worker.  Every macro argument is parenthesised
   where it is used, so pointer expressions such as 'p + 1' work as ph or
   phOldVal. */
2587# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2588 do { \
2589 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
2590 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint32_t)); \
2591 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2592 } while (0)
2593#elif HC_ARCH_BITS == 64
/* Handles must be exactly 64 bits wide here (checked at compile time) and
   are exchanged with the U64 worker. */
2594# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2595 do { \
2596 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2597 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2598 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2599 } while (0)
2600#else
2601# error HC_ARCH_BITS
2602#endif
2603
2604
2605/** @def ASMAtomicCmpXchgExSize
2606 * Atomically Compare and Exchange a value which size might differ
2607 * between platforms or compilers. Additionally passes back old value.
2608 *
2609 * @param pu Pointer to the value to update.
2610 * @param uNew The new value to assigned to *pu.
2611 * @param uOld The old value to *pu compare with.
2612 * @param fRc Where to store the result.
2613 * @param puOldVal Pointer to where to store the old value.
2614 *
2615 * @remarks x86: Requires a 486 or later.
2616 */
2617#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
2618 do { \
    /* Dispatch on the size of *pu: 4 and 8 byte operands go to the U32 and \
       U64 workers, with puOldVal receiving the old value.  Any other size \
       asserts, sets fRc to false and zeroes *puOldVal. */ \
2619 switch (sizeof(*(pu))) { \
2620 case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(puOldVal)); \
2621 break; \
2622 case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(puOldVal)); \
2623 break; \
2624 default: AssertMsgFailed(("ASMAtomicCmpXchgExSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
2625 (fRc) = false; \
2626 *(puOldVal) = 0; \
2627 break; \
2628 } \
2629 } while (0)
2630
2631
2632/**
2633 * Atomically Compare and Exchange a pointer value, additionally
2634 * passing back old value, ordered.
2635 *
2636 * @returns true if xchg was done.
2637 * @returns false if xchg wasn't done.
2638 *
2639 * @param ppv Pointer to the value to update.
2640 * @param pvNew The new value to assigned to *ppv.
2641 * @param pvOld The old value to *ppv compare with.
2642 * @param ppvOld Pointer store the old value at.
2643 *
2644 * @remarks x86: Requires a 486 or later.
2645 */
2646DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2647 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2648{
    /* Pointers are exchanged as plain integers of the architecture's width. */
2649#if ARCH_BITS == 32 || ARCH_BITS == 16
    volatile uint32_t RT_FAR * const pu32Target = (volatile uint32_t RT_FAR *)(void RT_FAR *)ppv;
    uint32_t RT_FAR * const pu32Seen = (uint32_t RT_FAR *)ppvOld;
2650 return ASMAtomicCmpXchgExU32(pu32Target, (uint32_t)pvNew, (uint32_t)pvOld, pu32Seen);
2651#elif ARCH_BITS == 64
    volatile uint64_t RT_FAR * const pu64Target = (volatile uint64_t RT_FAR *)(void RT_FAR *)ppv;
    uint64_t RT_FAR * const pu64Seen = (uint64_t RT_FAR *)ppvOld;
2652 return ASMAtomicCmpXchgExU64(pu64Target, (uint64_t)pvNew, (uint64_t)pvOld, pu64Seen);
2653#else
2654# error "ARCH_BITS is bogus"
2655#endif
2656}
2657
2658
2659/**
2660 * Atomically Compare and Exchange a pointer value, additionally
2661 * passing back old value, ordered.
2662 *
2663 * @returns true if xchg was done.
2664 * @returns false if xchg wasn't done.
2665 *
2666 * @param ppv Pointer to the value to update.
2667 * @param pvNew The new value to assigned to *ppv.
2668 * @param pvOld The old value to *ppv compare with.
2669 * @param ppvOld Pointer store the old value at.
2670 *
2671 * @remarks This is relatively type safe on GCC platforms.
2672 * @remarks x86: Requires a 486 or later.
2673 */
2674#ifdef __GNUC__
/* GCC flavour: a statement expression that first assigns every argument to
   a temporary whose type is derived from *(ppv).  A pvNew, pvOld or ppvOld
   of an unrelated pointer type therefore gets diagnosed by the compiler
   before everything is cast to void pointers for the worker.  The value of
   the expression is the worker's return value. */
2675# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
2676 __extension__ \
2677 ({\
2678 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
2679 __typeof__(*(ppv)) const pvNewTypeChecked = (pvNew); \
2680 __typeof__(*(ppv)) const pvOldTypeChecked = (pvOld); \
2681 __typeof__(*(ppv)) * const ppvOldTypeChecked = (ppvOld); \
2682 bool fMacroRet = ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvTypeChecked, \
2683 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked, \
2684 (void **)ppvOldTypeChecked); \
2685 fMacroRet; \
2686 })
2687#else
/* Other compilers: plain casts to the worker's parameter types, no type checking. */
2688# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
2689 ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld), (void RT_FAR * RT_FAR *)(ppvOld))
2690#endif
2691
2692
2693/**
2694 * Virtualization unfriendly serializing instruction, always exits.
2695 */
2696#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2697RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
2698#else
2699DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
2700{
    /* Executes CPUID with eax = 0 and discards the results. */
2701# if RT_INLINE_ASM_GNU_STYLE
    /* xAX = 0 goes in through the matching "0" constraint.  The registers
       CPUID overwrites are listed as clobbers, together with "memory". */
2702 RTCCUINTREG xAX = 0;
2703# ifdef RT_ARCH_AMD64
2704 __asm__ __volatile__ ("cpuid"
2705 : "=a" (xAX)
2706 : "0" (xAX)
2707 : "rbx", "rcx", "rdx", "memory");
2708# elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
    /* PIC on i386: ebx is saved and restored by hand instead of being listed as a clobber. */
2709 __asm__ __volatile__ ("push %%ebx\n\t"
2710 "cpuid\n\t"
2711 "pop %%ebx\n\t"
2712 : "=a" (xAX)
2713 : "0" (xAX)
2714 : "ecx", "edx", "memory");
2715# else
2716 __asm__ __volatile__ ("cpuid"
2717 : "=a" (xAX)
2718 : "0" (xAX)
2719 : "ebx", "ecx", "edx", "memory");
2720# endif
2721
2722# elif RT_INLINE_ASM_USES_INTRIN
    /* Intrinsic flavour: aInfo only exists to receive the unused output. */
2723 int aInfo[4];
2724 _ReadWriteBarrier();
2725 __cpuid(aInfo, 0);
2726
2727# else
    /* Plain inline assembly: ebx is preserved around the instruction. */
2728 __asm
2729 {
2730 push ebx
2731 xor eax, eax
2732 cpuid
2733 pop ebx
2734 }
2735# endif
2736}
2737#endif
2738
2739/**
2740 * Virtualization friendly serializing instruction, though more expensive.
2741 */
2742#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2743RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
2744#else
2745DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
2746{
    /* Builds an interrupt return frame on the stack whose return address is
       the label directly after the IRET instruction, then executes IRET. */
2747# if RT_INLINE_ASM_GNU_STYLE
2748# ifdef RT_ARCH_AMD64
    /* Frame pushed here: SS, the original RSP (kept in r10), RFLAGS, CS and
       the address of label 1.  RSP is lowered by 128 bytes first (red zone)
       so that the pushes land below that area; the iretq then reloads the
       original RSP from the frame. */
2749 __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
2750 "subq $128, %%rsp\n\t" /*redzone*/
2751 "mov %%ss, %%eax\n\t"
2752 "pushq %%rax\n\t"
2753 "pushq %%r10\n\t"
2754 "pushfq\n\t"
2755 "movl %%cs, %%eax\n\t"
2756 "pushq %%rax\n\t"
2757 "leaq 1f(%%rip), %%rax\n\t"
2758 "pushq %%rax\n\t"
2759 "iretq\n\t"
2760 "1:\n\t"
2761 ::: "rax", "r10", "memory", "cc");
2762# else
    /* Frame pushed here: EFLAGS, CS and the address of label 1. */
2763 __asm__ __volatile__ ("pushfl\n\t"
2764 "pushl %%cs\n\t"
2765 "pushl $1f\n\t"
2766 "iretl\n\t"
2767 "1:\n\t"
2768 ::: "memory");
2769# endif
2770
2771# else
    /* Plain inline assembly, same frame as the 32-bit GNU variant. */
2772 __asm
2773 {
2774 pushfd
2775 push cs
2776 push la_ret
2777 iretd
2778 la_ret:
2779 }
2780# endif
2781}
2782#endif
2783
2784/**
2785 * Virtualization friendlier serializing instruction, may still cause exits.
2786 */
2787#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2788RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
2789#else
2790DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
2791{
    /* Executes RDTSCP and throws away everything it returns. */
2792# if RT_INLINE_ASM_GNU_STYLE
    /* The instruction is emitted as raw bytes; the commented-out lines show
       the mnemonic form those bytes stand in for. */
2793 /* rdtscp is not supported by ancient linux build VM of course :-( */
2794# ifdef RT_ARCH_AMD64
2795 /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx", "rcx"); */
2796 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
2797# else
2798 /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx", "ecx"); */
2799 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
2800# endif
2801# else
2802# if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
    /* Intrinsic flavour: uIgnore only exists to receive the unused auxiliary output. */
2803 uint32_t uIgnore;
2804 _ReadWriteBarrier();
2805 (void)__rdtscp(&uIgnore);
2806 (void)uIgnore;
2807# else
2808 __asm
2809 {
2810 rdtscp
2811 }
2812# endif
2813# endif
2814}
2815#endif
2816
2817
2818/**
2819 * Serialize Instruction (both data store and instruction flush).
2820 */
/* Selects the implementation of ASMSerializeInstruction() per target:
   16-bit x86 and guest code use the IRET variant, other x86/AMD64 code the
   CPUID variant, SPARC64 an out-of-line function and ARM an inline barrier. */
2821#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
2822# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
2823#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
2824# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
2825#elif defined(RT_ARCH_SPARC64)
2826RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
2827#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2828DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
2829{
    /* Emits whatever RTASM_ARM_DSB_SY expands to, with RTASM_ARM_DSB_SY_IN_REG
       supplying its input operands (both macros are defined elsewhere). */
2830 __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
2831}
2832#else
2833# error "Port me"
2834#endif
2835
2836
2837/**
2838 * Memory fence, waits for any pending writes and reads to complete.
2839 * @note No implicit compiler barrier (which is probably stupid).
2840 */
2841DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
2842{
2843#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
    /* The fence instruction is hand-encoded as bytes in the two assembly
       variants; the intrinsic variant uses _mm_mfence() for the same job.
       The asm statements carry no "memory" clobber. */
2844# if RT_INLINE_ASM_GNU_STYLE
2845 __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
2846# elif RT_INLINE_ASM_USES_INTRIN
2847 _mm_mfence();
2848# else
2849 __asm
2850 {
2851 _emit 0x0f
2852 _emit 0xae
2853 _emit 0xf0
2854 }
2855# endif
2856#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2857 __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
2858#elif ARCH_BITS == 16
    /* No fence instruction used here: do an atomic exchange on a dummy local instead. */
2859 uint16_t volatile u16;
2860 ASMAtomicXchgU16(&u16, 0);
2861#else
    /* Same fallback with a 32-bit dummy (also used for x86 with RT_WITH_OLD_CPU_SUPPORT). */
2862 uint32_t volatile u32;
2863 ASMAtomicXchgU32(&u32, 0);
2864#endif
2865}
2866
2867
2868/**
2869 * Write fence, waits for any pending writes to complete.
2870 * @note No implicit compiler barrier (which is probably stupid).
2871 */
2872DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
2873{
2874#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2875# if RT_INLINE_ASM_GNU_STYLE
2876 __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
2877# elif RT_INLINE_ASM_USES_INTRIN
2878 _mm_sfence();
2879# else
2880 __asm
2881 {
2882 _emit 0x0f
2883 _emit 0xae
2884 _emit 0xf8
2885 }
2886# endif
2887#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2888 __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
2889#else
2890 ASMMemoryFence();
2891#endif
2892}
2893
2894
2895/**
2896 * Read fence, waits for any pending reads to complete.
2897 * @note No implicit compiler barrier (which is probably stupid).
2898 */
2899DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
2900{
2901#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2902# if RT_INLINE_ASM_GNU_STYLE
2903 __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
2904# elif RT_INLINE_ASM_USES_INTRIN
2905 _mm_lfence();
2906# else
2907 __asm
2908 {
2909 _emit 0x0f
2910 _emit 0xae
2911 _emit 0xe8
2912 }
2913# endif
2914#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2915 __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
2916#else
2917 ASMMemoryFence();
2918#endif
2919}
2920
2921
2922/**
2923 * Atomically reads an unsigned 8-bit value, ordered.
2924 *
2925 * @returns Current *pu8 value
2926 * @param pu8 Pointer to the 8-bit variable to read.
2927 */
2928DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2929{
2930#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2931 uint32_t u32;
2932# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
2933 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2934 RTASM_ARM_DMB_SY
2935 "casab %w[uDst], wzr, %[pMem]\n\t"
2936 : [uDst] "=&r" (u32)
2937 : [pMem] "Q" (*pu8),
2938 "0" (0)
2939 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2940# else
2941 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2942 RTASM_ARM_DMB_SY
2943# if defined(RT_ARCH_ARM64)
2944# if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
2945 "ldurb %w[uDst], %[pMem]\n\t"
2946# else
2947 "ldxrb %w[uDst], %[pMem]\n\t"
2948 "clrex\n\t"
2949# endif
2950# else
2951 "ldrexb %[uDst], %[pMem]\n\t"
2952 /** @todo clrex */
2953# endif
2954 : [uDst] "=&r" (u32)
2955 : [pMem] "Q" (*pu8)
2956 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2957# endif
2958 return (uint8_t)u32;
2959#else
2960 ASMMemoryFence();
2961 return *pu8; /* byte reads are atomic on x86 */
2962#endif
2963}
2964
2965
2966/**
2967 * Atomically reads an unsigned 8-bit value, unordered.
2968 *
2969 * @returns Current *pu8 value
2970 * @param pu8 Pointer to the 8-bit variable to read.
2971 */
2972DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2973{
2974#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2975 uint32_t u32;
2976 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
2977# if defined(RT_ARCH_ARM64)
2978 "ldurb %w[uDst], %[pMem]\n\t"
2979# else
2980 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
2981# endif
2982 : [uDst] "=&r" (u32)
2983 : [pMem] "Q" (*pu8));
2984 return (uint8_t)u32;
2985#else
2986 return *pu8; /* byte reads are atomic on x86 */
2987#endif
2988}
2989
2990
2991/**
2992 * Atomically reads a signed 8-bit value, ordered.
2993 *
2994 * @returns Current *pi8 value
2995 * @param pi8 Pointer to the 8-bit variable to read.
2996 */
2997DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
2998{
2999#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3000 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3001#else
3002 ASMMemoryFence();
3003 return *pi8; /* byte reads are atomic on x86 */
3004#endif
3005}
3006
3007
3008/**
3009 * Atomically reads a signed 8-bit value, unordered.
3010 *
3011 * @returns Current *pi8 value
3012 * @param pi8 Pointer to the 8-bit variable to read.
3013 */
3014DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3015{
3016#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3017 int32_t i32;
3018 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
3019# if defined(RT_ARCH_ARM64)
3020 "ldurb %w[iDst], %[pMem]\n\t"
3021# else
3022 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
3023# endif
3024 : [iDst] "=&r" (i32)
3025 : [pMem] "Q" (*pi8));
3026 return (int8_t)i32;
3027#else
3028 return *pi8; /* byte reads are atomic on x86 */
3029#endif
3030}
3031
3032
3033/**
3034 * Atomically reads an unsigned 16-bit value, ordered.
3035 *
3036 * @returns Current *pu16 value
3037 * @param pu16 Pointer to the 16-bit variable to read.
3038 */
3039DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3040{
3041 Assert(!((uintptr_t)pu16 & 1));
3042#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3043 uint32_t u32;
3044# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3045 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3046 RTASM_ARM_DMB_SY
3047 "casah %w[uDst], wzr, %[pMem]\n\t"
3048 : [uDst] "=&r" (u32)
3049 : [pMem] "Q" (*pu16),
3050 "0" (0)
3051 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3052# else
3053 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3054 RTASM_ARM_DMB_SY
3055# if defined(RT_ARCH_ARM64)
3056# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3057 "ldurh %w[uDst], %[pMem]\n\t"
3058# else
3059 "ldxrh %w[uDst], %[pMem]\n\t"
3060 "clrex\n\t"
3061# endif
3062# else
3063 "ldrexh %[uDst], %[pMem]\n\t"
3064 /** @todo clrex */
3065# endif
3066 : [uDst] "=&r" (u32)
3067 : [pMem] "Q" (*pu16)
3068 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3069# endif
3070 return (uint16_t)u32;
3071#else
3072 ASMMemoryFence();
3073 return *pu16;
3074#endif
3075}
3076
3077
3078/**
3079 * Atomically reads an unsigned 16-bit value, unordered.
3080 *
3081 * @returns Current *pu16 value
3082 * @param pu16 Pointer to the 16-bit variable to read.
3083 */
3084DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3085{
3086 Assert(!((uintptr_t)pu16 & 1));
3087#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3088 uint32_t u32;
3089 __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
3090# if defined(RT_ARCH_ARM64)
3091 "ldurh %w[uDst], %[pMem]\n\t"
3092# else
3093 "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
3094# endif
3095 : [uDst] "=&r" (u32)
3096 : [pMem] "Q" (*pu16));
3097 return (uint16_t)u32;
3098#else
3099 return *pu16;
3100#endif
3101}
3102
3103
3104/**
3105 * Atomically reads a signed 16-bit value, ordered.
3106 *
3107 * @returns Current *pi16 value
3108 * @param pi16 Pointer to the 16-bit variable to read.
3109 */
3110DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3111{
3112 Assert(!((uintptr_t)pi16 & 1));
3113#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3114 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3115#else
3116 ASMMemoryFence();
3117 return *pi16;
3118#endif
3119}
3120
3121
3122/**
3123 * Atomically reads a signed 16-bit value, unordered.
3124 *
3125 * @returns Current *pi16 value
3126 * @param pi16 Pointer to the 16-bit variable to read.
3127 */
3128DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3129{
3130 Assert(!((uintptr_t)pi16 & 1));
3131#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3132 int32_t i32;
3133 __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
3134# if defined(RT_ARCH_ARM64)
3135 "ldurh %w[iDst], %[pMem]\n\t"
3136# else
3137 "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
3138# endif
3139 : [iDst] "=&r" (i32)
3140 : [pMem] "Q" (*pi16));
3141 return (int16_t)i32;
3142#else
3143 return *pi16;
3144#endif
3145}
3146
3147
3148/**
3149 * Atomically reads an unsigned 32-bit value, ordered.
3150 *
3151 * @returns Current *pu32 value
3152 * @param pu32 Pointer to the 32-bit variable to read.
3153 */
3154DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3155{
3156 Assert(!((uintptr_t)pu32 & 3));
3157#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3158 uint32_t u32;
3159# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3160 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3161 RTASM_ARM_DMB_SY
3162 "casa %w[uDst], wzr, %[pMem]\n\t"
3163 : [uDst] "=&r" (u32)
3164 : [pMem] "Q" (*pu32),
3165 "0" (0)
3166 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3167# else
3168 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3169 RTASM_ARM_DMB_SY
3170# if defined(RT_ARCH_ARM64)
3171# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3172 "ldur %w[uDst], %[pMem]\n\t"
3173# else
3174 "ldxr %w[uDst], %[pMem]\n\t"
3175 "clrex\n\t"
3176# endif
3177# else
3178 "ldrex %[uDst], %[pMem]\n\t"
3179 /** @todo clrex */
3180# endif
3181 : [uDst] "=&r" (u32)
3182 : [pMem] "Q" (*pu32)
3183 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3184# endif
3185 return u32;
3186#else
3187 ASMMemoryFence();
3188# if ARCH_BITS == 16
3189 AssertFailed(); /** @todo 16-bit */
3190# endif
3191 return *pu32;
3192#endif
3193}
3194
3195
3196/**
3197 * Atomically reads an unsigned 32-bit value, unordered.
3198 *
3199 * @returns Current *pu32 value
3200 * @param pu32 Pointer to the 32-bit variable to read.
3201 */
3202DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3203{
3204 Assert(!((uintptr_t)pu32 & 3));
3205#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3206 uint32_t u32;
3207 __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
3208# if defined(RT_ARCH_ARM64)
3209 "ldur %w[uDst], %[pMem]\n\t"
3210# else
3211 "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
3212# endif
3213 : [uDst] "=&r" (u32)
3214 : [pMem] "Q" (*pu32));
3215 return u32;
3216#else
3217# if ARCH_BITS == 16
3218 AssertFailed(); /** @todo 16-bit */
3219# endif
3220 return *pu32;
3221#endif
3222}
3223
3224
3225/**
3226 * Atomically reads a signed 32-bit value, ordered.
3227 *
3228 * @returns Current *pi32 value
3229 * @param pi32 Pointer to the 32-bit variable to read.
3230 */
3231DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3232{
3233 Assert(!((uintptr_t)pi32 & 3));
3234#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3235 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3236#else
3237 ASMMemoryFence();
3238# if ARCH_BITS == 16
3239 AssertFailed(); /** @todo 16-bit */
3240# endif
3241 return *pi32;
3242#endif
3243}
3244
3245
3246/**
3247 * Atomically reads a signed 32-bit value, unordered.
3248 *
3249 * @returns Current *pi32 value
3250 * @param pi32 Pointer to the 32-bit variable to read.
3251 */
3252DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3253{
3254 Assert(!((uintptr_t)pi32 & 3));
3255#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3256 int32_t i32;
3257 __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
3258# if defined(RT_ARCH_ARM64)
3259 "ldur %w[iDst], %[pMem]\n\t"
3260# else
3261 "ldrex %[iDst], %[pMem]\n\t" /** @todo thix this */
3262# endif
3263 : [iDst] "=&r" (i32)
3264 : [pMem] "Q" (*pi32));
3265 return i32;
3266
3267#else
3268# if ARCH_BITS == 16
3269 AssertFailed(); /** @todo 16-bit */
3270# endif
3271 return *pi32;
3272#endif
3273}
3274
3275
3276/**
3277 * Atomically reads an unsigned 64-bit value, ordered.
3278 *
3279 * @returns Current *pu64 value
3280 * @param pu64 Pointer to the 64-bit variable to read.
3281 * The memory pointed to must be writable.
3282 *
3283 * @remarks This may fault if the memory is read-only!
3284 * @remarks x86: Requires a Pentium or later.
3285 */
3286#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
3287 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
3288RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3289#else
3290DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3291{
3292 uint64_t u64;
3293# ifdef RT_ARCH_AMD64
3294 Assert(!((uintptr_t)pu64 & 7));
3295/*# if RT_INLINE_ASM_GNU_STYLE
3296 __asm__ __volatile__( "mfence\n\t"
3297 "movq %1, %0\n\t"
3298 : "=r" (u64)
3299 : "m" (*pu64));
3300# else
3301 __asm
3302 {
3303 mfence
3304 mov rdx, [pu64]
3305 mov rax, [rdx]
3306 mov [u64], rax
3307 }
3308# endif*/
3309 ASMMemoryFence();
3310 u64 = *pu64;
3311
3312# elif defined(RT_ARCH_X86)
3313# if RT_INLINE_ASM_GNU_STYLE
3314# if defined(PIC) || defined(__PIC__)
3315 uint32_t u32EBX = 0;
3316 Assert(!((uintptr_t)pu64 & 7));
3317 __asm__ __volatile__("xchgl %%ebx, %3\n\t"
3318 "lock; cmpxchg8b (%5)\n\t"
3319 "movl %3, %%ebx\n\t"
3320 : "=A" (u64)
3321# if RT_GNUC_PREREQ(4, 3)
3322 , "+m" (*pu64)
3323# else
3324 , "=m" (*pu64)
3325# endif
3326 : "0" (0ULL)
3327 , "m" (u32EBX)
3328 , "c" (0)
3329 , "S" (pu64)
3330 : "cc");
3331# else /* !PIC */
3332 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3333 : "=A" (u64)
3334 , "+m" (*pu64)
3335 : "0" (0ULL)
3336 , "b" (0)
3337 , "c" (0)
3338 : "cc");
3339# endif
3340# else
3341 Assert(!((uintptr_t)pu64 & 7));
3342 __asm
3343 {
3344 xor eax, eax
3345 xor edx, edx
3346 mov edi, pu64
3347 xor ecx, ecx
3348 xor ebx, ebx
3349 lock cmpxchg8b [edi]
3350 mov dword ptr [u64], eax
3351 mov dword ptr [u64 + 4], edx
3352 }
3353# endif
3354
3355# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3356 Assert(!((uintptr_t)pu64 & 7));
3357
3358# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3359 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3360 RTASM_ARM_DMB_SY
3361 "casa %[uDst], xzr, %[pMem]\n\t"
3362 : [uDst] "=&r" (u64)
3363 : [pMem] "Q" (*pu64),
3364 "0" (0)
3365 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3366# else
3367 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3368 RTASM_ARM_DMB_SY
3369# if defined(RT_ARCH_ARM64)
3370# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3371 "ldur %[uDst], %[pMem]\n\t"
3372# else
3373 "ldxr %[uDst], %[pMem]\n\t"
3374 "clrex\n\t"
3375# endif
3376# else
3377 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
3378 /** @todo clrex */
3379# endif
3380 : [uDst] "=&r" (u64)
3381 : [pMem] "Q" (*pu64)
3382 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3383# endif
3384# else
3385# error "Port me"
3386# endif
3387 return u64;
3388}
3389#endif
3390
3391
3392/**
3393 * Atomically reads an unsigned 64-bit value, unordered.
3394 *
3395 * @returns Current *pu64 value
3396 * @param pu64 Pointer to the 64-bit variable to read.
3397 * The memory pointed to must be writable.
3398 *
3399 * @remarks This may fault if the memory is read-only!
3400 * @remarks x86: Requires a Pentium or later.
3401 */
3402#if !defined(RT_ARCH_AMD64) \
3403 && ( (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
3404 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
3405RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3406#else
3407DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3408{
3409 uint64_t u64;
3410# ifdef RT_ARCH_AMD64
3411 Assert(!((uintptr_t)pu64 & 7));
3412/*# if RT_INLINE_ASM_GNU_STYLE
3413 Assert(!((uintptr_t)pu64 & 7));
3414 __asm__ __volatile__("movq %1, %0\n\t"
3415 : "=r" (u64)
3416 : "m" (*pu64));
3417# else
3418 __asm
3419 {
3420 mov rdx, [pu64]
3421 mov rax, [rdx]
3422 mov [u64], rax
3423 }
3424# endif */
3425 u64 = *pu64;
3426
3427# elif defined(RT_ARCH_X86)
3428# if RT_INLINE_ASM_GNU_STYLE
3429# if defined(PIC) || defined(__PIC__)
3430 uint32_t u32EBX = 0;
3431 uint32_t u32Spill;
3432 Assert(!((uintptr_t)pu64 & 7));
3433 __asm__ __volatile__("xor %%eax,%%eax\n\t"
3434 "xor %%ecx,%%ecx\n\t"
3435 "xor %%edx,%%edx\n\t"
3436 "xchgl %%ebx, %3\n\t"
3437 "lock; cmpxchg8b (%4)\n\t"
3438 "movl %3, %%ebx\n\t"
3439 : "=A" (u64)
3440# if RT_GNUC_PREREQ(4, 3)
3441 , "+m" (*pu64)
3442# else
3443 , "=m" (*pu64)
3444# endif
3445 , "=c" (u32Spill)
3446 : "m" (u32EBX)
3447 , "S" (pu64)
3448 : "cc");
3449# else /* !PIC */
3450 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3451 : "=A" (u64)
3452 , "+m" (*pu64)
3453 : "0" (0ULL)
3454 , "b" (0)
3455 , "c" (0)
3456 : "cc");
3457# endif
3458# else
3459 Assert(!((uintptr_t)pu64 & 7));
3460 __asm
3461 {
3462 xor eax, eax
3463 xor edx, edx
3464 mov edi, pu64
3465 xor ecx, ecx
3466 xor ebx, ebx
3467 lock cmpxchg8b [edi]
3468 mov dword ptr [u64], eax
3469 mov dword ptr [u64 + 4], edx
3470 }
3471# endif
3472
3473# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3474 Assert(!((uintptr_t)pu64 & 7));
3475 __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
3476# if defined(RT_ARCH_ARM64)
3477 "ldur %[uDst], %[pMem]\n\t"
3478# else
3479 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
3480 /** @todo clrex? */
3481# endif
3482 : [uDst] "=&r" (u64)
3483 : [pMem] "Q" (*pu64));
3484
3485# else
3486# error "Port me"
3487# endif
3488 return u64;
3489}
3490#endif
3491
3492
3493/**
3494 * Atomically reads a signed 64-bit value, ordered.
3495 *
3496 * @returns Current *pi64 value
3497 * @param pi64 Pointer to the 64-bit variable to read.
3498 * The memory pointed to must be writable.
3499 *
3500 * @remarks This may fault if the memory is read-only!
3501 * @remarks x86: Requires a Pentium or later.
3502 */
3503DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3504{
3505 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3506}
3507
3508
3509/**
3510 * Atomically reads a signed 64-bit value, unordered.
3511 *
3512 * @returns Current *pi64 value
3513 * @param pi64 Pointer to the 64-bit variable to read.
3514 * The memory pointed to must be writable.
3515 *
3516 * @remarks This will fault if the memory is read-only!
3517 * @remarks x86: Requires a Pentium or later.
3518 */
3519DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3520{
3521 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3522}
3523
3524
3525/**
3526 * Atomically reads a size_t value, ordered.
3527 *
3528 * @returns Current *pcb value
3529 * @param pcb Pointer to the size_t variable to read.
3530 */
3531DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3532{
3533#if ARCH_BITS == 64
3534 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3535#elif ARCH_BITS == 32
3536 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3537#elif ARCH_BITS == 16
3538 AssertCompileSize(size_t, 2);
3539 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3540#else
3541# error "Unsupported ARCH_BITS value"
3542#endif
3543}
3544
3545
3546/**
3547 * Atomically reads a size_t value, unordered.
3548 *
3549 * @returns Current *pcb value
3550 * @param pcb Pointer to the size_t variable to read.
3551 */
3552DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3553{
3554#if ARCH_BITS == 64 || ARCH_BITS == 16
3555 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3556#elif ARCH_BITS == 32
3557 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3558#elif ARCH_BITS == 16
3559 AssertCompileSize(size_t, 2);
3560 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3561#else
3562# error "Unsupported ARCH_BITS value"
3563#endif
3564}
3565
3566
3567/**
3568 * Atomically reads a pointer value, ordered.
3569 *
3570 * @returns Current *pv value
3571 * @param ppv Pointer to the pointer variable to read.
3572 *
3573 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
3574 * requires less typing (no casts).
3575 */
3576DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3577{
3578#if ARCH_BITS == 32 || ARCH_BITS == 16
3579 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3580#elif ARCH_BITS == 64
3581 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3582#else
3583# error "ARCH_BITS is bogus"
3584#endif
3585}
3586
/**
 * Typed convenience wrapper for ASMAtomicReadPtr that spares the caller the
 * casting.
 *
 * With GNU C the pointer argument is first assigned to a typed temporary, so
 * its type is checked by the compiler; other compilers get a plain cast.
 *
 * @returns The current value of *ppv, as @a Type.
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifndef __GNUC__
# define ASMAtomicReadPtrT(ppv, Type) \
    (Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
#else /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicReadPtrT(ppv, Type) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
    })
#endif
3606
3607
3608/**
3609 * Atomically reads a pointer value, unordered.
3610 *
3611 * @returns Current *pv value
3612 * @param ppv Pointer to the pointer variable to read.
3613 *
3614 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
3615 * requires less typing (no casts).
3616 */
3617DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3618{
3619#if ARCH_BITS == 32 || ARCH_BITS == 16
3620 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3621#elif ARCH_BITS == 64
3622 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3623#else
3624# error "ARCH_BITS is bogus"
3625#endif
3626}
3627
3628
/**
 * Typed convenience wrapper for ASMAtomicUoReadPtr that spares the caller the
 * casting.
 *
 * With GNU C the pointer argument is first assigned to a typed temporary, so
 * its type is checked by the compiler; other compilers get a plain cast.
 *
 * @returns The current value of *ppv, as @a Type.
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifndef __GNUC__
# define ASMAtomicUoReadPtrT(ppv, Type) \
    (Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
#else /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicUoReadPtrT(ppv, Type) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
    })
#endif
3648
3649
3650/**
3651 * Atomically reads a boolean value, ordered.
3652 *
3653 * @returns Current *pf value
3654 * @param pf Pointer to the boolean variable to read.
3655 */
3656DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3657{
3658 ASMMemoryFence();
3659 return *pf; /* byte reads are atomic on x86 */
3660}
3661
3662
3663/**
3664 * Atomically reads a boolean value, unordered.
3665 *
3666 * @returns Current *pf value
3667 * @param pf Pointer to the boolean variable to read.
3668 */
3669DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3670{
3671 return *pf; /* byte reads are atomic on x86 */
3672}
3673
3674
3675/**
3676 * Atomically read a typical IPRT handle value, ordered.
3677 *
3678 * @param ph Pointer to the handle variable to read.
3679 * @param phRes Where to store the result.
3680 *
3681 * @remarks This doesn't currently work for all handles (like RTFILE).
3682 */
3683#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
3684# define ASMAtomicReadHandle(ph, phRes) \
3685 do { \
3686 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
3687 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
3688 *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
3689 } while (0)
3690#elif HC_ARCH_BITS == 64
3691# define ASMAtomicReadHandle(ph, phRes) \
3692 do { \
3693 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
3694 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
3695 *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
3696 } while (0)
3697#else
3698# error HC_ARCH_BITS
3699#endif
3700
3701
3702/**
3703 * Atomically read a typical IPRT handle value, unordered.
3704 *
3705 * @param ph Pointer to the handle variable to read.
3706 * @param phRes Where to store the result.
3707 *
3708 * @remarks This doesn't currently work for all handles (like RTFILE).
3709 */
3710#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
3711# define ASMAtomicUoReadHandle(ph, phRes) \
3712 do { \
3713 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
3714 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
3715 *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
3716 } while (0)
3717#elif HC_ARCH_BITS == 64
3718# define ASMAtomicUoReadHandle(ph, phRes) \
3719 do { \
3720 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
3721 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
3722 *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
3723 } while (0)
3724#else
3725# error HC_ARCH_BITS
3726#endif
3727
3728
/**
 * Atomically reads a value whose size might differ between platforms or
 * compilers, ordered.
 *
 * Switches on sizeof(*(pu)) and forwards to the 8, 16, 32 or 64-bit ordered
 * read.  Any other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            /* sizeof yields a size_t; cast it so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
3746
3747
/**
 * Atomically reads a value whose size might differ between platforms or
 * compilers, unordered.
 *
 * Switches on sizeof(*(pu)) and forwards to the 8, 16, 32 or 64-bit unordered
 * read.  Any other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicUoReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            /* The message names this macro; sizeof is cast to match the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicUoReadSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
3765
3766
3767/**
3768 * Atomically writes an unsigned 8-bit value, ordered.
3769 *
3770 * @param pu8 Pointer to the 8-bit variable.
3771 * @param u8 The 8-bit value to assign to *pu8.
3772 */
3773DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3774{
3775#if defined(RT_ARCH_ARM64)
3776 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
3777 as all byte accesses are single-copy atomic, which I think suffices here. */
3778 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
3779# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
3780 RTASM_ARM_DMB_SY
3781 "swpb %w[uValue], wzr, %[pMem]\n\t"
3782# else
3783 RTASM_ARM_DMB_SY
3784 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3785# endif
3786 : [pMem] "+Q" (*pu8)
3787 : [uValue] "r" ((uint32_t)u8)
3788 : );
3789#else
3790 ASMAtomicXchgU8(pu8, u8);
3791#endif
3792}
3793
3794
3795/**
3796 * Atomically writes an unsigned 8-bit value, unordered.
3797 *
3798 * @param pu8 Pointer to the 8-bit variable.
3799 * @param u8 The 8-bit value to assign to *pu8.
3800 */
3801DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3802{
3803 *pu8 = u8; /* byte writes are atomic on x86 */
3804}
3805
3806
3807/**
3808 * Atomically writes a signed 8-bit value, ordered.
3809 *
3810 * @param pi8 Pointer to the 8-bit variable to read.
3811 * @param i8 The 8-bit value to assign to *pi8.
3812 */
3813DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3814{
3815#if defined(RT_ARCH_ARM64)
3816 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
3817#else
3818 ASMAtomicXchgS8(pi8, i8);
3819#endif
3820}
3821
3822
3823/**
3824 * Atomically writes a signed 8-bit value, unordered.
3825 *
3826 * @param pi8 Pointer to the 8-bit variable to write.
3827 * @param i8 The 8-bit value to assign to *pi8.
3828 */
3829DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3830{
3831 *pi8 = i8; /* byte writes are atomic on x86 */
3832}
3833
3834
3835/**
3836 * Atomically writes an unsigned 16-bit value, ordered.
3837 *
3838 * @param pu16 Pointer to the 16-bit variable to write.
3839 * @param u16 The 16-bit value to assign to *pu16.
3840 */
3841DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3842{
3843#if defined(RT_ARCH_ARM64)
3844 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
3845# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3846 RTASM_ARM_DMB_SY
3847 "swph %w[uValue], wzr, %[pMem]\n\t"
3848# else
3849 RTASM_ARM_DMB_SY
3850 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3851# endif
3852 : [pMem] "+Q" (*pu16)
3853 : [uValue] "r" ((uint32_t)u16)
3854 : );
3855#else
3856 ASMAtomicXchgU16(pu16, u16);
3857#endif
3858}
3859
3860
3861/**
3862 * Atomically writes an unsigned 16-bit value, unordered.
3863 *
3864 * @param pu16 Pointer to the 16-bit variable to write.
3865 * @param u16 The 16-bit value to assign to *pu16.
3866 */
3867DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3868{
3869 Assert(!((uintptr_t)pu16 & 1));
3870 *pu16 = u16;
3871}
3872
3873
3874/**
3875 * Atomically writes a signed 16-bit value, ordered.
3876 *
3877 * @param pi16 Pointer to the 16-bit variable to write.
3878 * @param i16 The 16-bit value to assign to *pi16.
3879 */
3880DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
3881{
3882#if defined(RT_ARCH_ARM64)
3883 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
3884#else
3885 ASMAtomicXchgS16(pi16, i16);
3886#endif
3887}
3888
3889
3890/**
3891 * Atomically writes a signed 16-bit value, unordered.
3892 *
3893 * @param pi16 Pointer to the 16-bit variable to write.
3894 * @param i16 The 16-bit value to assign to *pi16.
3895 */
3896DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
3897{
3898 Assert(!((uintptr_t)pi16 & 1));
3899 *pi16 = i16;
3900}
3901
3902
3903/**
3904 * Atomically writes an unsigned 32-bit value, ordered.
3905 *
3906 * @param pu32 Pointer to the 32-bit variable to write.
3907 * @param u32 The 32-bit value to assign to *pu32.
3908 */
3909DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
3910{
3911#if defined(RT_ARCH_ARM64)
3912 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
3913# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3914 RTASM_ARM_DMB_SY
3915 "swp %w[uValue], wzr, %[pMem]\n\t"
3916# else
3917 RTASM_ARM_DMB_SY
3918 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3919# endif
3920 : [pMem] "+Q" (*pu32)
3921 : [uValue] "r" (u32)
3922 : "cc");
3923#else
3924 ASMAtomicXchgU32(pu32, u32);
3925#endif
3926}
3927
3928
3929/**
3930 * Atomically writes an unsigned 32-bit value, unordered.
3931 *
3932 * @param pu32 Pointer to the 32-bit variable to write.
3933 * @param u32 The 32-bit value to assign to *pu32.
3934 */
3935DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
3936{
3937 Assert(!((uintptr_t)pu32 & 3));
3938#if ARCH_BITS >= 32
3939 *pu32 = u32;
3940#else
3941 ASMAtomicXchgU32(pu32, u32);
3942#endif
3943}
3944
3945
3946/**
3947 * Atomically writes a signed 32-bit value, ordered.
3948 *
3949 * @param pi32 Pointer to the 32-bit variable to write.
3950 * @param i32 The 32-bit value to assign to *pi32.
3951 */
3952DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
3953{
3954#if defined(RT_ARCH_ARM64)
3955 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
3956#else
3957 ASMAtomicXchgS32(pi32, i32);
3958#endif
3959}
3960
3961
3962/**
3963 * Atomically writes a signed 32-bit value, unordered.
3964 *
3965 * @param pi32 Pointer to the 32-bit variable to write.
3966 * @param i32 The 32-bit value to assign to *pi32.
3967 */
3968DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
3969{
3970 Assert(!((uintptr_t)pi32 & 3));
3971#if ARCH_BITS >= 32
3972 *pi32 = i32;
3973#else
3974 ASMAtomicXchgS32(pi32, i32);
3975#endif
3976}
3977
3978
3979/**
3980 * Atomically writes an unsigned 64-bit value, ordered.
3981 *
3982 * @param pu64 Pointer to the 64-bit variable to write.
3983 * @param u64 The 64-bit value to assign to *pu64.
3984 */
3985DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
3986{
3987#if defined(RT_ARCH_ARM64)
3988 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
3989# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3990 RTASM_ARM_DMB_SY
3991 "swp %[uValue], xzr, %[pMem]\n\t"
3992# else
3993 RTASM_ARM_DMB_SY /** @todo necessary? */
3994 "stlr %[uValue], %[pMem]\n\t"
3995# endif
3996 : [pMem] "+Q" (*pu64)
3997 : [uValue] "r" (u64)
3998 : );
3999#else
4000 ASMAtomicXchgU64(pu64, u64);
4001#endif
4002}
4003
4004
4005/**
4006 * Atomically writes an unsigned 64-bit value, unordered.
4007 *
4008 * @param pu64 Pointer to the 64-bit variable to write.
4009 * @param u64 The 64-bit value to assign to *pu64.
4010 */
4011DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4012{
4013 Assert(!((uintptr_t)pu64 & 7));
4014#if ARCH_BITS == 64
4015 *pu64 = u64;
4016#else
4017 ASMAtomicXchgU64(pu64, u64);
4018#endif
4019}
4020
4021
4022/**
4023 * Atomically writes a signed 64-bit value, ordered.
4024 *
4025 * @param pi64 Pointer to the 64-bit variable to write.
4026 * @param i64 The 64-bit value to assign to *pi64.
4027 */
4028DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4029{
4030#if defined(RT_ARCH_ARM64)
4031 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4032#else
4033 ASMAtomicXchgS64(pi64, i64);
4034#endif
4035}
4036
4037
4038/**
4039 * Atomically writes a signed 64-bit value, unordered.
4040 *
4041 * @param pi64 Pointer to the 64-bit variable to write.
4042 * @param i64 The 64-bit value to assign to *pi64.
4043 */
4044DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4045{
4046 Assert(!((uintptr_t)pi64 & 7));
4047#if ARCH_BITS == 64
4048 *pi64 = i64;
4049#else
4050 ASMAtomicXchgS64(pi64, i64);
4051#endif
4052}
4053
4054
4055/**
4056 * Atomically writes a size_t value, ordered.
4057 *
4058 * @param pcb Pointer to the size_t variable to write.
4059 * @param cb The value to assign to *pcb.
4060 */
4061DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4062{
4063#if ARCH_BITS == 64
4064 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4065#elif ARCH_BITS == 32
4066 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4067#elif ARCH_BITS == 16
4068 AssertCompileSize(size_t, 2);
4069 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4070#else
4071# error "Unsupported ARCH_BITS value"
4072#endif
4073}
4074
4075
4076/**
4077 * Atomically writes a size_t value, unordered.
4078 *
4079 * @param pcb Pointer to the size_t variable to write.
4080 * @param cb The value to assign to *pcb.
4081 */
4082DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4083{
4084#if ARCH_BITS == 64
4085 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4086#elif ARCH_BITS == 32
4087 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4088#elif ARCH_BITS == 16
4089 AssertCompileSize(size_t, 2);
4090 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4091#else
4092# error "Unsupported ARCH_BITS value"
4093#endif
4094}
4095
4096
4097/**
4098 * Atomically writes a boolean value, unordered.
4099 *
4100 * @param pf Pointer to the boolean variable to write.
4101 * @param f The boolean value to assign to *pf.
4102 */
4103DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4104{
4105 ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
4106}
4107
4108
4109/**
4110 * Atomically writes a boolean value, unordered.
4111 *
4112 * @param pf Pointer to the boolean variable to write.
4113 * @param f The boolean value to assign to *pf.
4114 */
4115DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4116{
4117 *pf = f; /* byte writes are atomic on x86 */
4118}
4119
4120
4121/**
4122 * Atomically writes a pointer value, ordered.
4123 *
4124 * @param ppv Pointer to the pointer variable to write.
4125 * @param pv The pointer value to assign to *ppv.
4126 */
4127DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4128{
4129#if ARCH_BITS == 32 || ARCH_BITS == 16
4130 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4131#elif ARCH_BITS == 64
4132 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4133#else
4134# error "ARCH_BITS is bogus"
4135#endif
4136}
4137
4138
4139/**
4140 * Atomically writes a pointer value, unordered.
4141 *
4142 * @param ppv Pointer to the pointer variable to write.
4143 * @param pv The pointer value to assign to *ppv.
4144 */
4145DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4146{
4147#if ARCH_BITS == 32 || ARCH_BITS == 16
4148 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4149#elif ARCH_BITS == 64
4150 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4151#else
4152# error "ARCH_BITS is bogus"
4153#endif
4154}
4155
4156
/**
 * Atomically writes a pointer value, ordered.
 *
 * Statement macro: compile-time checks that both the target and the value are
 * pointer sized, asserts natural alignment of the target, then hands the write
 * to ASMAtomicWritePtrVoid.  The arguments are parenthesised wherever they are
 * used so that compound expressions (e.g. @c pBase + 1) are handled correctly.
 *
 * @param   ppv     Pointer to the pointer variable to write.
 * @param   pv      The pointer value to assign to *ppv. If NULL use
 *                  ASMAtomicWriteNullPtr or you'll land in trouble.
 *
 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
 *          NULL.
 * @remarks @a ppv is expanded more than once; avoid arguments with side
 *          effects.
 */
#ifdef __GNUC__
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
        __typeof__(*(ppv)) const pvTypeChecked = (pv); \
        \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
    } while (0)
#else
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
    } while (0)
#endif
4191
4192
4193/**
4194 * Atomically sets a pointer to NULL, ordered.
4195 *
4196 * @param ppv Pointer to the pointer variable that should be set to NULL.
4197 *
4198 * @remarks This is relatively type safe on GCC platforms.
4199 */
4200#if RT_GNUC_PREREQ(4, 2)
4201# define ASMAtomicWriteNullPtr(ppv) \
4202 do \
4203 { \
4204 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4205 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4206 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4207 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4208 } while (0)
4209#else
4210# define ASMAtomicWriteNullPtr(ppv) \
4211 do \
4212 { \
4213 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4214 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4215 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4216 } while (0)
4217#endif
4218
4219
4220/**
4221 * Atomically writes a pointer value, unordered.
4222 *
4223 * @returns Current *pv value
4224 * @param ppv Pointer to the pointer variable.
4225 * @param pv The pointer value to assign to *ppv. If NULL use
4226 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4227 *
4228 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4229 * NULL.
4230 */
4231#if RT_GNUC_PREREQ(4, 2)
4232# define ASMAtomicUoWritePtr(ppv, pv) \
4233 do \
4234 { \
4235 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4236 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4237 \
4238 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4239 AssertCompile(sizeof(pv) == sizeof(void *)); \
4240 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4241 \
4242 *(ppvTypeChecked) = pvTypeChecked; \
4243 } while (0)
4244#else
4245# define ASMAtomicUoWritePtr(ppv, pv) \
4246 do \
4247 { \
4248 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4249 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4250 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4251 *(ppv) = pv; \
4252 } while (0)
4253#endif
4254
4255
/**
 * Atomically sets a pointer to NULL, unordered.
 *
 * Statement macro: compile-time checks that the target is pointer sized,
 * asserts natural alignment, then performs a plain assignment of NULL.  The
 * argument is parenthesised wherever it is used so that compound expressions
 * are handled correctly.
 *
 * @param   ppv     Pointer to the pointer variable that should be set to NULL.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks @a ppv is expanded more than once; avoid arguments with side
 *          effects.
 */
#ifdef __GNUC__
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        AssertCompile(sizeof(*(ppv)) == sizeof(void *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppvTypeChecked) = NULL; \
    } while (0)
#else
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppv) = NULL; \
    } while (0)
#endif
4281
4282
4283/**
4284 * Atomically write a typical IPRT handle value, ordered.
4285 *
4286 * @param ph Pointer to the variable to update.
4287 * @param hNew The value to assign to *ph.
4288 *
4289 * @remarks This doesn't currently work for all handles (like RTFILE).
4290 */
4291#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4292# define ASMAtomicWriteHandle(ph, hNew) \
4293 do { \
4294 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4295 ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
4296 } while (0)
4297#elif HC_ARCH_BITS == 64
4298# define ASMAtomicWriteHandle(ph, hNew) \
4299 do { \
4300 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4301 ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
4302 } while (0)
4303#else
4304# error HC_ARCH_BITS
4305#endif
4306
4307
4308/**
4309 * Atomically write a typical IPRT handle value, unordered.
4310 *
4311 * @param ph Pointer to the variable to update.
4312 * @param hNew The value to assign to *ph.
4313 *
4314 * @remarks This doesn't currently work for all handles (like RTFILE).
4315 */
4316#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4317# define ASMAtomicUoWriteHandle(ph, hNew) \
4318 do { \
4319 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4320 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4321 } while (0)
4322#elif HC_ARCH_BITS == 64
4323# define ASMAtomicUoWriteHandle(ph, hNew) \
4324 do { \
4325 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4326 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4327 } while (0)
4328#else
4329# error HC_ARCH_BITS
4330#endif
4331
4332
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit ordered writer; any
 * other size triggers an assertion.  The size is cast to int for the message
 * so that the argument type matches the %d conversion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4350
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit unordered writer; any
 * other size triggers an assertion.  The assertion message names this macro,
 * and the size is cast to int so that the argument type matches the %d
 * conversion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicUoWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicUoWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicUoWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4368
4369
4370
4371/**
4372 * Atomically exchanges and adds to a 16-bit value, ordered.
4373 *
4374 * @returns The old value.
4375 * @param pu16 Pointer to the value.
4376 * @param u16 Number to add.
4377 *
4378 * @remarks Currently not implemented, just to make 16-bit code happy.
4379 * @remarks x86: Requires a 486 or later.
4380 */
4381RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
4382
4383
4384/**
4385 * Atomically exchanges and adds to a 32-bit value, ordered.
4386 *
4387 * @returns The old value.
4388 * @param pu32 Pointer to the value.
4389 * @param u32 Number to add.
4390 *
4391 * @remarks x86: Requires a 486 or later.
4392 */
4393#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4394RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
4395#else
4396DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4397{
4398# if RT_INLINE_ASM_USES_INTRIN
4399 u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
4400 return u32;
4401
4402# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
4403# if RT_INLINE_ASM_GNU_STYLE
4404 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4405 : "=r" (u32)
4406 , "=m" (*pu32)
4407 : "0" (u32)
4408 , "m" (*pu32)
4409 : "memory"
4410 , "cc");
4411 return u32;
4412# else
4413 __asm
4414 {
4415 mov eax, [u32]
4416# ifdef RT_ARCH_AMD64
4417 mov rdx, [pu32]
4418 lock xadd [rdx], eax
4419# else
4420 mov edx, [pu32]
4421 lock xadd [edx], eax
4422# endif
4423 mov [u32], eax
4424 }
4425 return u32;
4426# endif
4427
4428# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4429 /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
4430# if defined(RTASM_ARM64_USE_FEAT_LSE)
4431 uint32_t u32OldRet;
4432 __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
4433# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4434 "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
4435# else
4436 RTASM_ARM_DMB_SY
4437 "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
4438# endif
4439 : [pMem] "+Q" (*pu32)
4440 , [uOldActual] "=&r" (u32OldRet)
4441 : [uAddend] "r" (u32)
4442 : );
4443# else
4444 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
4445 "add %w[uNew], %w[uOld], %w[uVal]\n\t",
4446 "add %[uNew], %[uOld], %[uVal]\n\t",
4447 [uVal] "r" (u32));
4448# endif
4449 return u32OldRet;
4450
4451# else
4452# error "Port me"
4453# endif
4454}
4455#endif
4456
4457
4458/**
4459 * Atomically exchanges and adds to a signed 32-bit value, ordered.
4460 *
4461 * @returns The old value.
4462 * @param pi32 Pointer to the value.
4463 * @param i32 Number to add.
4464 *
4465 * @remarks x86: Requires a 486 or later.
4466 */
4467DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4468{
4469 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
4470}
4471
4472
4473/**
4474 * Atomically exchanges and adds to a 64-bit value, ordered.
4475 *
4476 * @returns The old value.
4477 * @param pu64 Pointer to the value.
4478 * @param u64 Number to add.
4479 *
4480 * @remarks x86: Requires a Pentium or later.
4481 */
4482#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4483DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
4484#else
4485DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4486{
4487# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
4488 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
4489 return u64;
4490
4491# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4492 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
4493 : "=r" (u64)
4494 , "=m" (*pu64)
4495 : "0" (u64)
4496 , "m" (*pu64)
4497 : "memory"
4498 , "cc");
4499 return u64;
4500
4501# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4502# if defined(RTASM_ARM64_USE_FEAT_LSE)
4503 uint64_t u64OldRet;
4504 __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
4505# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4506 "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
4507# else
4508 RTASM_ARM_DMB_SY
4509 "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
4510# endif
4511 : [pMem] "+Q" (*pu64)
4512 , [uOldActual] "=&r" (u64OldRet)
4513 : [uAddend] "r" (u64)
4514 : );
4515# else
4516 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
4517 "add %[uNew], %[uOld], %[uVal]\n\t"
4518 ,
4519 "add %[uNew], %[uOld], %[uVal]\n\t"
4520 "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
4521 [uVal] "r" (u64));
4522# endif
4523 return u64OldRet;
4524
4525# else
4526 uint64_t u64Old;
4527 for (;;)
4528 {
4529 uint64_t u64New;
4530 u64Old = ASMAtomicUoReadU64(pu64);
4531 u64New = u64Old + u64;
4532 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
4533 break;
4534 ASMNopPause();
4535 }
4536 return u64Old;
4537# endif
4538}
4539#endif
4540
4541
4542/**
4543 * Atomically exchanges and adds to a signed 64-bit value, ordered.
4544 *
4545 * @returns The old value.
4546 * @param pi64 Pointer to the value.
4547 * @param i64 Number to add.
4548 *
4549 * @remarks x86: Requires a Pentium or later.
4550 */
4551DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4552{
4553 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
4554}
4555
4556
4557/**
4558 * Atomically exchanges and adds to a size_t value, ordered.
4559 *
4560 * @returns The old value.
4561 * @param pcb Pointer to the size_t value.
4562 * @param cb Number to add.
4563 */
4564DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4565{
4566#if ARCH_BITS == 64
4567 AssertCompileSize(size_t, 8);
4568 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
4569#elif ARCH_BITS == 32
4570 AssertCompileSize(size_t, 4);
4571 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
4572#elif ARCH_BITS == 16
4573 AssertCompileSize(size_t, 2);
4574 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
4575#else
4576# error "Unsupported ARCH_BITS value"
4577#endif
4578}
4579
4580
/**
 * Atomically exchanges and adds a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 32-bit or 64-bit adder; any other size
 * triggers an assertion.  The size is cast to int for the message so that the
 * argument type matches the %d conversion, and the @a puOld casts carry
 * RT_FAR like those in ASMAtomicSubSize.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to add to *pu.
 * @param   puOld   Where to store the old value.
 */
#define ASMAtomicAddSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4597
4598
4599
4600/**
4601 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
4602 *
4603 * @returns The old value.
4604 * @param pu16 Pointer to the value.
4605 * @param u16 Number to subtract.
4606 *
4607 * @remarks x86: Requires a 486 or later.
4608 */
4609DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
4610{
4611 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
4612}
4613
4614
4615/**
4616 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
4617 *
4618 * @returns The old value.
4619 * @param pi16 Pointer to the value.
4620 * @param i16 Number to subtract.
4621 *
4622 * @remarks x86: Requires a 486 or later.
4623 */
4624DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4625{
4626 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
4627}
4628
4629
4630/**
4631 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
4632 *
4633 * @returns The old value.
4634 * @param pu32 Pointer to the value.
4635 * @param u32 Number to subtract.
4636 *
4637 * @remarks x86: Requires a 486 or later.
4638 */
4639DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4640{
4641 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
4642}
4643
4644
4645/**
4646 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
4647 *
4648 * @returns The old value.
4649 * @param pi32 Pointer to the value.
4650 * @param i32 Number to subtract.
4651 *
4652 * @remarks x86: Requires a 486 or later.
4653 */
4654DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4655{
4656 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
4657}
4658
4659
4660/**
4661 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
4662 *
4663 * @returns The old value.
4664 * @param pu64 Pointer to the value.
4665 * @param u64 Number to subtract.
4666 *
4667 * @remarks x86: Requires a Pentium or later.
4668 */
4669DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4670{
4671 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
4672}
4673
4674
4675/**
4676 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
4677 *
4678 * @returns The old value.
4679 * @param pi64 Pointer to the value.
4680 * @param i64 Number to subtract.
4681 *
4682 * @remarks x86: Requires a Pentium or later.
4683 */
4684DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4685{
4686 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
4687}
4688
4689
4690/**
4691 * Atomically exchanges and subtracts to a size_t value, ordered.
4692 *
4693 * @returns The old value.
4694 * @param pcb Pointer to the size_t value.
4695 * @param cb Number to subtract.
4696 *
4697 * @remarks x86: Requires a 486 or later.
4698 */
4699DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4700{
4701#if ARCH_BITS == 64
4702 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
4703#elif ARCH_BITS == 32
4704 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
4705#elif ARCH_BITS == 16
4706 AssertCompileSize(size_t, 2);
4707 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
4708#else
4709# error "Unsupported ARCH_BITS value"
4710#endif
4711}
4712
4713
/**
 * Atomically exchanges and subtracts a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 32-bit or 64-bit subtractor; any other
 * size triggers an assertion.  The size is cast to int for the message so
 * that the argument type matches the %d conversion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to subtract to *pu.
 * @param   puOld   Where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicSubSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4732
4733
4734
4735/**
4736 * Atomically increment a 16-bit value, ordered.
4737 *
4738 * @returns The new value.
4739 * @param pu16 Pointer to the value to increment.
4740 * @remarks Not implemented. Just to make 16-bit code happy.
4741 *
4742 * @remarks x86: Requires a 486 or later.
4743 */
4744RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
4745
4746
4747/**
4748 * Atomically increment a 32-bit value, ordered.
4749 *
4750 * @returns The new value.
4751 * @param pu32 Pointer to the value to increment.
4752 *
4753 * @remarks x86: Requires a 486 or later.
4754 */
4755#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4756RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
4757#else
4758DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
4759{
4760# if RT_INLINE_ASM_USES_INTRIN
4761 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
4762
4763# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
4764# if RT_INLINE_ASM_GNU_STYLE
4765 uint32_t u32;
4766 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4767 : "=r" (u32)
4768 , "=m" (*pu32)
4769 : "0" (1)
4770 , "m" (*pu32)
4771 : "memory"
4772 , "cc");
4773 return u32+1;
4774# else
4775 __asm
4776 {
4777 mov eax, 1
4778# ifdef RT_ARCH_AMD64
4779 mov rdx, [pu32]
4780 lock xadd [rdx], eax
4781# else
4782 mov edx, [pu32]
4783 lock xadd [edx], eax
4784# endif
4785 mov u32, eax
4786 }
4787 return u32+1;
4788# endif
4789
4790# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4791 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
4792# if defined(RTASM_ARM64_USE_FEAT_LSE)
4793 uint32_t u32NewRet;
4794 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
4795# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4796 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4797# else
4798 RTASM_ARM_DMB_SY
4799 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4800# endif
4801 "add %w[uNewRet], %w[uNewRet], #1\n\t"
4802 : [pMem] "+Q" (*pu32)
4803 , [uNewRet] "=&r" (u32NewRet)
4804 : [uAddend] "r" ((uint32_t)1)
4805 : );
4806# else
4807 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
4808 "add %w[uNew], %w[uNew], #1\n\t",
4809 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
4810 "X" (0) /* dummy */);
4811# endif
4812 return u32NewRet;
4813
4814# else
4815 return ASMAtomicAddU32(pu32, 1) + 1;
4816# endif
4817}
4818#endif
4819
4820
4821/**
4822 * Atomically increment a signed 32-bit value, ordered.
4823 *
4824 * @returns The new value.
4825 * @param pi32 Pointer to the value to increment.
4826 *
4827 * @remarks x86: Requires a 486 or later.
4828 */
4829DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
4830{
4831 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
4832}
4833
4834
4835/**
4836 * Atomically increment a 64-bit value, ordered.
4837 *
4838 * @returns The new value.
4839 * @param pu64 Pointer to the value to increment.
4840 *
4841 * @remarks x86: Requires a Pentium or later.
4842 */
4843#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4844DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
4845#else
4846DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
4847{
4848# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
4849 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
4850
4851# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4852 uint64_t u64;
4853 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
4854 : "=r" (u64)
4855 , "=m" (*pu64)
4856 : "0" (1)
4857 , "m" (*pu64)
4858 : "memory"
4859 , "cc");
4860 return u64 + 1;
4861
4862# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4863# if defined(RTASM_ARM64_USE_FEAT_LSE)
4864 uint64_t u64NewRet;
4865 __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
4866# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4867 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
4868# else
4869 RTASM_ARM_DMB_SY
4870 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
4871# endif
4872 "add %[uNewRet], %[uNewRet], #1\n\t"
4873 : [pMem] "+Q" (*pu64)
4874 , [uNewRet] "=&r" (u64NewRet)
4875 : [uAddend] "r" ((uint64_t)1)
4876 : );
4877# else
4878 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
4879 "add %[uNew], %[uNew], #1\n\t"
4880 ,
4881 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
4882 "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
4883 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
4884# endif
4885 return u64NewRet;
4886
4887# else
4888 return ASMAtomicAddU64(pu64, 1) + 1;
4889# endif
4890}
4891#endif
4892
4893
4894/**
4895 * Atomically increment a signed 64-bit value, ordered.
4896 *
4897 * @returns The new value.
4898 * @param pi64 Pointer to the value to increment.
4899 *
4900 * @remarks x86: Requires a Pentium or later.
4901 */
4902DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
4903{
4904 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
4905}
4906
4907
4908/**
4909 * Atomically increment a size_t value, ordered.
4910 *
4911 * @returns The new value.
4912 * @param pcb Pointer to the value to increment.
4913 *
4914 * @remarks x86: Requires a 486 or later.
4915 */
4916DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
4917{
4918#if ARCH_BITS == 64
4919 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
4920#elif ARCH_BITS == 32
4921 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
4922#elif ARCH_BITS == 16
4923 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
4924#else
4925# error "Unsupported ARCH_BITS value"
4926#endif
4927}
4928
4929
4930
/**
 * Atomically decrement an unsigned 16-bit value, ordered.
 *
 * @returns The new value (the prototype returns it as a uint32_t).
 * @param   pu16    Pointer to the 16-bit value to decrement.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
4941
4942
4943/**
4944 * Atomically decrement an unsigned 32-bit value, ordered.
4945 *
4946 * @returns The new value.
4947 * @param pu32 Pointer to the value to decrement.
4948 *
4949 * @remarks x86: Requires a 486 or later.
4950 */
4951#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4952RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
4953#else
4954DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
4955{
4956# if RT_INLINE_ASM_USES_INTRIN
4957 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
4958
4959# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4960# if RT_INLINE_ASM_GNU_STYLE
4961 uint32_t u32;
4962 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4963 : "=r" (u32)
4964 , "=m" (*pu32)
4965 : "0" (-1)
4966 , "m" (*pu32)
4967 : "memory"
4968 , "cc");
4969 return u32-1;
4970# else
4971 uint32_t u32;
4972 __asm
4973 {
4974 mov eax, -1
4975# ifdef RT_ARCH_AMD64
4976 mov rdx, [pu32]
4977 lock xadd [rdx], eax
4978# else
4979 mov edx, [pu32]
4980 lock xadd [edx], eax
4981# endif
4982 mov u32, eax
4983 }
4984 return u32-1;
4985# endif
4986
4987# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4988 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
4989# if defined(RTASM_ARM64_USE_FEAT_LSE)
4990 uint32_t u32NewRet;
4991 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
4992# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4993 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4994# else
4995 RTASM_ARM_DMB_SY
4996 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4997# endif
4998 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
4999 : [pMem] "+Q" (*pu32)
5000 , [uNewRet] "=&r" (u32NewRet)
5001 : [uAddend] "r" (~(uint32_t)0)
5002 : );
5003# else
5004 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5005 "sub %w[uNew], %w[uNew], #1\n\t",
5006 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5007 "X" (0) /* dummy */);
5008# endif
5009 return u32NewRet;
5010
5011# else
5012 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5013# endif
5014}
5015#endif
5016
5017
5018/**
5019 * Atomically decrement a signed 32-bit value, ordered.
5020 *
5021 * @returns The new value.
5022 * @param pi32 Pointer to the value to decrement.
5023 *
5024 * @remarks x86: Requires a 486 or later.
5025 */
5026DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5027{
5028 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5029}
5030
5031
/**
 * Atomically decrement an unsigned 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic (AMD64), inline assembly for
 * AMD64 or ARM, or a generic fallback built on ASMAtomicAddU64().
 *
 * @returns The new value.
 * @param   pu64    Pointer to the value to decrement.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* xadd with all-ones (-1) leaves the previous memory value in the
       register, so the new value is that minus one. */
    uint64_t u64;
    __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
                         : "=r" (u64)
                         , "=m" (*pu64)
                         : "0" (~(uint64_t)0)
                         , "m" (*pu64)
                         : "memory"
                         , "cc");
    return u64-1;

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The addend is all-ones (i.e. -1); ldadd hands back the old value, which
       the trailing sub converts into the new value. */
    uint64_t u64NewRet;
    __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
#   endif
                         "sub %[uNewRet], %[uNewRet], #1\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uNewRet] "=&r" (u64NewRet)
                         : [uAddend] "r" (~(uint64_t)0)
                         : );
#  else
    /* First snippet is the 64-bit variant; the second is the 32-bit ARM
       variant working on the low and high (%H) halves separately. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
                                           "sub %[uNew], %[uNew], #1\n\t"
                                           ,
                                           "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
                                           "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
                                           RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
#  endif
    return u64NewRet;

# else
    /* Generic fallback: adding UINT64_MAX is the same as subtracting one
       (modulo 2^64); the add worker's result minus one is the new value. */
    return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
# endif
}
#endif
5089
5090
5091/**
5092 * Atomically decrement a signed 64-bit value, ordered.
5093 *
5094 * @returns The new value.
5095 * @param pi64 Pointer to the value to decrement.
5096 *
5097 * @remarks x86: Requires a Pentium or later.
5098 */
5099DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5100{
5101 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5102}
5103
5104
5105/**
5106 * Atomically decrement a size_t value, ordered.
5107 *
5108 * @returns The new value.
5109 * @param pcb Pointer to the value to decrement.
5110 *
5111 * @remarks x86: Requires a 486 or later.
5112 */
5113DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5114{
5115#if ARCH_BITS == 64
5116 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5117#elif ARCH_BITS == 32
5118 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5119#elif ARCH_BITS == 16
5120 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5121#else
5122# error "Unsupported ARCH_BITS value"
5123#endif
5124}
5125
5126
/**
 * Atomically OR an unsigned 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic, or inline assembly for
 * x86/AMD64 or ARM.  Other architectures are a build error.
 *
 * @param   pu32    Pointer to the variable to OR @a u32 with.
 * @param   u32     The value to OR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* *pu32 is listed both as output and as input since the instruction
       reads and writes it. */
    __asm__ __volatile__("lock; orl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock or [rdx], eax
#   else
        mov     edx, [pu32]
        lock or [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* The old value is loaded into u32Spill but not used afterwards. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToSet] "r" (u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %w[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToSet] "r" (u32)
                         : );
#   endif
#  else
    /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
                                           "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "orr %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5195
5196
5197/**
5198 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
5199 * fallback).
5200 *
5201 * @returns Old value.
5202 * @param pu32 Pointer to the variable to OR @a u32 with.
5203 * @param u32 The value to OR @a *pu32 with.
5204 */
5205DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5206{
5207#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5208# if defined(RTASM_ARM64_USE_FEAT_LSE)
5209 uint32_t u32OldRet;
5210 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5211# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5212 "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5213# else
5214 RTASM_ARM_DMB_SY
5215 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5216# endif
5217 : [pMem] "+Q" (*pu32)
5218 , [uOldRet] "=&r" (u32OldRet)
5219 : [fBitsToSet] "r" (u32)
5220 : );
5221# else
5222 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
5223 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5224 "orr %[uNew], %[uOld], %[uVal]\n\t",
5225 [uVal] "r" (u32));
5226# endif
5227 return u32OldRet;
5228
5229#else
5230 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5231 uint32_t u32New;
5232 do
5233 u32New = u32RetOld | u32;
5234 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5235 return u32RetOld;
5236#endif
5237}
5238
5239
5240/**
5241 * Atomically Or a signed 32-bit value, ordered.
5242 *
5243 * @param pi32 Pointer to the pointer variable to OR u32 with.
5244 * @param i32 The value to OR *pu32 with.
5245 *
5246 * @remarks x86: Requires a 386 or later.
5247 */
5248DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5249{
5250 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5251}
5252
5253
/**
 * Atomically OR an unsigned 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic (AMD64), inline assembly for
 * AMD64 or ARM, or a generic compare-exchange retry loop.
 *
 * @param   pu64    Pointer to the variable to OR @a u64 with.
 * @param   u64     The value to OR @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* *pu64 is listed both as output and as input since the instruction
       reads and writes it. */
    __asm__ __volatile__("lock; orq %1, %q0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* The old value is loaded into u64Spill but not used afterwards. */
    uint64_t u64Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
                         "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uSpill] "=&r" (u64Spill)
                         : [fBitsToSet] "r" (u64)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToSet] "r" (u64)
                         : );
#   endif
#  else
    /* First snippet is the 64-bit variant; the second is the 32-bit ARM
       variant working on the low and high (%H) halves separately. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Generic fallback: re-read and retry until the compare-exchange succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old | u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5316
5317
5318/**
5319 * Atomically Or a signed 64-bit value, ordered.
5320 *
5321 * @param pi64 Pointer to the pointer variable to OR u64 with.
5322 * @param i64 The value to OR *pu64 with.
5323 *
5324 * @remarks x86: Requires a Pentium or later.
5325 */
5326DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5327{
5328 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5329}
5330
5331
/**
 * Atomically AND an unsigned 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic, or inline assembly for
 * x86/AMD64 or ARM.  Other architectures are a build error.
 *
 * @param   pu32    Pointer to the variable to AND @a u32 with.
 * @param   u32     The value to AND @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    _InterlockedAnd((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* *pu32 is listed both as output and as input since the instruction
       reads and writes it. */
    __asm__ __volatile__("lock; andl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock and [rdx], eax
#   else
        mov     edx, [pu32]
        lock and [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The LSE clear instructions take the bits to clear, hence the inverted
       mask (~u32) to get AND semantics. */
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stclr %w[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   endif
#  else
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
                                           "and %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "and %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5399
5400
5401/**
5402 * Atomically AND an unsigned 32-bit value, ordered, extended version.
5403 *
5404 * @returns Old value.
5405 * @param pu32 Pointer to the variable to AND @a u32 with.
5406 * @param u32 The value to AND @a *pu32 with.
5407 */
5408DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5409{
5410#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5411# if defined(RTASM_ARM64_USE_FEAT_LSE)
5412 uint32_t u32OldRet;
5413 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5414# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5415 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5416# else
5417 RTASM_ARM_DMB_SY
5418 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5419# endif
5420 : [pMem] "+Q" (*pu32)
5421 , [uOldRet] "=&r" (u32OldRet)
5422 : [fBitsToClear] "r" (~u32)
5423 : );
5424# else
5425 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
5426 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5427 "and %[uNew], %[uOld], %[uVal]\n\t",
5428 [uVal] "r" (u32));
5429# endif
5430 return u32OldRet;
5431
5432#else
5433 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5434 uint32_t u32New;
5435 do
5436 u32New = u32RetOld & u32;
5437 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5438 return u32RetOld;
5439#endif
5440}
5441
5442
5443/**
5444 * Atomically And a signed 32-bit value, ordered.
5445 *
5446 * @param pi32 Pointer to the pointer variable to AND i32 with.
5447 * @param i32 The value to AND *pi32 with.
5448 *
5449 * @remarks x86: Requires a 386 or later.
5450 */
5451DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5452{
5453 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5454}
5455
5456
/**
 * Atomically AND an unsigned 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic (AMD64), inline assembly for
 * AMD64 or ARM, or a generic compare-exchange retry loop.
 *
 * @param   pu64    Pointer to the variable to AND @a u64 with.
 * @param   u64     The value to AND @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* *pu64 is listed both as output and as input since the instruction
       reads and writes it. */
    __asm__ __volatile__("lock; andq %1, %0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The LSE clear instructions take the bits to clear, hence the inverted
       mask (~u64) to get AND semantics. */
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    uint64_t u64Spill;
    __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
                         "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uSpill] "=&r" (u64Spill)
                         : [fBitsToClear] "r" (~u64)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stclr %[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToClear] "r" (~u64)
                         : );
#   endif
#  else
    /* First snippet is the 64-bit variant; the second is the 32-bit ARM
       variant working on the low and high (%H) halves separately. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           "and %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Generic fallback: re-read and retry until the compare-exchange succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old & u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5519
5520
5521/**
5522 * Atomically And a signed 64-bit value, ordered.
5523 *
5524 * @param pi64 Pointer to the pointer variable to AND i64 with.
5525 * @param i64 The value to AND *pi64 with.
5526 *
5527 * @remarks x86: Requires a Pentium or later.
5528 */
5529DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5530{
5531 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5532}
5533
5534
/**
 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic, or inline assembly for
 * x86/AMD64 or ARM.  Other architectures are a build error.
 *
 * @param   pu32    Pointer to the variable to XOR @a u32 with.
 * @param   u32     The value to XOR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    _InterlockedXor((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* *pu32 is listed both as output and as input since the instruction
       reads and writes it. */
    __asm__ __volatile__("lock; xorl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock xor [rdx], eax
#   else
        mov     edx, [pu32]
        lock xor [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* The old value is loaded into u32Spill but not used afterwards. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
                         "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitMask] "r" (u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "steor %w[fBitMask], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitMask] "r" (u32)
                         : );
#   endif
#  else
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
                                           "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "eor %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5602
5603
5604/**
5605 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
5606 * extended version (for bitmaps).
5607 *
5608 * @returns Old value.
5609 * @param pu32 Pointer to the variable to XOR @a u32 with.
5610 * @param u32 The value to XOR @a *pu32 with.
5611 */
5612DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5613{
5614#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5615# if defined(RTASM_ARM64_USE_FEAT_LSE)
5616 uint32_t u32OldRet;
5617 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
5618# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5619 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5620# else
5621 RTASM_ARM_DMB_SY
5622 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5623# endif
5624 : [pMem] "+Q" (*pu32)
5625 , [uOldRet] "=&r" (u32OldRet)
5626 : [fBitMask] "r" (u32)
5627 : );
5628# else
5629 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
5630 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
5631 "eor %[uNew], %[uOld], %[uVal]\n\t",
5632 [uVal] "r" (u32));
5633# endif
5634 return u32OldRet;
5635
5636#else
5637 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5638 uint32_t u32New;
5639 do
5640 u32New = u32RetOld ^ u32;
5641 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5642 return u32RetOld;
5643#endif
5644}
5645
5646
5647/**
5648 * Atomically XOR a signed 32-bit value, ordered.
5649 *
5650 * @param pi32 Pointer to the variable to XOR i32 with.
5651 * @param i32 The value to XOR *pi32 with.
5652 *
5653 * @remarks x86: Requires a 386 or later.
5654 */
5655DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5656{
5657 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5658}
5659
5660
/**
 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
 *
 * Same operation as the ordered variant, but the x86 path omits the lock
 * prefix and the ARM paths omit the barrier (NO_BARRIER / plain stset).
 *
 * @param   pu32    Pointer to the variable to OR @a u32 with.
 * @param   u32     The value to OR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* No lock prefix: a single read-modify-write instruction. */
    __asm__ __volatile__("orl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        or      [rdx], eax
#   else
        mov     edx, [pu32]
        or      [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: stset=1974 vs non-lse=6271 */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
                         "stset %w[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToSet] "r" (u32)
                         : );
#  else
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
                                           "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "orr %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5715
5716
5717/**
5718 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
5719 * extended version (for bitmap fallback).
5720 *
5721 * @returns Old value.
5722 * @param pu32 Pointer to the variable to OR @a u32 with.
5723 * @param u32 The value to OR @a *pu32 with.
5724 */
5725DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5726{
5727#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5728# if defined(RTASM_ARM64_USE_FEAT_LSE)
5729 uint32_t u32OldRet;
5730 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5731 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5732 : [pMem] "+Q" (*pu32)
5733 , [uOldRet] "=&r" (u32OldRet)
5734 : [fBitsToSet] "r" (u32)
5735 : );
5736# else
5737 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
5738 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5739 "orr %[uNew], %[uOld], %[uVal]\n\t",
5740 [uVal] "r" (u32));
5741# endif
5742 return u32OldRet;
5743
5744#else
5745 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
5746#endif
5747}
5748
5749
5750/**
5751 * Atomically OR a signed 32-bit value, unordered.
5752 *
5753 * @param pi32 Pointer to the pointer variable to OR u32 with.
5754 * @param i32 The value to OR *pu32 with.
5755 *
5756 * @remarks x86: Requires a 386 or later.
5757 */
5758DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5759{
5760 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5761}
5762
5763
/**
 * Atomically OR an unsigned 64-bit value, unordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), inline assembly for AMD64 or ARM, or a generic
 * compare-exchange retry loop.
 *
 * @param   pu64    Pointer to the variable to OR @a u64 with.
 * @param   u64     The value to OR @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* No lock prefix: a single read-modify-write instruction. */
    __asm__ __volatile__("orq %1, %q0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
                         "stset %[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToSet] "r" (u64)
                         : );
#  else
    /* First snippet is the 64-bit variant; the second is the 32-bit ARM
       variant working on the low and high (%H) halves separately. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Generic fallback: re-read and retry until the compare-exchange succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old | u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5812
5813
5814/**
5815 * Atomically Or a signed 64-bit value, unordered.
5816 *
5817 * @param pi64 Pointer to the pointer variable to OR u64 with.
5818 * @param i64 The value to OR *pu64 with.
5819 *
5820 * @remarks x86: Requires a Pentium or later.
5821 */
5822DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5823{
5824 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5825}
5826
5827
/**
 * Atomically AND an unsigned 32-bit value, unordered.
 *
 * Same operation as the ordered variant, but the x86 path omits the lock
 * prefix and the ARM paths omit the barrier (NO_BARRIER / plain stclr).
 *
 * @param   pu32    Pointer to the variable to AND @a u32 with.
 * @param   u32     The value to AND @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* No lock prefix: a single read-modify-write instruction. */
    __asm__ __volatile__("andl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        and     [rdx], eax
#   else
        mov     edx, [pu32]
        and     [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* stclr takes the bits to clear, hence the inverted mask (~u32). */
    __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
                         "stclr %w[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToClear] "r" (~u32)
                         : );
#  else
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
                                           "and %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "and %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5882
5883
5884/**
5885 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
5886 * bitmap fallback).
5887 *
5888 * @returns Old value.
5889 * @param pu32 Pointer to the pointer to AND @a u32 with.
5890 * @param u32 The value to AND @a *pu32 with.
5891 */
5892DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5893{
5894#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5895# if defined(RTASM_ARM64_USE_FEAT_LSE)
5896 uint32_t u32OldRet;
5897 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5898 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5899 : [pMem] "+Q" (*pu32)
5900 , [uOldRet] "=&r" (u32OldRet)
5901 : [fBitsToClear] "r" (~u32)
5902 : );
5903# else
5904 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
5905 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5906 "and %[uNew], %[uOld], %[uVal]\n\t",
5907 [uVal] "r" (u32));
5908# endif
5909 return u32OldRet;
5910
5911#else
5912 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
5913#endif
5914}
5915
5916
5917/**
5918 * Atomically And a signed 32-bit value, unordered.
5919 *
5920 * @param pi32 Pointer to the pointer variable to AND i32 with.
5921 * @param i32 The value to AND *pi32 with.
5922 *
5923 * @remarks x86: Requires a 386 or later.
5924 */
5925DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5926{
5927 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5928}
5929
5930
/**
 * Atomically AND an unsigned 64-bit value, unordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), inline assembly for AMD64 or ARM, or a generic
 * compare-exchange retry loop.
 *
 * @param   pu64    Pointer to the variable to AND @a u64 with.
 * @param   u64     The value to AND @a *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* No lock prefix: a single read-modify-write instruction. */
    __asm__ __volatile__("andq %1, %0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* stclr takes the bits to clear, hence the inverted mask (~u64). */
    __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
                         "stclr %[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToClear] "r" (~u64)
                         : );
#  else
    /* First snippet is the 64-bit variant; the second is the 32-bit ARM
       variant working on the low and high (%H) halves separately. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "and %[uNew], %[uNew], %[uVal]\n\t"
                                           "and %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Generic fallback: re-read and retry until the compare-exchange succeeds. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old & u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5979
5980
5981/**
5982 * Atomically And a signed 64-bit value, unordered.
5983 *
5984 * @param pi64 Pointer to the pointer variable to AND i64 with.
5985 * @param i64 The value to AND *pi64 with.
5986 *
5987 * @remarks x86: Requires a Pentium or later.
5988 */
5989DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5990{
5991 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5992}
5993
5994
5995/**
5996 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe.
5997 *
5998 * @param pu32 Pointer to the variable to XOR @a u32 with.
5999 * @param u32 The value to OR @a *pu32 with.
6000 *
6001 * @remarks x86: Requires a 386 or later.
6002 */
6003#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6004RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6005#else
6006DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6007{
6008# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6009# if RT_INLINE_ASM_GNU_STYLE
6010 __asm__ __volatile__("xorl %1, %0\n\t"
6011 : "=m" (*pu32)
6012 : "ir" (u32)
6013 , "m" (*pu32)
6014 : "cc");
6015# else
6016 __asm
6017 {
6018 mov eax, [u32]
6019# ifdef RT_ARCH_AMD64
6020 mov rdx, [pu32]
6021 xor [rdx], eax
6022# else
6023 mov edx, [pu32]
6024 xor [edx], eax
6025# endif
6026 }
6027# endif
6028
6029# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6030# if defined(RTASM_ARM64_USE_FEAT_LSE)
6031 __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
6032 "steor %w[fBitMask], %[pMem]\n\t"
6033 : [pMem] "+Q" (*pu32)
6034 : [fBitMask] "r" (u32)
6035 : );
6036# else
6037 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
6038 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6039 "eor %[uNew], %[uNew], %[uVal]\n\t",
6040 [uVal] "r" (u32));
6041# endif
6042
6043# else
6044# error "Port me"
6045# endif
6046}
6047#endif
6048
6049
6050/**
6051 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe,
6052 * extended version (for bitmap fallback).
6053 *
6054 * @returns Old value.
6055 * @param pu32 Pointer to the variable to XOR @a u32 with.
6056 * @param u32 The value to OR @a *pu32 with.
6057 */
6058DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6059{
6060#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6061# if defined(RTASM_ARM64_USE_FEAT_LSE)
6062 uint32_t u32OldRet;
6063 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6064 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6065 : [pMem] "+Q" (*pu32)
6066 , [uOldRet] "=&r" (u32OldRet)
6067 : [fBitMask] "r" (u32)
6068 : );
6069# else
6070 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6071 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6072 "eor %[uNew], %[uOld], %[uVal]\n\t",
6073 [uVal] "r" (u32));
6074# endif
6075 return u32OldRet;
6076
6077#else
6078 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6079#endif
6080}
6081
6082
6083/**
6084 * Atomically XOR a signed 32-bit value, unordered.
6085 *
6086 * @param pi32 Pointer to the variable to XOR @a u32 with.
6087 * @param i32 The value to XOR @a *pu32 with.
6088 *
6089 * @remarks x86: Requires a 386 or later.
6090 */
6091DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6092{
6093 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6094}
6095
6096
6097/**
6098 * Atomically increment an unsigned 32-bit value, unordered.
6099 *
6100 * @returns the new value.
6101 * @param pu32 Pointer to the variable to increment.
6102 *
6103 * @remarks x86: Requires a 486 or later.
6104 */
6105#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6106RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6107#else
6108DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6109{
6110# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6111 uint32_t u32;
6112# if RT_INLINE_ASM_GNU_STYLE
6113 __asm__ __volatile__("xaddl %0, %1\n\t"
6114 : "=r" (u32)
6115 , "=m" (*pu32)
6116 : "0" (1)
6117 , "m" (*pu32)
6118 : "memory" /** @todo why 'memory'? */
6119 , "cc");
6120 return u32 + 1;
6121# else
6122 __asm
6123 {
6124 mov eax, 1
6125# ifdef RT_ARCH_AMD64
6126 mov rdx, [pu32]
6127 xadd [rdx], eax
6128# else
6129 mov edx, [pu32]
6130 xadd [edx], eax
6131# endif
6132 mov u32, eax
6133 }
6134 return u32 + 1;
6135# endif
6136
6137# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6138 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6139# if defined(RTASM_ARM64_USE_FEAT_LSE)
6140 uint32_t u32NewRet;
6141 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6142 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6143 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6144 : [pMem] "+Q" (*pu32)
6145 , [uNewRet] "=&r" (u32NewRet)
6146 : [uAddend] "r" ((uint32_t)1)
6147 : );
6148# else
6149 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6150 "add %w[uNew], %w[uNew], #1\n\t",
6151 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6152 "X" (0) /* dummy */);
6153# endif
6154 return u32NewRet;
6155
6156# else
6157# error "Port me"
6158# endif
6159}
6160#endif
6161
6162
6163/**
6164 * Atomically decrement an unsigned 32-bit value, unordered.
6165 *
6166 * @returns the new value.
6167 * @param pu32 Pointer to the variable to decrement.
6168 *
6169 * @remarks x86: Requires a 486 or later.
6170 */
6171#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6172RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6173#else
6174DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6175{
6176# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6177 uint32_t u32;
6178# if RT_INLINE_ASM_GNU_STYLE
6179 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6180 : "=r" (u32)
6181 , "=m" (*pu32)
6182 : "0" (-1)
6183 , "m" (*pu32)
6184 : "memory"
6185 , "cc");
6186 return u32 - 1;
6187# else
6188 __asm
6189 {
6190 mov eax, -1
6191# ifdef RT_ARCH_AMD64
6192 mov rdx, [pu32]
6193 xadd [rdx], eax
6194# else
6195 mov edx, [pu32]
6196 xadd [edx], eax
6197# endif
6198 mov u32, eax
6199 }
6200 return u32 - 1;
6201# endif
6202
6203# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6204 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6205# if defined(RTASM_ARM64_USE_FEAT_LSE)
6206 uint32_t u32NewRet;
6207 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6208 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6209 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6210 : [pMem] "+Q" (*pu32)
6211 , [uNewRet] "=&r" (u32NewRet)
6212 : [uAddend] "r" (~(uint32_t)0)
6213 : );
6214# else
6215 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6216 "sub %w[uNew], %w[uNew], #1\n\t",
6217 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6218 "X" (0) /* dummy */);
6219# endif
6220 return u32NewRet;
6221
6222# else
6223# error "Port me"
6224# endif
6225}
6226#endif
6227
/** @todo Move ASMByteSwapU16, ASMByteSwapU32 and ASMByteSwapU64 into their own
 * header, as they are a common reason for including asm.h. */
6230
6231
6232/**
6233 * Reverse the byte order of the given 16-bit integer.
6234 *
6235 * @returns Revert
6236 * @param u16 16-bit integer value.
6237 */
6238#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6239RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6240#else
6241DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6242{
6243# if RT_INLINE_ASM_USES_INTRIN
6244 return _byteswap_ushort(u16);
6245
6246# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6247# if RT_INLINE_ASM_GNU_STYLE
6248 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6249# else
6250 _asm
6251 {
6252 mov ax, [u16]
6253 ror ax, 8
6254 mov [u16], ax
6255 }
6256# endif
6257 return u16;
6258
6259# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6260 uint32_t u32Ret;
6261 __asm__ __volatile__(
6262# if defined(RT_ARCH_ARM64)
6263 "rev16 %w[uRet], %w[uVal]\n\t"
6264# else
6265 "rev16 %[uRet], %[uVal]\n\t"
6266# endif
6267 : [uRet] "=r" (u32Ret)
6268 : [uVal] "r" (u16));
6269 return (uint16_t)u32Ret;
6270
6271# else
6272# error "Port me"
6273# endif
6274}
6275#endif
6276
6277
6278/**
6279 * Reverse the byte order of the given 32-bit integer.
6280 *
6281 * @returns Revert
6282 * @param u32 32-bit integer value.
6283 */
6284#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6285RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6286#else
6287DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6288{
6289# if RT_INLINE_ASM_USES_INTRIN
6290 return _byteswap_ulong(u32);
6291
6292# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6293