VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S

Last change on this file was 104346, checked in by vboxsync, 5 weeks ago

VMM/IEM: Fixed regression from r162777 that broke 8- and 16-bit wide ROL on arm64. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
1/* $Id: IEMAllAImpl-arm64.S 104346 2024-04-17 14:30:45Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, ARM64 variant.
4 */
5
6/*
7 * Copyright (C) 2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include <iprt/asmdefs-arm.h>
33#include <iprt/x86.h>
34
35#define IEM_AIMPL_FUNCTION_ALIGNMENT 0x20
36
37
38#if RT_CLANG_PREREQ(15, 0)
39 .arch_extension flagm /* not necessary */
40#else
41 /* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
42 For some reason the +crc makes cfinv work (with clang 12). 'flagm' isn't
43 recognized, nor is the 'fmi' mentioned in the error message for cfinv. 'flagm'
44 works for v15 and seems to be enabled by default. */
45# ifdef RT_OS_DARWIN
46 .cpu apple-a14+crc
47# else
48 .cpu cortex-a53+flagm
49# endif
50#endif
51
52
53.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
54 /*
55 * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
56 */
57 eor \regTmp, \regResult, \regResult, LSR #4
58 eor \regTmp, \regTmp, \regTmp, LSR #2
59 eor \regTmp, \regTmp, \regTmp, LSR #1
60 eor \regTmp, \regTmp, #1
61 bfi \regEfl, \regTmp, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
62.endm
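
/* Hedged reference sketch (not part of the build): the XOR-folding parity trick
   used by CALC_EFLAGS_PARITY above. Bit 0 of the folded value ends up as the
   XOR of bits 0..7 of the result, so inverting it gives x86 PF (set on even
   parity of the low byte). Function name and the PF bit constant are illustrative.

#include <stdint.h>

#define SKETCH_EFL_PF_BIT 2    // X86_EFL_PF_BIT

static uint32_t sketchCalcPf(uint32_t fEfl, uint32_t uResult)
{
    uint32_t uTmp = uResult ^ (uResult >> 4);
    uTmp ^= uTmp >> 2;
    uTmp ^= uTmp >> 1;                              // bit 0 = odd parity of the low byte
    uTmp ^= 1;                                      // PF is set for even parity
    fEfl &= ~(UINT32_C(1) << SKETCH_EFL_PF_BIT);
    fEfl |= (uTmp & 1) << SKETCH_EFL_PF_BIT;        // same effect as the bfi above
    return fEfl;
}

// E.g. sketchCalcPf(0, 0x03) sets PF (two bits set), sketchCalcPf(0, 0x07) does not.
*/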
63
64
65.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
66 /*
67 * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
68 */
69 eor \regTmp, \regLeft, \regRight
70 eor \regTmp, \regTmp, \regResult
71 lsr \regTmp, \regTmp, #X86_EFL_AF_BIT
72 bfi \regEfl, \regTmp, #X86_EFL_AF_BIT, #1 /* AF(4) = (w8 ^ w1 ^ w9 & X86_EFL_AF) >> X86_EFL_AF_BIT */
73.endm
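
/* Hedged reference sketch (not part of the build): the auxiliary-carry rule used
   by CALC_EFLAGS_AUX_CARRY above. For both addition and subtraction, the
   carry/borrow out of bit 3 is bit 4 of (left ^ right ^ result). Function name
   and the AF bit constant are illustrative.

#include <stdint.h>

#define SKETCH_EFL_AF_BIT 4    // X86_EFL_AF_BIT

static uint32_t sketchCalcAf(uint32_t fEfl, uint32_t uLeft, uint32_t uRight, uint32_t uResult)
{
    uint32_t const uTmp = (uLeft ^ uRight ^ uResult) >> SKETCH_EFL_AF_BIT;
    fEfl &= ~(UINT32_C(1) << SKETCH_EFL_AF_BIT);
    fEfl |= (uTmp & 1) << SKETCH_EFL_AF_BIT;
    return fEfl;
}

// E.g. 0x0f + 0x01 = 0x10: (0x0f ^ 0x01 ^ 0x10) has bit 4 set, so AF is set.
*/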
74
75.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
76 /*
77 * Translate the arm NZCV bits into corresponding EFLAGS bits.
78 */
79 .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
80#if 0
81 /* Maybe just a tiny bit slower than the next one. */
82 mrs \regTmp, NZCV /* [31] = N; [30] = Z; [29] = C; [28] = V */
83 .ifeq \fSkipFlags & X86_EFL_OF
84 lsr \regTmp, \regTmp, #28
85 bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
86 lsr \regTmp, \regTmp, #1
87 .else
88 lsr \regTmp, \regTmp, #29
89 .endif
90 eor \regTmp, \regTmp, #1 /* inverts the carry flag to x86 style. */
91 bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
92 lsr \regTmp, \regTmp, #1
93 bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
94#else
95 /* This seems to be the faster one... */
96 cfinv
97 mrs \regTmp, NZCV /* [31] = N; [30] = Z; [29] = C; [28] = V */
98 .ifeq (\fSkipFlags & X86_EFL_OF)
99 lsr \regTmp, \regTmp, #28
100 bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
101 lsr \regTmp, \regTmp, #1
102 .else
103 lsr \regTmp, \regTmp, #29
104 .endif
105 bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
106 lsr \regTmp, \regTmp, #1
107 bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
108#endif
109 .else
110 /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
111 .ifeq \fSkipFlags & X86_EFL_ZF
112 cset \regTmp, eq
113 bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
114 .endif
115 .ifeq \fSkipFlags & X86_EFL_CF
116 cset \regTmp, cc
117 bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
118 .endif
119 .ifeq \fSkipFlags & X86_EFL_OF
120 cset \regTmp, vs
121 bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
122 .endif
123 .ifeq \fSkipFlags & X86_EFL_SF
124 cset \regTmp, mi
125 bfi \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
126 .endif
127 .endif
128
129
130 /*
131 * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
132 */
133 eor \regTmp, \regResult, \regResult, LSR #4
134 eor \regTmp, \regTmp, \regTmp, LSR #2
135 eor \regTmp, \regTmp, \regTmp, LSR #1
136 eor \regTmp, \regTmp, #1
137 bfi \regEfl, \regTmp, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
138
139 /*
140 * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
141 */
142 eor \regTmp, \regLeft, \regRight
143 eor \regTmp, \regTmp, \regResult
144 lsr \regTmp, \regTmp, #X86_EFL_AF_BIT
145 bfi \regEfl, \regTmp, #X86_EFL_AF_BIT, #1 /* AF(4) = (w8 ^ w1 ^ w9 & X86_EFL_AF) >> X86_EFL_AF_BIT */
146
147 /* done */
148.endm
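
/* Hedged reference sketch (not part of the build): the NZCV-to-EFLAGS mapping
   CALC_EFLAGS performs for a SUBS result. ARM sets C=1 when there was no
   borrow while x86 sets CF=1 when there was one, hence the cfinv / eor #1 in
   the macro. Function name and hard-coded bit positions are illustrative.

#include <stdbool.h>
#include <stdint.h>

static uint32_t sketchNzcvToEfl(uint32_t fEfl, bool fN, bool fZ, bool fC, bool fV)
{
    fEfl &= ~((UINT32_C(1) << 0) | (UINT32_C(1) << 6) | (UINT32_C(1) << 7) | (UINT32_C(1) << 11));
    fEfl |= (uint32_t)!fC << 0;     // CF = inverted ARM carry (borrow)
    fEfl |= (uint32_t)fZ  << 6;     // ZF
    fEfl |= (uint32_t)fN  << 7;     // SF
    fEfl |= (uint32_t)fV  << 11;    // OF
    return fEfl;
}
*/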
149
150
151BEGINCODE
152
153
154
155/* Some sketches.
156
157// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
158BEGINPROC_HIDDEN iemAImpl_xchg_u8_locked
159 ldrb w2, [x1]
160 swpalb w2, w2, [x0]
161 strb w2, [x1]
162 ret
163
164// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
165BEGINPROC_HIDDEN iemAImpl_xchg_u16_locked
166 ldrh w2, [x1]
167 swpalh w2, w2, [x0]
168 strh w2, [x1]
169 ret
170
171// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
172// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));
173
174*/
175
176
177/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */
178
179/*
180 * The CMP instruction.
181 */
182
183/* uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc); */
184ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
185BEGINPROC_HIDDEN iemAImpl_sub_u8
186 .cfi_startproc
187 /* Do the subtraction. */
188 ldrb w8, [x1]
189 /*and w2, w2, #0xff - should not be necessary. */
190 subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
191 strb w9, [x1]
192 setf8 w9
193
194 /* Calculate EFLAGS (passed in and returned via x0). */
195 and w9, w9, #0xffff
196 CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF
197
198 /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
199 figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
200 eor w11, w8, w2 /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
201 eor w12, w8, w9
202 and w11, w12, w11
203 lsr w11, w11, #7
204 bfi w0, w11, #X86_EFL_OF_BIT, #1
205
206 ret
207 .cfi_endproc
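
/* Hedged reference sketch (not part of the build): the signed-overflow rule the
   8- and 16-bit SUB variants compute by hand, since the V bit produced by
   setf8/setf16 does not match x86 OF for subtraction. For r = dst - src,
   overflow occurred iff the operands had different signs and the result's sign
   differs from dst's. Names are illustrative.

#include <stdint.h>

static uint32_t sketchSubOf(uint32_t uDst, uint32_t uSrc, uint32_t uResult, unsigned cBits)
{
    return (((uDst ^ uSrc) & (uDst ^ uResult)) >> (cBits - 1)) & 1;
}

// E.g. 8-bit 0x80 - 0x01 = 0x7f overflows: ((0x80 ^ 0x01) & (0x80 ^ 0x7f)) >> 7 == 1.
*/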
208
209
210/* uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc); */
211ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
212BEGINPROC_HIDDEN iemAImpl_sub_u16
213 .cfi_startproc
214 /* Do the subtraction. */
215 ldrh w8, [x1]
216 /*and w2, w2, #0xffff - should not be necessary. */
217 subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
218 setf16 w9
219 strh w9, [x1]
220
221 /* Calculate EFLAGS (passed in and returned via x0). */
222 and w9, w9, #0xffff
223 CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF
224
225 /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
226 figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
227 eor w11, w8, w2 /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
228 eor w12, w8, w9
229 and w11, w12, w11
230 lsr w11, w11, #15
231 bfi w0, w11, #X86_EFL_OF_BIT, #1
232
233 ret
234 .cfi_endproc
235
236
237/* uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc); */
238ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
239BEGINPROC_HIDDEN iemAImpl_sub_u32
240 .cfi_startproc
241 /* Do the subtraction. */
242 ldr w8, [x1]
243 subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
244 str w9, [x1]
245
246 /* Calculate EFLAGS (passed in and returned via x0). */
247
248#if 0
249 /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
250#if 0 /* maybe just a tiny bit slower than the next one. */
251 mrs x11, NZCV /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
252 lsr w11, w11, #28
253 bfi w0, w11, #X86_EFL_OF_BIT, #1
254 lsr w11, w11, #1
255 eor w11, w11, #1 /* inverts the carry flag to x86 style. */
256 bfi w0, w11, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
257 lsr w11, w11, #1
258 bfi w0, w11, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
259#elif 1 /* seems the faster one... */
260 cfinv
261 mrs x11, NZCV /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
262 lsr w11, w11, #28
263 bfi w0, w11, #X86_EFL_OF_BIT, #1
264 lsr w11, w11, #1
265 bfi w0, w11, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
266 lsr w11, w11, #1
267 bfi w0, w11, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
268#else
269 cset w11, eq
270 bfi w0, w11, #X86_EFL_ZF_BIT, #1
271 cset w11, cc
272 bfi w0, w11, #X86_EFL_CF_BIT, #1
273 cset w11, vs
274 bfi w0, w11, #X86_EFL_OF_BIT, #1
275 cset w11, mi
276 bfi w0, w11, #X86_EFL_SF_BIT, #1
277#endif
278
279 /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
280 eor w11, w9, w9, LSR #4
281 eor w11, w11, w11, LSR #2
282 eor w11, w11, w11, LSR #1
283 eor w11, w11, #1
284 bfi w0, w11, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
285
286 /* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
287 eor w11, w8, w2
288 eor w11, w11, w9
289 lsr w11, w11, #X86_EFL_AF_BIT
290 bfi w0, w11, #X86_EFL_AF_BIT, #1 /* AF(4) = (w8 ^ w2 ^ w9 & X86_EFL_AF) >> X86_EFL_AF_BIT */
291#else
292 CALC_EFLAGS x0, x9, x8, x2, x11
293#endif
294
295 ret
296 .cfi_endproc
297
298
299/* uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc); */
300ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
301BEGINPROC_HIDDEN iemAImpl_sub_u64
302 .cfi_startproc
303 /* Do the subtraction. */
304 ldr x8, [x1]
305 subs x9, x8, x2 /* x9 = x8 (*puDst) - x2 (uSrc) */
306 str x9, [x1]
307
308 /* Calculate EFLAGS (passed in and returned via x0). */
309 CALC_EFLAGS x0, x9, x8, x2, x11
310
311 ret
312 .cfi_endproc
313
314
315
316/*
317 * Shift Left.
318 */
319
320/* uint32_t iemAImpl_shl_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
321/* uint32_t iemAImpl_shl_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
322/* uint32_t iemAImpl_shl_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
323.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
324ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
325BEGINPROC_HIDDEN \a_Name
326 .cfi_startproc
327
328 /* Do we need to shift anything at all? */
329 and w2, w2, #0x1f
330 cbz w2, 99f
331
332 /*
333 * Do the shifting
334 */
335 ldr\a_LdStSuff w8, [x1]
336.ifne \a_cBits < 32
337 lslv w9, w8, w2
338.else
339 lslv x9, x8, x2 /* use 64-bit registers here so we get CF for free. We know w2 != 0. */
340.endif
341 str\a_LdStSuff w9, [x1]
342
343 /*
344 * Calculate EFLAGS.
345 */
346 CALC_EFLAGS_PARITY w0, w9, w12
347
348.ifne \a_cBits < 32
349 setf\a_cBits w9 /* Sets NZ */
350.else
351 ands wzr, w9, w9 /* Sets NZ */
352.endif
353#if 1
354 mrs x11, NZCV
355 lsr w11, w11, #30 /* N=1; Z=0 */
356 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
357#else
358 cset x11, eq
359 bfi w0, w11, X86_EFL_ZF_BIT, 1
360 cset x12, pl
361 bfi w0, w12, X86_EFL_SF_BIT, 1
362#endif
363
364.ifne \a_cBits < 32
365 bfxil w0, w9, #\a_cBits, #1 /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */
366.else
367 bfxil x0, x9, #\a_cBits, #1 /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */
368.endif
369
370.ifne \a_fIntelFlags
371 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
372 eor w11, w8, w8, LSL #1
373 lsr w11, w11, #(\a_cBits - 1)
374 bfi w0, w11, #X86_EFL_OF_BIT, #1
375
376 and w0, w0, ~X86_EFL_AF /* AF is cleared */
377.else
378 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
379 .ifne \a_cBits < 32
380 eor w11, w9, w9, LSR #1
381 lsr w11, w11, #(\a_cBits - 1)
382 .else
383 eor x11, x9, x9, LSR #1
384 lsr x11, x11, #(\a_cBits - 1)
385 .endif
386 bfi w0, w11, #X86_EFL_OF_BIT, #1
387
388 orr w0, w0, X86_EFL_AF /* AF is set */
389.endif
390
39199:
392 ret
393 .cfi_endproc
394.endm
395
396SHL_8_16_32 iemAImpl_shl_u8, 8, 1, b
397SHL_8_16_32 iemAImpl_shl_u8_intel, 8, 1, b
398SHL_8_16_32 iemAImpl_shl_u8_amd, 8, 0, b
399
400SHL_8_16_32 iemAImpl_shl_u16, 16, 1, h
401SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
402SHL_8_16_32 iemAImpl_shl_u16_amd, 16, 0, h
403
404SHL_8_16_32 iemAImpl_shl_u32, 32, 1,
405SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
406SHL_8_16_32 iemAImpl_shl_u32_amd, 32, 0,
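
/* Hedged reference sketch (not part of the build): what SHL_8_16_32 computes for
   the 8-bit case. Shifting in a wider register leaves the last bit shifted out
   at bit position 8, which is exactly x86 CF; the Intel/AMD parameter only
   changes the OF (and AF) convention. A non-zero, pre-masked shift count is
   assumed, as in the macro. Names are illustrative.

#include <stdbool.h>
#include <stdint.h>

typedef struct { uint8_t uResult; bool fCf; bool fOf; } SKETCHSHL8;

static SKETCHSHL8 sketchShlU8(uint8_t uDst, unsigned cShift, bool fIntel)
{
    SKETCHSHL8 Res;
    uint32_t const uWide = (uint32_t)uDst << cShift;            // cShift in 1..31
    Res.uResult = (uint8_t)uWide;
    Res.fCf     = (uWide >> 8) & 1;                             // last bit shifted out
    if (fIntel)
        Res.fOf = ((uDst ^ (uDst << 1)) >> 7) & 1;              // first shift step
    else
        Res.fOf = ((Res.uResult >> 7) & 1) ^ Res.fCf;           // last shift step
    return Res;
}
*/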
407
408/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
409/* uint32_t iemAImpl_shl_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
410.macro SHL_64, a_Name, a_fIntelFlags
411ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
412BEGINPROC_HIDDEN \a_Name
413 .cfi_startproc
414
415 /* Do we need to shift anything at all? */
416 and w2, w2, #0x3f
417 cbz w2, 99f
418
419 /*
420 * Do the shifting
421 */
422 ldr x8, [x1]
423 lslv x9, x8, x2
424 str x9, [x1]
425
426 /*
427 * Calculate EFLAGS.
428 */
429 CALC_EFLAGS_PARITY w0, w9, w11
430
431 ands xzr, x9, x9 /* Sets NZ */
432 mrs x11, NZCV
433 lsr w11, w11, #30 /* N=1; Z=0 */
434 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
435
436 neg w11, w2 /* the shift count is MODed by the data size, so this is safe. */
437 lsrv x11, x8, x11
438 bfi w0, w11, X86_EFL_CF_BIT, 1
439
440.ifne \a_fIntelFlags
441 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
442 eor x11, x8, x8, LSL #1
443 lsr x11, x11, #63
444 bfi w0, w11, #X86_EFL_OF_BIT, #1
445
446 and w0, w0, ~X86_EFL_AF /* AF is cleared */
447.else
448 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
449 eor x11, x11, x9, LSR #63 /* w11[0]=CF from above */
450 bfi w0, w11, #X86_EFL_OF_BIT, #1
451
452 orr w0, w0, X86_EFL_AF /* AF is set */
453.endif
45499:
455 ret
456 .cfi_endproc
457.endm
458
459SHL_64 iemAImpl_shl_u64, 1
460SHL_64 iemAImpl_shl_u64_intel, 1
461SHL_64 iemAImpl_shl_u64_amd, 0
462
463
464/*
465 * Shift Right, Unsigned.
466 */
467
468/* uint32_t iemAImpl_shr_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
469/* uint32_t iemAImpl_shr_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
470/* uint32_t iemAImpl_shr_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
471.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
472ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
473BEGINPROC_HIDDEN \a_Name
474 .cfi_startproc
475
476 /* Do we need to shift anything at all? */
477 and w2, w2, #0x1f
478 cbz w2, 99f
479
480 /*
481 * Do the shifting.
482 */
483 ldr\a_LdStSuff w8, [x1]
484 lsrv w9, w8, w2
485 str\a_LdStSuff w9, [x1]
486
487 /*
488 * Calculate EFLAGS.
489 */
490 sub w11, w2, #1
491 lsrv w11, w8, w11
492 bfxil w0, w11, #X86_EFL_CF_BIT, #1
493
494.ifne \a_fIntelFlags
495 and w0, w0, ~X86_EFL_AF /* AF is cleared */
496 /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
497 lsr w11, w8, #(\a_cBits - 1)
498 bfi w0, w11, #X86_EFL_OF_BIT, #1
499.else
500 orr w0, w0, X86_EFL_AF /* AF is set */
501 /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
502 lsr w11, w9, #(\a_cBits - 2)
503 bfi w0, w11, #X86_EFL_OF_BIT, #1
504.endif
505
506 CALC_EFLAGS_PARITY w0, w9, w11
507
508.ifne \a_cBits < 32
509 setf\a_cBits w9 /* Sets NZ */
510.else
511 ands wzr, w9, w9 /* Sets NZ */
512.endif
513 mrs x11, NZCV
514 lsr w11, w11, #30 /* N=1; Z=0 */
515 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
516
51799:
518 ret
519 .cfi_endproc
520.endm
521
522shr_8_16_32 iemAImpl_shr_u8, 8, 1, b
523shr_8_16_32 iemAImpl_shr_u8_intel, 8, 1, b
524shr_8_16_32 iemAImpl_shr_u8_amd, 8, 0, b
525
526shr_8_16_32 iemAImpl_shr_u16, 16, 1, h
527shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
528shr_8_16_32 iemAImpl_shr_u16_amd, 16, 0, h
529
530shr_8_16_32 iemAImpl_shr_u32, 32, 1,
531shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
532shr_8_16_32 iemAImpl_shr_u32_amd, 32, 0,
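
/* Hedged reference sketch (not part of the build): the flag extraction done by
   shr_8_16_32 for the 32-bit case. CF is the last bit shifted out; Intel takes
   OF from the sign bit of the input, AMD from bit 30 of the result. A shift
   count in 1..31 is assumed, as in the macro. Names are illustrative.

#include <stdbool.h>
#include <stdint.h>

typedef struct { uint32_t uResult; bool fCf; bool fOf; } SKETCHSHR32;

static SKETCHSHR32 sketchShrU32(uint32_t uDst, unsigned cShift, bool fIntel)
{
    SKETCHSHR32 Res;
    Res.uResult = uDst >> cShift;
    Res.fCf     = (uDst >> (cShift - 1)) & 1;                       // last bit shifted out
    Res.fOf     = fIntel ? (uDst >> 31) & 1 : (Res.uResult >> 30) & 1;
    return Res;
}
*/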
533
534/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
535/* uint32_t iemAImpl_shr_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
536.macro shr_64, a_Name, a_fIntelFlags
537ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
538BEGINPROC_HIDDEN \a_Name
539 .cfi_startproc
540
541 /* Do we need to shift anything at all? */
542 ands w2, w2, #0x3f
543 b.eq 99f
544
545 /*
546 * Do the shifting
547 */
548 ldr x8, [x1]
549 lsrv x9, x8, x2
550 str x9, [x1]
551
552 /*
553 * Calculate EFLAGS.
554 */
555 sub w11, w2, #1
556 lsrv x11, x8, x11
557 bfxil w0, w11, #X86_EFL_CF_BIT, #1
558
559.ifne \a_fIntelFlags
560 and w0, w0, ~X86_EFL_AF /* AF is cleared */
561 /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
562 lsr x11, x8, #63
563 bfi w0, w11, #X86_EFL_OF_BIT, #1
564.else
565 orr w0, w0, X86_EFL_AF /* AF is set */
566 /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
567 lsr x11, x9, #62
568 bfi w0, w11, #X86_EFL_OF_BIT, #1
569.endif
570
571 CALC_EFLAGS_PARITY w0, w9, w11
572
573 ands xzr, x9, x9 /* Sets NZ */
574 mrs x11, NZCV
575 lsr w11, w11, #30 /* N=1; Z=0 */
576 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
577
57899:
579 ret
580 .cfi_endproc
581.endm
582
583shr_64 iemAImpl_shr_u64, 1
584shr_64 iemAImpl_shr_u64_intel, 1
585shr_64 iemAImpl_shr_u64_amd, 0
586
587
588/*
589 * Shift Right, Signed
590 */
591
592/* uint32_t iemAImpl_sar_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
593/* uint32_t iemAImpl_sar_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
594/* uint32_t iemAImpl_sar_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
595.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
596ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
597BEGINPROC_HIDDEN \a_Name
598 .cfi_startproc
599
600 /* Do we need to shift anything at all? */
601 and w2, w2, #0x1f
602 cbz w2, 99f
603
604 /*
605 * Do the shifting.
606 */
607 ldr\a_LdSuff w8, [x1] /* Sign-extending for 8 and 16 bits! */
608 asrv w9, w8, w2
609 str\a_StSuff w9, [x1]
610
611 /*
612 * Calculate EFLAGS.
613 */
614 sub w11, w2, #1
615 lsrv w11, w8, w11
616 bfxil w0, w11, #X86_EFL_CF_BIT, #1
617
618.ifne \a_fIntelFlags
619 mov w11, ~(X86_EFL_AF | X86_EFL_OF)
620 and w0, w0, w11 /* AF and OF are cleared */
621.else
622 orr w0, w0, X86_EFL_AF /* AF is set */
623 and w0, w0, ~X86_EFL_OF /* OF is cleared */
624.endif
625
626 CALC_EFLAGS_PARITY w0, w9, w11
627
628.ifne \a_cBits < 32
629 setf\a_cBits w9 /* Sets NZ */
630.else
631 ands wzr, w9, w9 /* Sets NZ */
632.endif
633 mrs x11, NZCV
634 lsr w11, w11, #30 /* N=1; Z=0 */
635 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
636
63799:
638 ret
639 .cfi_endproc
640.endm
641
642sar_8_16_32 iemAImpl_sar_u8, 8, 1, sb, b
643sar_8_16_32 iemAImpl_sar_u8_intel, 8, 1, sb, b
644sar_8_16_32 iemAImpl_sar_u8_amd, 8, 0, sb, b
645
646sar_8_16_32 iemAImpl_sar_u16, 16, 1, sh, h
647sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
648sar_8_16_32 iemAImpl_sar_u16_amd, 16, 0, sh, h
649
650sar_8_16_32 iemAImpl_sar_u32, 32, 1, ,
651sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
652sar_8_16_32 iemAImpl_sar_u32_amd, 32, 0, ,
653
654/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
655/* uint32_t iemAImpl_sar_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
656.macro sar_64, a_Name, a_fIntelFlags
657ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
658BEGINPROC_HIDDEN \a_Name
659 .cfi_startproc
660
661 /* Do we need to shift anything at all? */
662 ands w2, w2, #0x3f
663 b.eq 99f
664
665 /*
666 * Do the shifting
667 */
668 ldr x8, [x1]
669 asrv x9, x8, x2
670 str x9, [x1]
671
672 /*
673 * Calculate EFLAGS.
674 */
675 sub w11, w2, #1
676 lsrv x11, x8, x11
677 bfxil w0, w11, #X86_EFL_CF_BIT, #1
678
679.ifne \a_fIntelFlags
680 mov w11, ~(X86_EFL_AF | X86_EFL_OF)
681 and w0, w0, w11 /* AF and OF are cleared */
682.else
683 orr w0, w0, X86_EFL_AF /* AF is set */
684 and w0, w0, ~X86_EFL_OF /* OF is cleared */
685.endif
686
687 CALC_EFLAGS_PARITY w0, w9, w11
688
689 ands xzr, x9, x9 /* Sets NZ */
690 mrs x11, NZCV
691 lsr w11, w11, #30 /* N=1; Z=0 */
692 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
693
69499:
695 ret
696 .cfi_endproc
697.endm
698
699sar_64 iemAImpl_sar_u64, 1
700sar_64 iemAImpl_sar_u64_intel, 1
701sar_64 iemAImpl_sar_u64_amd, 0
702
703
704/*
705 * Rotate Left.
706 */
707
708/* uint32_t iemAImpl_rol_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
709/* uint32_t iemAImpl_rol_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
710/* uint32_t iemAImpl_rol_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
711.macro ROL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
712ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
713BEGINPROC_HIDDEN \a_Name
714 .cfi_startproc
715
716 /* Do we need to rotate anything at all? */
717 and w2, w2, #0x1f
718 cbz w2, 99f
719
720 /*
721 * Do the shifting
722 */
723.ifne \a_cBits < 32
724 and w2, w2, #(\a_cBits - 1)
725 neg w3, w2 /* the count is MODed by the data size, so this is safe. */
726 ldr\a_LdStSuff w8, [x1]
727 orr w8, w8, w8, LSL #(32 - \a_cBits) /* place a copy of the value at the top of the register, ready to be rotated in */
728 rorv w9, w8, w3
729 str\a_LdStSuff w9, [x1]
730.else
731 neg w3, w2 /* the count is MODed by the data size, so this is safe. */
732 ldr\a_LdStSuff w8, [x1]
733 rorv w9, w8, w3
734 str\a_LdStSuff w9, [x1]
735.endif
736
737 /*
738 * Calculate EFLAGS - only CF and OF.
739 */
740 bfi w0, w9, #0, #1 /* CF = last bit rotated around (new bottom bit) */
741
742.ifne \a_fIntelFlags
743 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
744 eor w11, w8, w8, LSL #1
745 lsr w11, w11, #(\a_cBits - 1)
746 bfi w0, w11, #X86_EFL_OF_BIT, #1
747.else
748 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
749 eor w11, w0, w9, LSR #(\a_cBits - 1)
750 bfi w0, w11, #X86_EFL_OF_BIT, #1
751.endif
752
75399:
754 ret
755 .cfi_endproc
756.endm
757
758ROL_8_16_32 iemAImpl_rol_u8, 8, 1, b
759ROL_8_16_32 iemAImpl_rol_u8_intel, 8, 1, b
760ROL_8_16_32 iemAImpl_rol_u8_amd, 8, 0, b
761
762ROL_8_16_32 iemAImpl_rol_u16, 16, 1, h
763ROL_8_16_32 iemAImpl_rol_u16_intel, 16, 1, h
764ROL_8_16_32 iemAImpl_rol_u16_amd, 16, 0, h
765
766ROL_8_16_32 iemAImpl_rol_u32, 32, 1,
767ROL_8_16_32 iemAImpl_rol_u32_intel, 32, 1,
768ROL_8_16_32 iemAImpl_rol_u32_amd, 32, 0,
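
/* Hedged reference sketch (not part of the build): the narrow-width rotate trick
   in ROL_8_16_32. Copying the value to the top of a 32-bit register lets a
   plain 32-bit rotate produce the rotated 8-bit value in the low byte, with the
   bit just rotated into bit 0 being x86 CF. A non-zero (pre-0x1f-masked) count
   is assumed, as in the macro. Names are illustrative.

#include <stdbool.h>
#include <stdint.h>

static uint8_t sketchRolU8(uint8_t uDst, unsigned cShift, bool *pfCf)
{
    unsigned const cMasked = cShift & 7;                            // same masking as the macro
    uint32_t       uWide   = uDst | ((uint32_t)uDst << 24);         // replicate into the top byte
    uWide = (uWide << cMasked) | (uWide >> ((32 - cMasked) & 31));  // 32-bit rotate left
    *pfCf = uWide & 1;                                              // last bit rotated around
    return (uint8_t)uWide;
}
*/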
769
770/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
771/* uint32_t iemAImpl_rol_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
772.macro ROL_64, a_Name, a_fIntelFlags
773ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
774BEGINPROC_HIDDEN \a_Name
775 .cfi_startproc
776
777 /* Do we need to shift anything at all? */
778 and w2, w2, #0x3f
779 cbz w2, 99f
780
781 /*
782 * Do the shifting
783 */
784 neg w3, w2
785 ldr x8, [x1]
786 rorv x9, x8, x3
787 str x9, [x1]
788
789 /*
790 * Calculate EFLAGS - only CF and OF.
791 */
792 bfi w0, w9, #0, #1 /* CF = last bit rotated around */
793
794.ifne \a_fIntelFlags
795 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
796 eor x11, x8, x8, LSL #1
797 lsr x11, x11, #(64 - 1)
798 bfi w0, w11, #X86_EFL_OF_BIT, #1
799.else
800 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
801 eor x11, x0, x9, LSR #(64 - 1)
802 bfi w0, w11, #X86_EFL_OF_BIT, #1
803.endif
804
80599:
806 ret
807 .cfi_endproc
808.endm
809
810ROL_64 iemAImpl_rol_u64, 1
811ROL_64 iemAImpl_rol_u64_intel, 1
812ROL_64 iemAImpl_rol_u64_amd, 0
813
814
815/*
816 * Rotate Right.
817 */
818
819/* uint32_t iemAImpl_ror_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
820/* uint32_t iemAImpl_ror_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
821/* uint32_t iemAImpl_ror_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
822.macro ROR_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
823ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
824BEGINPROC_HIDDEN \a_Name
825 .cfi_startproc
826
827 /* Do we need to rotate anything at all? */
828 and w2, w2, #0x1f
829 cbz w2, 99f
830
831 /*
832 * Do the shifting
833 */
834.ifne \a_cBits < 32
835 and w2, w2, #(\a_cBits - 1)
836 ldr\a_LdStSuff w8, [x1]
837 orr w8, w8, w8, LSL #(\a_cBits) /* duplicate value above, so it is ready to be shifted in. */
838 lsrv w9, w8, w2
839 str\a_LdStSuff w9, [x1]
840.else
841 ldr\a_LdStSuff w8, [x1]
842 rorv w9, w8, w2
843 str\a_LdStSuff w9, [x1]
844.endif
845
846 /*
847 * Calculate EFLAGS - only CF and OF.
848 */
849 bfxil w0, w9, #(\a_cBits - 1), #1 /* CF = last bit rotated around (new top bit) */
850
851.ifne \a_fIntelFlags
852 /* Intel: OF = first rotate step: X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); */
853 eor w11, w8, w8, LSR #(\a_cBits - 1)
854 bfi w0, w11, #X86_EFL_OF_BIT, #1
855.else
856 /* AMD: OF = last rotate step: fEFlags |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; */
857 eor w11, w0, w9, LSR #(\a_cBits - 2)
858 bfi w0, w11, #X86_EFL_OF_BIT, #1
859.endif
860
86199:
862 ret
863 .cfi_endproc
864.endm
865
866ROR_8_16_32 iemAImpl_ror_u8, 8, 1, b
867ROR_8_16_32 iemAImpl_ror_u8_intel, 8, 1, b
868ROR_8_16_32 iemAImpl_ror_u8_amd, 8, 0, b
869
870ROR_8_16_32 iemAImpl_ror_u16, 16, 1, h
871ROR_8_16_32 iemAImpl_ror_u16_intel, 16, 1, h
872ROR_8_16_32 iemAImpl_ror_u16_amd, 16, 0, h
873
874ROR_8_16_32 iemAImpl_ror_u32, 32, 1,
875ROR_8_16_32 iemAImpl_ror_u32_intel, 32, 1,
876ROR_8_16_32 iemAImpl_ror_u32_amd, 32, 0,
877
878/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
879/* uint32_t iemAImpl_ror_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
880.macro ROR_64, a_Name, a_fIntelFlags
881ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
882BEGINPROC_HIDDEN \a_Name
883 .cfi_startproc
884
885 /* Do we need to shift anything at all? */
886 and w2, w2, #0x3f
887 cbz w2, 99f
888
889 /*
890 * Do the shifting
891 */
892 ldr x8, [x1]
893 rorv x9, x8, x2
894 str x9, [x1]
895
896 /*
897 * Calculate EFLAGS - only CF and OF.
898 */
899 bfxil x0, x9, #(64 - 1), #1 /* CF = last bit rotated around (new top bit) */
900
901.ifne \a_fIntelFlags
902 /* Intel: OF = first rotate step: X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); */
903 eor x11, x8, x8, LSR #(64 - 1)
904 bfi w0, w11, #X86_EFL_OF_BIT, #1
905.else
906 /* AMD: OF = last rotate step: fEFlags |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; */
907 eor x11, x0, x9, LSR #(64 - 2)
908 bfi w0, w11, #X86_EFL_OF_BIT, #1
909.endif
910
91199:
912 ret
913 .cfi_endproc
914.endm
915
916ROR_64 iemAImpl_ror_u64, 1
917ROR_64 iemAImpl_ror_u64_intel, 1
918ROR_64 iemAImpl_ror_u64_amd, 0
919
920
921/*
922 * Rotate Left thru Carry.
923 */
924
925/* uint32_t iemAImpl_rcl_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
926/* uint32_t iemAImpl_rcl_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
927/* uint32_t iemAImpl_rcl_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
928.macro RCL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
929ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
930BEGINPROC_HIDDEN \a_Name
931 .cfi_startproc
932
933 /* Do we need to rotate anything at all? */
934 and w2, w2, #0x1f
935.ifne \a_cBits >= 32
936 cbz w2, 99f
937.else
938 .ifeq \a_fIntelFlags
939 cbz w2, 99f /* AMD */
940 .endif
941
942 /*
943 * 8 and 16 bit: w2 = w2 % (a_cBits + 1).
944 *
945 * Given that the w2 range is 0 thru 31, the 16-bit case can be reduced
946 * to:
947 * w2 = w2 >= 17 ? w2 - 17 : w2
948 *
949 * In the 8-bit scenario we're modding with 9, so we need to do it in
950 * two steps:
951 * w2 = w2 >= 18 ? w2 - 18 : w2
952 * w2 = w2 >= 9 ? w2 - 9 : w2
953 *
954 * For comparison clang generates the following for 16-bit:
955 * mov w9, #0xf0f0f0f1
956 * umull x9, w2, w9
957 * lsr x9, x9, #36
958 * bfi w9, w9, #4, #1
959 * sub w2, w2, w9
960 *
961 * The 8-bit variant differs only in the constants used:
962 * mov w9, #0x38e38e39
963 * umull x9, w2, w9
964 * lsr x9, x9, #33
965 * bfi w9, w9, #3, #2
966 * subs w8, w2, w9
967 */
968 .ifne \a_cBits == 16
969 subs w3, w2, #17
970 csel w2, w3, w2, hs
971 .else
972 subs w3, w2, #18
973 csel w2, w3, w2, hs
974 subs w3, w2, #9
975 csel w2, w3, w2, hs
976 .endif
977 .ifne \a_fIntelFlags
978 cbz w2, 99f /* Intel: Skip everything if the modded rotate count is zero. */
979 .endif
980.endif
981
982 /*
983 * Do the rotating: x9 = RORV(w8[0:a_cBits-1] | (CF << 63) | (w8[1:a_cBits-1] << (64-a_cBits-1)) | (CF << a_cBits), -w2)
984 */
985 neg w2, w2 /* w2 = rorv count - this will be masked by 0x3f so it's the same as 64-w2. */
986
987 ldr\a_LdStSuff w8, [x1]
988 .ifne \a_cBits < 32
989 orr x8, x8, x8, LSL #(64 - \a_cBits - 1)
990 .ifeq \a_fIntelFlags
991 bfi x8, x0, #(\a_cBits), #1 /* AMD: w8[a_cBits] = CF; Avoids conditional branch for CF calc to cover cShift==0. */
992 .endif
993 .else
994 lsr w9, w8, #1
995 orr x8, x8, x9, LSL #(64 - \a_cBits)
996 .endif
997 bfi x8, x0, #63, #1 /* w8[63] = CF */
998 rorv x9, x8, x2
999 str\a_LdStSuff w9, [x1]
1000
1001 /*
1002 * Calculate EFLAGS - only CF and OF.
1003 */
1004 bfxil x0, x9, #(\a_cBits), #1 /* CF = last bit rotated 'out' */
1005
1006.ifne \a_fIntelFlags
1007 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1008 eor w11, w8, w8, LSL #1
1009 lsr w11, w11, #(\a_cBits - 1)
1010 bfi w0, w11, #X86_EFL_OF_BIT, #1
1011.else
1012 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1013 eor w11, w0, w9, LSR #(\a_cBits - 1)
1014 bfi w0, w11, #X86_EFL_OF_BIT, #1
1015.endif
1016
101799:
1018 ret
1019 .cfi_endproc
1020.endm
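
/* Hedged reference sketch (not part of the build): the branch-free rotate count
   reduction used above for 8- and 16-bit RCL (and by RCR below). RCL/RCR rotate
   through CF, so the effective count is cShift % (cBits + 1); with cShift
   already masked to 0..31 that modulo reduces to at most two conditional
   subtractions (subs + csel in the asm, plain conditionals here). Names are
   illustrative.

#include <assert.h>

static unsigned sketchRclCount16(unsigned cShift)   // cShift in 0..31
{
    if (cShift >= 17)
        cShift -= 17;
    return cShift;                                  // == cShift % 17
}

static unsigned sketchRclCount8(unsigned cShift)    // cShift in 0..31
{
    if (cShift >= 18)
        cShift -= 18;
    if (cShift >= 9)
        cShift -= 9;
    return cShift;                                  // == cShift % 9
}

static void sketchRclCountCheck(void)
{
    for (unsigned c = 0; c < 32; c++)
    {
        assert(sketchRclCount16(c) == c % 17);
        assert(sketchRclCount8(c)  == c % 9);
    }
}
*/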
1021
1022RCL_8_16_32 iemAImpl_rcl_u8, 8, 1, b
1023RCL_8_16_32 iemAImpl_rcl_u8_intel, 8, 1, b
1024RCL_8_16_32 iemAImpl_rcl_u8_amd, 8, 0, b
1025
1026RCL_8_16_32 iemAImpl_rcl_u16, 16, 1, h
1027RCL_8_16_32 iemAImpl_rcl_u16_intel, 16, 1, h
1028RCL_8_16_32 iemAImpl_rcl_u16_amd, 16, 0, h
1029
1030RCL_8_16_32 iemAImpl_rcl_u32, 32, 1,
1031RCL_8_16_32 iemAImpl_rcl_u32_intel, 32, 1,
1032RCL_8_16_32 iemAImpl_rcl_u32_amd, 32, 0,
1033
1034/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
1035/* uint32_t iemAImpl_rcl_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
1036.macro RCL_64, a_Name, a_fIntelFlags
1037ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
1038BEGINPROC_HIDDEN \a_Name
1039 .cfi_startproc
1040
1041 /* Do we need to shift anything at all? */
1042 and w2, w2, #0x3f
1043 cbz w2, 99f /** @todo eliminate this for < 32 shift with intel flags */
1044
1045 /*
1046 * Do the rotating: (w8 << w2) | (CF << (w2 - 1)) | (w2 > 1 ? (w8 >> (64 - w2 + 1)) : 0)
1047 */
1048 and w3, w0, #X86_EFL_CF
1049 subs w4, w2, #1 /* Also: prep for 'w2 > 1' (w2 can't be zero, btw) - think: cmp w2, #1 */
1050 lslv x3, x3, x4 /* x3 = CF << (w2 - 1) */
1051
1052 mov w4, #(64 + 1)
1053 sub w4, w4, w2 /* w4 = 64 - w2 + 1 */
1054
1055 ldr x8, [x1]
1056 lslv x9, x8, x2
1057 lsrv x10, x8, x4
1058 csel x10, xzr, x10, eq /* if w2 == 1: x10 = 0; else: x10 = x8 >> (64 - w2 + 1); */
1059 orr x9, x9, x3 /* shifted CF */
1060 orr x9, x9, x10
1061 str x9, [x1]
1062
1063 /*
1064 * Calculate EFLAGS - only CF and OF.
1065 */
1066 neg x11, x2
1067 lsr x11, x8, x11
1068 bfi w0, w11, #0, #1 /* CF = last bit rotated out. */
1069
1070.ifne \a_fIntelFlags
1071 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1072 eor x11, x8, x8, LSL #1
1073 lsr x11, x11, #(64 - 1)
1074 bfi w0, w11, #X86_EFL_OF_BIT, #1
1075.else
1076 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1077 eor x11, x0, x9, LSR #(64 - 1)
1078 bfi w0, w11, #X86_EFL_OF_BIT, #1
1079.endif
1080
108199:
1082 ret
1083 .cfi_endproc
1084.endm
1085
1086RCL_64 iemAImpl_rcl_u64, 1
1087RCL_64 iemAImpl_rcl_u64_intel, 1
1088RCL_64 iemAImpl_rcl_u64_amd, 0
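
/* Hedged reference sketch (not part of the build): the 64-bit rotate through
   carry that RCL_64 assembles from three pieces, for counts 1..63 (a count of
   zero returns early in the asm). The incoming CF becomes bit (cShift - 1) of
   the result and the outgoing CF is the last bit rotated out of the
   destination. Names are illustrative.

#include <stdbool.h>
#include <stdint.h>

static uint64_t sketchRclU64(uint64_t uDst, unsigned cShift, bool *pfCf)
{
    bool const fInCf   = *pfCf;
    uint64_t   uResult = uDst << cShift;
    uResult |= (uint64_t)fInCf << (cShift - 1);
    if (cShift > 1)
        uResult |= uDst >> (64 - cShift + 1);
    *pfCf = (uDst >> (64 - cShift)) & 1;            // last bit rotated out
    return uResult;
}
*/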
1089
1090
1091/*
1092 * Rotate Right thru Carry.
1093 */
1094
1095/* uint32_t iemAImpl_rcr_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
1096/* uint32_t iemAImpl_rcr_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
1097/* uint32_t iemAImpl_rcr_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
1098.macro RCR_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
1099ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
1100BEGINPROC_HIDDEN \a_Name
1101 .cfi_startproc
1102
1103 /* Do we need to rotate anything at all? */
1104 and w2, w2, #0x1f
1105.ifne \a_cBits >= 32
1106 cbz w2, 99f
1107.else
1108 .ifeq \a_fIntelFlags
1109 cbz w2, 99f /* AMD */
1110 .endif
1111
1112 /*
1113 * 8 and 16 bit: w2 = w2 % (a_cBits + 1). (See RCL for details.)
1114 */
1115 .ifne \a_cBits == 16
1116 subs w3, w2, #17
1117 csel w2, w3, w2, hs
1118 .else
1119 subs w3, w2, #18
1120 csel w2, w3, w2, hs
1121 subs w3, w2, #9
1122 csel w2, w3, w2, hs
1123 .endif
1124 .ifne \a_fIntelFlags
1125 cbz w2, 99f /* Intel: Skip everything if the modded rotate count is zero. */
1126 .endif
1127.endif
1128
1129 /*
1130 * Do the rotating: x9 = RORV(x8[0:a_cBits-1] | (CF << a_cBits) | ((x8 << (a_cBits + 2)) >> 1) | (CF << 63), x2)
1131 */
1132 add w3, w2, #1 /* w3 = w2 + 1 */
1133
1134 subs w4, w2, #1
1135 mov w5, #(\a_cBits)
1136 csel w4, w5, w5, lo /* w4 = w2 >= 1 ? w2 - 1 : a_cBits - for CF extraction */
1137
1138 ldr\a_LdStSuff w8, [x1]
1139 bfi x8, x0, #(\a_cBits), #1 /* Put CF above the input. */
1140 bfi x8, x8, #(\a_cBits + 1), #(64 - \a_cBits - 1) /* Repeat the register content above that. */
1141.ifne \a_cBits < 32
1142 .ifeq \a_fIntelFlags
1143 bfi x8, x0, #63, #1 /* AMD 8- and 16-bit: Put CF at the very top so w2 == 0 works w/o branching. */
1144 .endif
1145.endif
1146 rorv x9, x8, x2
1147 str\a_LdStSuff w9, [x1]
1148
1149 /*
1150 * Calculate EFLAGS - only CF and OF.
1151 */
1152 bfxil x0, x9, #63, #1 /* CF = last bit rotated 'out' */
1153
1154.ifne \a_fIntelFlags
1155 /* Intel: OF = first rotate step: fEFlags |= (fInCarry ^ (uint32_t)(uDst >> (a_cBits - 1))) << X86_EFL_OF_BIT; */
1156 eor x11, x8, x8, LSR #1 /* We've got CF in bit #a_cBits in x8 */
1157 lsr w11, w11, #(\a_cBits - 1)
1158 bfi w0, w11, #X86_EFL_OF_BIT, #1
1159.else
1160 /* AMD: OF = last rotate step: fEFlags |= X86_EFL_GET_OF_ ## a_cBits(uResult ^ (uResult << 1)); */
1161 eor w11, w9, w9, LSL #1
1162 lsr w11, w11, #(\a_cBits - 1)
1163 bfi w0, w11, #X86_EFL_OF_BIT, #1
1164.endif
1165
116699:
1167 ret
1168 .cfi_endproc
1169.endm
1170
1171RCR_8_16_32 iemAImpl_rcr_u8, 8, 1, b
1172RCR_8_16_32 iemAImpl_rcr_u8_intel, 8, 1, b
1173RCR_8_16_32 iemAImpl_rcr_u8_amd, 8, 0, b
1174
1175RCR_8_16_32 iemAImpl_rcr_u16, 16, 1, h
1176RCR_8_16_32 iemAImpl_rcr_u16_intel, 16, 1, h
1177RCR_8_16_32 iemAImpl_rcr_u16_amd, 16, 0, h
1178
1179RCR_8_16_32 iemAImpl_rcr_u32, 32, 1,
1180RCR_8_16_32 iemAImpl_rcr_u32_intel, 32, 1,
1181RCR_8_16_32 iemAImpl_rcr_u32_amd, 32, 0,
1182
1183/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
1184/* uint32_t iemAImpl_rcr_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
1185.macro RCR_64, a_Name, a_fIntelFlags
1186ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
1187BEGINPROC_HIDDEN \a_Name
1188 .cfi_startproc
1189
1190 /* Do we need to shift anything at all? */
1191 and w2, w2, #0x3f
1192 cbz w2, 99f
1193
1194 /*
1195 * Do the rotating: (w8 >> w2) | (CF << (64 - w2)) | (w2 > 1 ? (w8 << (64 - w2 + 1)) : 0)
1196 */
1197 and w5, w0, #X86_EFL_CF /* x5 = input CF - for intel OF calc */
1198 neg w4, w2
1199 lslv x3, x5, x4 /* x3 = CF << (64 - w2) */
1200
1201 cmp w2, #1 /* prep for w2 > 1 */
1202 add w4, w4, #1 /* w4 = -w2 + 1; which when & 0x3f =^= 64 - w2 + 1 */
1203
1204 ldr x8, [x1]
1205 lsrv x9, x8, x2
1206 lslv x10, x8, x4
1207 csel x10, xzr, x10, eq /* if w2 == 1: x10 = 0; else: x10 = x8 << (64 - w2 + 1); */
1208 orr x9, x9, x3 /* shifted CF */
1209 orr x9, x9, x10
1210 str x9, [x1]
1211
1212 /*
1213 * Calculate EFLAGS - only CF and OF.
1214 */
1215 sub x11, x2, #1
1216 lsr x11, x8, x11
1217 bfi w0, w11, #0, #1 /* CF = last bit rotated out. */
1218
1219.ifne \a_fIntelFlags
1220 /* Intel: OF = first rotate step: fEFlags |= (fInCarry ^ (uint32_t)(uDst >> (a_cBits - 1))) << X86_EFL_OF_BIT; */
1221 eor x11, x5, x8, LSR #63
1222 bfi w0, w11, #X86_EFL_OF_BIT, #1
1223.else
1224 /* AMD: OF = last rotate step: fEFlags |= X86_EFL_GET_OF_ ## a_cBits(uResult ^ (uResult << 1)); */
1225 eor x11, x9, x9, LSL #1
1226 lsr x11, x11, #(64 - 1)
1227 bfi w0, w11, #X86_EFL_OF_BIT, #1
1228.endif
1229
123099:
1231 ret
1232 .cfi_endproc
1233.endm
1234
1235RCR_64 iemAImpl_rcr_u64, 1
1236RCR_64 iemAImpl_rcr_u64_intel, 1
1237RCR_64 iemAImpl_rcr_u64_amd, 0
1238