VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@100594

Last change on this file since 100594 was 100340, checked in by vboxsync, 17 months ago

VMM/IEMAllAImpl.asm: IBT endbr64/32 and notrack related changes. bugref:10406 ticketref:21435

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 187.4 KB
1; $Id: IEMAllAImpl.asm 100340 2023-07-02 22:50:12Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1
48 %else
49 ret
50 %endif
51%else
52 ret
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78 %ifdef ASM_FORMAT_PE
79 export %1=NAME_FASTCALL(%1,%2,$@)
80 %endif
81 %ifdef __NASM__
82 %ifdef ASM_FORMAT_OMF
83 export NAME(%1) NAME_FASTCALL(%1,%2,$@)
84 %endif
85 %endif
86 %ifndef ASM_FORMAT_BIN
87 global NAME_FASTCALL(%1,%2,$@)
88 %endif
89NAME_FASTCALL(%1,%2,@):
90 IBT_ENDBRxx
91%endmacro
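;
; Illustration only (not assembled, omitting the PE/OMF export lines): roughly
; what BEGINPROC_FASTCALL expands to for one of the functions generated further
; down, assuming NAME() merely applies the platform underscore decoration.
;
;   BEGINPROC_FASTCALL iemAImpl_add_u8, 12
;
;   ; 32-bit Windows (fastcall decoration; the '$' just escapes the '@' for yasm):
;   global $@iemAImpl_add_u8@12
;   @iemAImpl_add_u8@12:
;           IBT_ENDBRxx
;
;   ; all other targets:
;   global NAME(iemAImpl_add_u8)
;   NAME(iemAImpl_add_u8):
;           IBT_ENDBRxx
;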
92
93
94;
95; We employ some macro assembly here to hide the calling convention differences.
96;
97%ifdef RT_ARCH_AMD64
98 %macro PROLOGUE_1_ARGS 0
99 %endmacro
100 %macro EPILOGUE_1_ARGS 0
101 ret
102 %endmacro
103 %macro EPILOGUE_1_ARGS_EX 0
104 ret
105 %endmacro
106
107 %macro PROLOGUE_2_ARGS 0
108 %endmacro
109 %macro EPILOGUE_2_ARGS 0
110 ret
111 %endmacro
112 %macro EPILOGUE_2_ARGS_EX 1
113 ret
114 %endmacro
115
116 %macro PROLOGUE_3_ARGS 0
117 %endmacro
118 %macro EPILOGUE_3_ARGS 0
119 ret
120 %endmacro
121 %macro EPILOGUE_3_ARGS_EX 1
122 ret
123 %endmacro
124
125 %macro PROLOGUE_4_ARGS 0
126 %endmacro
127 %macro EPILOGUE_4_ARGS 0
128 ret
129 %endmacro
130 %macro EPILOGUE_4_ARGS_EX 1
131 ret
132 %endmacro
133
134 %ifdef ASM_CALL64_GCC
135 %define A0 rdi
136 %define A0_32 edi
137 %define A0_16 di
138 %define A0_8 dil
139
140 %define A1 rsi
141 %define A1_32 esi
142 %define A1_16 si
143 %define A1_8 sil
144
145 %define A2 rdx
146 %define A2_32 edx
147 %define A2_16 dx
148 %define A2_8 dl
149
150 %define A3 rcx
151 %define A3_32 ecx
152 %define A3_16 cx
153 %endif
154
155 %ifdef ASM_CALL64_MSC
156 %define A0 rcx
157 %define A0_32 ecx
158 %define A0_16 cx
159 %define A0_8 cl
160
161 %define A1 rdx
162 %define A1_32 edx
163 %define A1_16 dx
164 %define A1_8 dl
165
166 %define A2 r8
167 %define A2_32 r8d
168 %define A2_16 r8w
169 %define A2_8 r8b
170
171 %define A3 r9
172 %define A3_32 r9d
173 %define A3_16 r9w
174 %endif
175
176 %define T0 rax
177 %define T0_32 eax
178 %define T0_16 ax
179 %define T0_8 al
180
181 %define T1 r11
182 %define T1_32 r11d
183 %define T1_16 r11w
184 %define T1_8 r11b
185
186 %define T2 r10 ; only AMD64
187 %define T2_32 r10d
188 %define T2_16 r10w
189 %define T2_8 r10b
190
191%else
192 ; x86
193 %macro PROLOGUE_1_ARGS 0
194 push edi
195 %endmacro
196 %macro EPILOGUE_1_ARGS 0
197 pop edi
198 ret 0
199 %endmacro
200 %macro EPILOGUE_1_ARGS_EX 1
201 pop edi
202 ret %1
203 %endmacro
204
205 %macro PROLOGUE_2_ARGS 0
206 push edi
207 %endmacro
208 %macro EPILOGUE_2_ARGS 0
209 pop edi
210 ret 0
211 %endmacro
212 %macro EPILOGUE_2_ARGS_EX 1
213 pop edi
214 ret %1
215 %endmacro
216
217 %macro PROLOGUE_3_ARGS 0
218 push ebx
219 mov ebx, [esp + 4 + 4]
220 push edi
221 %endmacro
222 %macro EPILOGUE_3_ARGS_EX 1
223 %if (%1) < 4
224 %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
225 %endif
226 pop edi
227 pop ebx
228 ret %1
229 %endmacro
230 %macro EPILOGUE_3_ARGS 0
231 EPILOGUE_3_ARGS_EX 4
232 %endmacro
233
234 %macro PROLOGUE_4_ARGS 0
235 push ebx
236 push edi
237 push esi
238 mov ebx, [esp + 12 + 4 + 0]
239 mov esi, [esp + 12 + 4 + 4]
240 %endmacro
241 %macro EPILOGUE_4_ARGS_EX 1
242 %if (%1) < 8
243 %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
244 %endif
245 pop esi
246 pop edi
247 pop ebx
248 ret %1
249 %endmacro
250 %macro EPILOGUE_4_ARGS 0
251 EPILOGUE_4_ARGS_EX 8
252 %endmacro
253
254 %define A0 ecx
255 %define A0_32 ecx
256 %define A0_16 cx
257 %define A0_8 cl
258
259 %define A1 edx
260 %define A1_32 edx
261 %define A1_16 dx
262 %define A1_8 dl
263
264 %define A2 ebx
265 %define A2_32 ebx
266 %define A2_16 bx
267 %define A2_8 bl
268
269 %define A3 esi
270 %define A3_32 esi
271 %define A3_16 si
272
273 %define T0 eax
274 %define T0_32 eax
275 %define T0_16 ax
276 %define T0_8 al
277
278 %define T1 edi
279 %define T1_32 edi
280 %define T1_16 di
281%endif
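;
; Quick reference, derived from the defines above: how the generic argument and
; temporary registers map onto each supported calling convention.
;
;           AMD64 SysV (GCC)    AMD64 Windows (MSC)     32-bit x86 fastcall
;   A0      rdi                 rcx                     ecx
;   A1      rsi                 rdx                     edx
;   A2      rdx                 r8                      ebx (stack, PROLOGUE_3/4_ARGS)
;   A3      rcx                 r9                      esi (stack, PROLOGUE_4_ARGS)
;   T0      rax                 rax                     eax
;   T1      r11                 r11                     edi
;   T2      r10                 r10                     n/a (AMD64 only)
;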
282
283
284;;
285; Load the relevant flags from [%1] if there are undefined flags (%3).
286;
287; @remarks Clobbers T0, stack. Changes EFLAGS.
288; @param A2 The register pointing to the flags.
289; @param 1 The parameter (A0..A3) pointing to the eflags.
290; @param 2 The set of modified flags.
291; @param 3 The set of undefined flags.
292;
293%macro IEM_MAYBE_LOAD_FLAGS 3
294 ;%if (%3) != 0
295 pushf ; store current flags
296 mov T0_32, [%1] ; load the guest flags
297 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
298 and T0_32, (%2 | %3) ; select the modified and undefined flags.
299 or [xSP], T0 ; merge guest flags with host flags.
300 popf ; load the mixed flags.
301 ;%endif
302%endmacro
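;
; Illustration only: a typical invocation such as
;       IEM_MAYBE_LOAD_FLAGS A2, X86_EFL_ZF, 0
; expands to roughly the sequence below, merging the guest ZF into the live
; host EFLAGS so the emulating instruction sees the guest value:
;
;       pushf                                   ; host flags onto the stack
;       mov     T0_32, [A2]                     ; guest eflags
;       and     dword [xSP], ~(X86_EFL_ZF | 0)  ; drop the host copies of the interesting bits
;       and     T0_32, (X86_EFL_ZF | 0)         ; keep only the interesting guest bits
;       or      [xSP], T0                       ; merge
;       popf                                    ; load the mixed flags
;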
303
304;;
305; Load the relevant flags from [%1].
306;
307; @remarks Clobbers T0, stack. Changes EFLAGS.
308; @param A2 The register pointing to the flags.
309; @param 1 The parameter (A0..A3) pointing to the eflags.
310; @param 2 The set of flags to load.
311; @param 3 The set of undefined flags.
312;
313%macro IEM_LOAD_FLAGS 3
314 pushf ; store current flags
315 mov T0_32, [%1] ; load the guest flags
316 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
317 and T0_32, (%2 | %3) ; select the modified and undefined flags.
318 or [xSP], T0 ; merge guest flags with host flags.
319 popf ; load the mixed flags.
320%endmacro
321
322;;
323; Update the flags.
324;
325; @remarks Clobbers T0, T1, stack.
326; @param 1 The register pointing to the EFLAGS.
327; @param 2 The mask of modified flags to save.
328; @param 3 The mask of undefined flags to (maybe) save.
329;
330%macro IEM_SAVE_FLAGS 3
331 %if (%2 | %3) != 0
332 pushf
333 pop T1
334 mov T0_32, [%1] ; flags
335 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
336 and T1_32, (%2 | %3) ; select the modified and undefined flags.
337 or T0_32, T1_32 ; combine the flags.
338 mov [%1], T0_32 ; save the flags.
339 %endif
340%endmacro
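;
; Illustration only, the reverse direction: IEM_SAVE_FLAGS A2, X86_EFL_ZF, 0
; expands to roughly the sequence below, copying the host ZF produced by the
; emulating instruction back into the guest eflags dword without disturbing
; the other guest flag bits:
;
;       pushf
;       pop     T1                              ; host flags after the instruction
;       mov     T0_32, [A2]                     ; guest eflags
;       and     T0_32, ~(X86_EFL_ZF | 0)        ; clear the bits we are about to update
;       and     T1_32, (X86_EFL_ZF | 0)         ; keep only the interesting host bits
;       or      T0_32, T1_32                    ; combine
;       mov     [A2], T0_32                     ; save
;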
341
342;;
343; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
344;
345; @remarks Clobbers T0, T1, stack.
346; @param 1 The register pointing to the EFLAGS.
347; @param 2 The mask of modified flags to save.
348; @param 3 Mask of additional flags to always clear
349; @param 4 Mask of additional flags to always set.
350;
351%macro IEM_SAVE_AND_ADJUST_FLAGS 4
352 %if (%2 | %3 | %4) != 0
353 pushf
354 pop T1
355 mov T0_32, [%1] ; load flags.
356 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
357 and T1_32, (%2) ; select the modified flags.
358 or T0_32, T1_32 ; combine the flags.
359 %if (%4) != 0
360 or T0_32, %4 ; add the always set flags.
361 %endif
362 mov [%1], T0_32 ; save the result.
363 %endif
364%endmacro
365
366;;
367; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
368; signed input (%4[%5]) and parity index (%6).
369;
370; This is used by MUL and IMUL, where the result (%4 & %6) ends up in xAX, which
371; is also T0. So we have to use T1 for the EFLAGS calculation and either save
372; T0/xAX while extracting the %2 flags from the CPU EFLAGS or use T2 (AMD64 only).
373;
374; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
375; @param 1 The register pointing to the EFLAGS.
376; @param 2 The mask of modified flags to save.
377; @param 3 Mask of additional flags to always clear
378; @param 4 The result register to set SF by.
379; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
380; @param 6 The (full) register containing the parity table index. Will be modified!
381
382%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
383 %ifdef RT_ARCH_AMD64
384 pushf
385 pop T2
386 %else
387 push T0
388 pushf
389 pop T0
390 %endif
391 mov T1_32, [%1] ; load flags.
392 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
393 %ifdef RT_ARCH_AMD64
394 and T2_32, (%2) ; select the modified flags.
395 or T1_32, T2_32 ; combine the flags.
396 %else
397 and T0_32, (%2) ; select the modified flags.
398 or T1_32, T0_32 ; combine the flags.
399 pop T0
400 %endif
401
402 ; First calculate SF as it's likely to be referring to the same register as %6.
403 bt %4, %5 - 1
404 jnc %%sf_clear
405 or T1_32, X86_EFL_SF
406 %%sf_clear:
407
408 ; Parity last.
409 and %6, 0xff
410 %ifdef RT_ARCH_AMD64
411 lea T2, [NAME(g_afParity) xWrtRIP]
412 or T1_8, [T2 + %6]
413 %else
414 or T1_8, [NAME(g_afParity) + %6]
415 %endif
416
417 mov [%1], T1_32 ; save the result.
418%endmacro
419
420;;
421; Calculates the new EFLAGS using fixed clear and set bit masks.
422;
423; @remarks Clobbers T0.
424; @param 1 The register pointing to the EFLAGS.
425; @param 2 Mask of additional flags to always clear
426; @param 3 Mask of additional flags to always set.
427;
428%macro IEM_ADJUST_FLAGS 3
429 %if (%2 | %3) != 0
430 mov T0_32, [%1] ; Load flags.
431 %if (%2) != 0
432 and T0_32, ~(%2) ; Remove the always cleared flags.
433 %endif
434 %if (%3) != 0
435 or T0_32, %3 ; Add the always set flags.
436 %endif
437 mov [%1], T0_32 ; Save the result.
438 %endif
439%endmacro
440
441;;
442; Calculates the new EFLAGS using fixed clear and set bit masks, computing PF from %4.
443;
444; @remarks Clobbers T0, %4, EFLAGS.
445; @param 1 The register pointing to the EFLAGS.
446; @param 2 Mask of additional flags to always clear
447; @param 3 Mask of additional flags to always set.
448; @param 4 The (full) register containing the parity table index. Will be modified!
449;
450%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
451 mov T0_32, [%1] ; Load flags.
452 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
453 %if (%3) != 0
454 or T0_32, %3 ; Add the always set flags.
455 %endif
456 and %4, 0xff
457 %ifdef RT_ARCH_AMD64
458 lea T2, [NAME(g_afParity) xWrtRIP]
459 or T0_8, [T2 + %4]
460 %else
461 or T0_8, [NAME(g_afParity) + %4]
462 %endif
463 mov [%1], T0_32 ; Save the result.
464%endmacro
465
466
467;;
468; Checks that the size expression %1 matches %2 adjusted according to
469; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
470; @param 1 The jump array size assembly expression.
471; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
472;
473%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
474 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
475 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
476 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
477 %else
478 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
479 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
480 %endif
481%endmacro
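;
; Worked example of the check above (non-IBT case): with %2 = 0x1000 and an
; actual array size %1 of 0x1001 (one byte too big), the first operand becomes
; (0xffff - 0x1000) + 0x1001 = 0x10000, which no longer fits in a word and so
; draws an assembler truncation warning; the second operand catches undersized
; arrays the same way.
;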
482
483
484;*********************************************************************************************************************************
485;* External Symbols *
486;*********************************************************************************************************************************
487extern NAME(g_afParity)
488
489
490;;
491; Macro for implementing a binary operator.
492;
493; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
494; variants, except on 32-bit systems where the 64-bit accesses require hand
495; coding.
496;
497; All the functions take a pointer to the destination memory operand in A0,
498; the source register operand in A1 and a pointer to eflags in A2.
499;
500; @param 1 The instruction mnemonic.
501; @param 2 Non-zero if there should be a locked version.
502; @param 3 The modified flags.
503; @param 4 The undefined flags.
504;
505%macro IEMIMPL_BIN_OP 4
506BEGINCODE
507BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
508 PROLOGUE_3_ARGS
509 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
510 %1 byte [A0], A1_8
511 IEM_SAVE_FLAGS A2, %3, %4
512 EPILOGUE_3_ARGS
513ENDPROC iemAImpl_ %+ %1 %+ _u8
514
515BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
516 PROLOGUE_3_ARGS
517 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
518 %1 word [A0], A1_16
519 IEM_SAVE_FLAGS A2, %3, %4
520 EPILOGUE_3_ARGS
521ENDPROC iemAImpl_ %+ %1 %+ _u16
522
523BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
524 PROLOGUE_3_ARGS
525 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
526 %1 dword [A0], A1_32
527 IEM_SAVE_FLAGS A2, %3, %4
528 EPILOGUE_3_ARGS
529ENDPROC iemAImpl_ %+ %1 %+ _u32
530
531 %ifdef RT_ARCH_AMD64
532BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
533 PROLOGUE_3_ARGS
534 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
535 %1 qword [A0], A1
536 IEM_SAVE_FLAGS A2, %3, %4
537 EPILOGUE_3_ARGS_EX 8
538ENDPROC iemAImpl_ %+ %1 %+ _u64
539 %endif ; RT_ARCH_AMD64
540
541 %if %2 != 0 ; locked versions requested?
542
543BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
544 PROLOGUE_3_ARGS
545 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
546 lock %1 byte [A0], A1_8
547 IEM_SAVE_FLAGS A2, %3, %4
548 EPILOGUE_3_ARGS
549ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
550
551BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
552 PROLOGUE_3_ARGS
553 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
554 lock %1 word [A0], A1_16
555 IEM_SAVE_FLAGS A2, %3, %4
556 EPILOGUE_3_ARGS
557ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
558
559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
560 PROLOGUE_3_ARGS
561 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
562 lock %1 dword [A0], A1_32
563 IEM_SAVE_FLAGS A2, %3, %4
564 EPILOGUE_3_ARGS
565ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
566
567 %ifdef RT_ARCH_AMD64
568BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
569 PROLOGUE_3_ARGS
570 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
571 lock %1 qword [A0], A1
572 IEM_SAVE_FLAGS A2, %3, %4
573 EPILOGUE_3_ARGS_EX 8
574ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
575 %endif ; RT_ARCH_AMD64
576 %endif ; locked
577%endmacro
578
579; instr,lock, modified-flags, undefined flags
580IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
581IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
582IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
583IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
584IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
585IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
586IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
587IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
588IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
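;
; Illustration only: the 'IEMIMPL_BIN_OP add, 1, ...' line above generates, for
; the 32-bit case, a function along these lines (plus the _u8, _u16, AMD64-only
; _u64 and, since the second parameter is non-zero, lock-prefixed *_locked
; siblings):
;
;   BEGINPROC_FASTCALL iemAImpl_add_u32, 12
;           PROLOGUE_3_ARGS
;           IEM_MAYBE_LOAD_FLAGS A2, <modified flags>, 0
;           add     dword [A0], A1_32
;           IEM_SAVE_FLAGS A2, <modified flags>, 0
;           EPILOGUE_3_ARGS
;   ENDPROC iemAImpl_add_u32
;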
589
590
591;;
592; Macro for implementing a binary operator, VEX variant with separate input/output.
593;
594; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
595; where the 64-bit accesses require hand coding.
596;
597; All the functions take a pointer to the destination memory operand in A0,
598; the first source register operand in A1, the second source register operand
599; in A2 and a pointer to eflags in A3.
600;
601; @param 1 The instruction mnemonic.
602; @param 2 The modified flags.
603; @param 3 The undefined flags.
604;
605%macro IEMIMPL_VEX_BIN_OP 3
606BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
607 PROLOGUE_4_ARGS
608 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
609 %1 T0_32, A1_32, A2_32
610 mov [A0], T0_32
611 IEM_SAVE_FLAGS A3, %2, %3
612 EPILOGUE_4_ARGS
613ENDPROC iemAImpl_ %+ %1 %+ _u32
614
615 %ifdef RT_ARCH_AMD64
616BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
617 PROLOGUE_4_ARGS
618 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
619 %1 T0, A1, A2
620 mov [A0], T0
621 IEM_SAVE_FLAGS A3, %2, %3
622 EPILOGUE_4_ARGS
623ENDPROC iemAImpl_ %+ %1 %+ _u64
624 %endif ; RT_ARCH_AMD64
625%endmacro
626
627; instr, modified-flags, undefined-flags
628IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
629IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
630IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
631
632;;
633; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
634;
635; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
636; where the 64-bit accesses require hand coding.
637;
638; All the functions take a pointer to the destination memory operand in A0,
639; the source register operand in A1 and a pointer to eflags in A2.
640;
641; @param 1 The instruction mnemonic.
642; @param 2 The modified flags.
643; @param 3 The undefined flags.
644;
645%macro IEMIMPL_VEX_BIN_OP_2 3
646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
647 PROLOGUE_4_ARGS
648 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
649 mov T0_32, [A0]
650 %1 T0_32, A1_32
651 mov [A0], T0_32
652 IEM_SAVE_FLAGS A2, %2, %3
653 EPILOGUE_4_ARGS
654ENDPROC iemAImpl_ %+ %1 %+ _u32
655
656 %ifdef RT_ARCH_AMD64
657BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
658 PROLOGUE_4_ARGS
659 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
660 mov T0, [A0]
661 %1 T0, A1
662 mov [A0], T0
663 IEM_SAVE_FLAGS A2, %2, %3
664 EPILOGUE_4_ARGS
665ENDPROC iemAImpl_ %+ %1 %+ _u64
666 %endif ; RT_ARCH_AMD64
667%endmacro
668
669; instr, modified-flags, undefined-flags
670IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
671IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
672IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
673
674
675;;
676; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
677;
678; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
679; where the 64-bit accesses require hand coding.
680;
681; All the functions take a pointer to the destination memory operand in A0,
682; the first source register operand in A1 and the second source register
683; operand in A2. There is no eflags pointer since no flags are modified.
684;
685; @param 1 The instruction mnemonic.
686; @param 2 Fallback instruction if applicable.
687; @param 3 Whether to emit fallback or not.
688;
689%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
690BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
691 PROLOGUE_3_ARGS
692 %1 T0_32, A1_32, A2_32
693 mov [A0], T0_32
694 EPILOGUE_3_ARGS
695ENDPROC iemAImpl_ %+ %1 %+ _u32
696
697 %if %3
698BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
699 PROLOGUE_3_ARGS
700 %ifdef ASM_CALL64_GCC
701 mov cl, A2_8
702 %2 A1_32, cl
703 mov [A0], A1_32
704 %else
705 xchg A2, A0
706 %2 A1_32, cl
707 mov [A2], A1_32
708 %endif
709 EPILOGUE_3_ARGS
710ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
711 %endif
712
713 %ifdef RT_ARCH_AMD64
714BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
715 PROLOGUE_3_ARGS
716 %1 T0, A1, A2
717 mov [A0], T0
718 EPILOGUE_3_ARGS
719ENDPROC iemAImpl_ %+ %1 %+ _u64
720
721 %if %3
722BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
723 PROLOGUE_3_ARGS
724 %ifdef ASM_CALL64_GCC
725 mov cl, A2_8
726 %2 A1, cl
727 mov [A0], A1 ; store the full 64-bit result.
728 %else
729 xchg A2, A0
730 %2 A1, cl
731 mov [A2], A1 ; store the full 64-bit result (A2 holds the destination pointer after the xchg).
732 %endif
734 EPILOGUE_3_ARGS
735ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
736 %endif
737 %endif ; RT_ARCH_AMD64
738%endmacro
739
740; instr, fallback instr, emit fallback
741IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
742IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
743IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
744IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
745IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
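;
; Illustration only: the *_fallback functions generated above (presumably for
; hosts lacking the BMI2 instructions) emulate the flag-less VEX shifts with
; the legacy forms. E.g. the body of iemAImpl_shlx_u32_fallback under the SysV
; AMD64 convention is simply:
;
;       mov     cl, A2_8        ; shift count into CL (A2 is rdx here)
;       shl     A1_32, cl       ; legacy SHL does the work...
;       mov     [A0], A1_32     ; ...and the EFLAGS it clobbers are simply not saved.
;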
746
747
748;
749; RORX uses an immediate byte for the shift count, so we only do a
750; fallback implementation of that one.
751;
752BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
753 PROLOGUE_3_ARGS
754 %ifdef ASM_CALL64_GCC
755 mov cl, A2_8
756 ror A1_32, cl
757 mov [A0], A1_32
758 %else
759 xchg A2, A0
760 ror A1_32, cl
761 mov [A2], A1_32
762 %endif
763 EPILOGUE_3_ARGS
764ENDPROC iemAImpl_rorx_u32
765
766 %ifdef RT_ARCH_AMD64
767BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
768 PROLOGUE_3_ARGS
769 %ifdef ASM_CALL64_GCC
770 mov cl, A2_8
771 ror A1, cl
772 mov [A0], A1
773 %else
774 xchg A2, A0
775 ror A1, cl
776 mov [A2], A1
777 %endif
778 EPILOGUE_3_ARGS
779ENDPROC iemAImpl_rorx_u64
780 %endif ; RT_ARCH_AMD64
781
782
783;
784; MULX
785;
786BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
787 PROLOGUE_4_ARGS
788%ifdef ASM_CALL64_GCC
789 ; A2_32 is EDX - perfect
790 mulx T0_32, T1_32, A3_32
791 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
792 mov [A0], T0_32
793%else
794 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
795 xchg A1, A2
796 mulx T0_32, T1_32, A3_32
797 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
798 mov [A0], T0_32
799%endif
800 EPILOGUE_4_ARGS
801ENDPROC iemAImpl_mulx_u32
802
803
804BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
805 PROLOGUE_4_ARGS
806%ifdef ASM_CALL64_GCC
807 ; A2_32 is EDX, T0_32 is EAX
808 mov eax, A3_32
809 mul A2_32
810 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
811 mov [A0], edx
812%else
813 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
814 xchg A1, A2
815 mov eax, A3_32
816 mul A2_32
817 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
818 mov [A0], edx
819%endif
820 EPILOGUE_4_ARGS
821ENDPROC iemAImpl_mulx_u32_fallback
822
823%ifdef RT_ARCH_AMD64
824BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
825 PROLOGUE_4_ARGS
826%ifdef ASM_CALL64_GCC
827 ; A2 is RDX - perfect
828 mulx T0, T1, A3
829 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
830 mov [A0], T0
831%else
832 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
833 xchg A1, A2
834 mulx T0, T1, A3
835 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
836 mov [A0], T0
837%endif
838 EPILOGUE_4_ARGS
839ENDPROC iemAImpl_mulx_u64
840
841
842BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
843 PROLOGUE_4_ARGS
844%ifdef ASM_CALL64_GCC
845 ; A2 is RDX, T0 is RAX
846 mov rax, A3
847 mul A2
848 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
849 mov [A0], rdx
850%else
851 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
852 xchg A1, A2
853 mov rax, A3
854 mul A2
855 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
856 mov [A0], rdx
857%endif
858 EPILOGUE_4_ARGS
859ENDPROC iemAImpl_mulx_u64_fallback
860
861%endif
862
863
864;;
865; Macro for implementing a bit operator.
866;
867; This will generate code for the 16, 32 and 64 bit accesses with locked
868; variants, except on 32-bit systems where the 64-bit accesses require hand
869; coding.
870;
871; All the functions take a pointer to the destination memory operand in A0,
872; the source register operand in A1 and a pointer to eflags in A2.
873;
874; @param 1 The instruction mnemonic.
875; @param 2 Non-zero if there should be a locked version.
876; @param 3 The modified flags.
877; @param 4 The undefined flags.
878;
879%macro IEMIMPL_BIT_OP 4
880BEGINCODE
881BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
882 PROLOGUE_3_ARGS
883 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
884 %1 word [A0], A1_16
885 IEM_SAVE_FLAGS A2, %3, %4
886 EPILOGUE_3_ARGS
887ENDPROC iemAImpl_ %+ %1 %+ _u16
888
889BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
890 PROLOGUE_3_ARGS
891 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
892 %1 dword [A0], A1_32
893 IEM_SAVE_FLAGS A2, %3, %4
894 EPILOGUE_3_ARGS
895ENDPROC iemAImpl_ %+ %1 %+ _u32
896
897 %ifdef RT_ARCH_AMD64
898BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
899 PROLOGUE_3_ARGS
900 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
901 %1 qword [A0], A1
902 IEM_SAVE_FLAGS A2, %3, %4
903 EPILOGUE_3_ARGS_EX 8
904ENDPROC iemAImpl_ %+ %1 %+ _u64
905 %endif ; RT_ARCH_AMD64
906
907 %if %2 != 0 ; locked versions requested?
908
909BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
910 PROLOGUE_3_ARGS
911 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
912 lock %1 word [A0], A1_16
913 IEM_SAVE_FLAGS A2, %3, %4
914 EPILOGUE_3_ARGS
915ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
916
917BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
918 PROLOGUE_3_ARGS
919 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
920 lock %1 dword [A0], A1_32
921 IEM_SAVE_FLAGS A2, %3, %4
922 EPILOGUE_3_ARGS
923ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
924
925 %ifdef RT_ARCH_AMD64
926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
927 PROLOGUE_3_ARGS
928 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
929 lock %1 qword [A0], A1
930 IEM_SAVE_FLAGS A2, %3, %4
931 EPILOGUE_3_ARGS_EX 8
932ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
933 %endif ; RT_ARCH_AMD64
934 %endif ; locked
935%endmacro
936IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
937IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
938IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
939IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
940
941;;
942; Macro for implementing a bit search operator.
943;
944; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
945; systems where the 64-bit accesses require hand coding.
946;
947; All the functions take a pointer to the destination memory operand in A0,
948; the source register operand in A1 and a pointer to eflags in A2.
949;
950; In the ZF case the destination register is 'undefined', however it seems that
951; both AMD and Intel just leave it as is. The undefined EFLAGS differ between
952; AMD and Intel and, according to https://www.sandpile.org/x86/flags.htm, between
953; Intel microarchitectures. We only implement the 'intel' and 'amd' variations with
954; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
955;
956; @param 1 The instruction mnemonic.
957; @param 2 The modified flags.
958; @param 3 The undefined flags.
959; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
960;
961%macro IEMIMPL_BIT_OP2 4
962BEGINCODE
963BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
964 PROLOGUE_3_ARGS
965 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
966 %1 T0_16, A1_16
967%if %4 != 0
968 jz .unchanged_dst
969%endif
970 mov [A0], T0_16
971.unchanged_dst:
972 IEM_SAVE_FLAGS A2, %2, %3
973 EPILOGUE_3_ARGS
974ENDPROC iemAImpl_ %+ %1 %+ _u16
975
976BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
977 PROLOGUE_3_ARGS
978 %1 T1_16, A1_16
979%if %4 != 0
980 jz .unchanged_dst
981%endif
982 mov [A0], T1_16
983 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
984 EPILOGUE_3_ARGS
985.unchanged_dst:
986 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
987 EPILOGUE_3_ARGS
988ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
989
990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
991 PROLOGUE_3_ARGS
992 %1 T0_16, A1_16
993%if %4 != 0
994 jz .unchanged_dst
995%endif
996 mov [A0], T0_16
997.unchanged_dst:
998 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
999 EPILOGUE_3_ARGS
1000ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1001
1002
1003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1004 PROLOGUE_3_ARGS
1005 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1006 %1 T0_32, A1_32
1007%if %4 != 0
1008 jz .unchanged_dst
1009%endif
1010 mov [A0], T0_32
1011.unchanged_dst:
1012 IEM_SAVE_FLAGS A2, %2, %3
1013 EPILOGUE_3_ARGS
1014ENDPROC iemAImpl_ %+ %1 %+ _u32
1015
1016BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1017 PROLOGUE_3_ARGS
1018 %1 T1_32, A1_32
1019%if %4 != 0
1020 jz .unchanged_dst
1021%endif
1022 mov [A0], T1_32
1023 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1024 EPILOGUE_3_ARGS
1025.unchanged_dst:
1026 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1027 EPILOGUE_3_ARGS
1028ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1029
1030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1031 PROLOGUE_3_ARGS
1032 %1 T0_32, A1_32
1033%if %4 != 0
1034 jz .unchanged_dst
1035%endif
1036 mov [A0], T0_32
1037.unchanged_dst:
1038 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1039 EPILOGUE_3_ARGS
1040ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1041
1042
1043 %ifdef RT_ARCH_AMD64
1044
1045BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1046 PROLOGUE_3_ARGS
1047 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1048 %1 T0, A1
1049%if %4 != 0
1050 jz .unchanged_dst
1051%endif
1052 mov [A0], T0
1053.unchanged_dst:
1054 IEM_SAVE_FLAGS A2, %2, %3
1055 EPILOGUE_3_ARGS_EX 8
1056ENDPROC iemAImpl_ %+ %1 %+ _u64
1057
1058BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1059 PROLOGUE_3_ARGS
1060 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1061 %1 T1, A1
1062%if %4 != 0
1063 jz .unchanged_dst
1064%endif
1065 mov [A0], T1
1066 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1067 EPILOGUE_3_ARGS
1068.unchanged_dst:
1069 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1070 EPILOGUE_3_ARGS
1071ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1072
1073BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1074 PROLOGUE_3_ARGS
1075 %1 T0, A1
1076%if %4 != 0
1077 jz .unchanged_dst
1078%endif
1079 mov [A0], T0
1080.unchanged_dst:
1081 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1082 EPILOGUE_3_ARGS_EX 8
1083ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1084
1085 %endif ; RT_ARCH_AMD64
1086%endmacro
1087
1088IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1089IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1090IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1091IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
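;
; Note on the instantiations above: each produces three EFLAGS variants per
; operand width, e.g. for 32-bit BSF:
;   iemAImpl_bsf_u32        - the host's native flag behaviour
;   iemAImpl_bsf_u32_intel  - recent Intel behaviour (ZF/PF set from the result, the rest cleared)
;   iemAImpl_bsf_u32_amd    - recent AMD behaviour (only ZF is modified)
;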
1092
1093
1094;;
1095; Macro for implementing POPCNT.
1096;
1097; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1098; systems where the 64-bit accesses require hand coding.
1099;
1100; All the functions take a pointer to the destination memory operand in A0,
1101; the source register operand in A1 and a pointer to eflags in A2.
1102;
1103; ASSUMES Intel and AMD set EFLAGS the same way.
1104;
1105; ASSUMES the instruction does not support memory destination.
1106;
1107; @param 1 The instruction mnemonic.
1108; @param 2 The modified flags.
1109; @param 3 The undefined flags.
1110;
1111%macro IEMIMPL_BIT_OP3 3
1112BEGINCODE
1113BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1114 PROLOGUE_3_ARGS
1115 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1116 %1 T0_16, A1_16
1117 mov [A0], T0_16
1118 IEM_SAVE_FLAGS A2, %2, %3
1119 EPILOGUE_3_ARGS
1120ENDPROC iemAImpl_ %+ %1 %+ _u16
1121
1122BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1123 PROLOGUE_3_ARGS
1124 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1125 %1 T0_32, A1_32
1126 mov [A0], T0_32
1127 IEM_SAVE_FLAGS A2, %2, %3
1128 EPILOGUE_3_ARGS
1129ENDPROC iemAImpl_ %+ %1 %+ _u32
1130
1131 %ifdef RT_ARCH_AMD64
1132BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1133 PROLOGUE_3_ARGS
1134 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1135 %1 T0, A1
1136 mov [A0], T0
1137 IEM_SAVE_FLAGS A2, %2, %3
1138 EPILOGUE_3_ARGS_EX 8
1139ENDPROC iemAImpl_ %+ %1 %+ _u64
1140 %endif ; RT_ARCH_AMD64
1141%endmacro
1142IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1143
1144
1145;
1146; IMUL is a similar yet different case (no lock, no mem dst).
1147; The rDX:rAX variant of imul is handled together with mul further down.
1148;
1149BEGINCODE
1150; @param 1 EFLAGS that are modified.
1151; @param 2 Undefined EFLAGS.
1152; @param 3 Function suffix.
1153; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1154; 2 for AMD (set AF, clear PF, ZF and SF).
1155%macro IEMIMPL_IMUL_TWO 4
1156BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1157 PROLOGUE_3_ARGS
1158 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1159 imul A1_16, word [A0]
1160 mov [A0], A1_16
1161 %if %4 != 1
1162 IEM_SAVE_FLAGS A2, %1, %2
1163 %else
1164 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1165 %endif
1166 EPILOGUE_3_ARGS
1167ENDPROC iemAImpl_imul_two_u16 %+ %3
1168
1169BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1170 PROLOGUE_3_ARGS
1171 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1172 imul A1_32, dword [A0]
1173 mov [A0], A1_32
1174 %if %4 != 1
1175 IEM_SAVE_FLAGS A2, %1, %2
1176 %else
1177 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1178 %endif
1179 EPILOGUE_3_ARGS
1180ENDPROC iemAImpl_imul_two_u32 %+ %3
1181
1182 %ifdef RT_ARCH_AMD64
1183BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1184 PROLOGUE_3_ARGS
1185 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1186 imul A1, qword [A0]
1187 mov [A0], A1
1188 %if %4 != 1
1189 IEM_SAVE_FLAGS A2, %1, %2
1190 %else
1191 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1192 %endif
1193 EPILOGUE_3_ARGS_EX 8
1194ENDPROC iemAImpl_imul_two_u64 %+ %3
1195 %endif ; RT_ARCH_AMD64
1196%endmacro
1197IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1198IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1199IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1200
1201
1202;
1203; XCHG for memory operands. This implies locking. No flag changes.
1204;
1205; Each function takes two arguments, first the pointer to the memory,
1206; then the pointer to the register. They all return void.
1207;
1208BEGINCODE
1209BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1210 PROLOGUE_2_ARGS
1211 mov T0_8, [A1]
1212 xchg [A0], T0_8
1213 mov [A1], T0_8
1214 EPILOGUE_2_ARGS
1215ENDPROC iemAImpl_xchg_u8_locked
1216
1217BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1218 PROLOGUE_2_ARGS
1219 mov T0_16, [A1]
1220 xchg [A0], T0_16
1221 mov [A1], T0_16
1222 EPILOGUE_2_ARGS
1223ENDPROC iemAImpl_xchg_u16_locked
1224
1225BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1226 PROLOGUE_2_ARGS
1227 mov T0_32, [A1]
1228 xchg [A0], T0_32
1229 mov [A1], T0_32
1230 EPILOGUE_2_ARGS
1231ENDPROC iemAImpl_xchg_u32_locked
1232
1233%ifdef RT_ARCH_AMD64
1234BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1235 PROLOGUE_2_ARGS
1236 mov T0, [A1]
1237 xchg [A0], T0
1238 mov [A1], T0
1239 EPILOGUE_2_ARGS
1240ENDPROC iemAImpl_xchg_u64_locked
1241%endif
1242
1243; Unlocked variants for fDisregardLock mode.
1244
1245BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1246 PROLOGUE_2_ARGS
1247 mov T0_8, [A1]
1248 mov T1_8, [A0]
1249 mov [A0], T0_8
1250 mov [A1], T1_8
1251 EPILOGUE_2_ARGS
1252ENDPROC iemAImpl_xchg_u8_unlocked
1253
1254BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1255 PROLOGUE_2_ARGS
1256 mov T0_16, [A1]
1257 mov T1_16, [A0]
1258 mov [A0], T0_16
1259 mov [A1], T1_16
1260 EPILOGUE_2_ARGS
1261ENDPROC iemAImpl_xchg_u16_unlocked
1262
1263BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1264 PROLOGUE_2_ARGS
1265 mov T0_32, [A1]
1266 mov T1_32, [A0]
1267 mov [A0], T0_32
1268 mov [A1], T1_32
1269 EPILOGUE_2_ARGS
1270ENDPROC iemAImpl_xchg_u32_unlocked
1271
1272%ifdef RT_ARCH_AMD64
1273BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1274 PROLOGUE_2_ARGS
1275 mov T0, [A1]
1276 mov T1, [A0]
1277 mov [A0], T0
1278 mov [A1], T1
1279 EPILOGUE_2_ARGS
1280ENDPROC iemAImpl_xchg_u64_unlocked
1281%endif
1282
1283
1284;
1285; XADD for memory operands.
1286;
1287; Each function takes three arguments, first the pointer to the
1288; memory/register, then the pointer to the register, and finally a pointer to
1289; eflags. They all return void.
1290;
1291BEGINCODE
1292BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1293 PROLOGUE_3_ARGS
1294 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1295 mov T0_8, [A1]
1296 xadd [A0], T0_8
1297 mov [A1], T0_8
1298 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1299 EPILOGUE_3_ARGS
1300ENDPROC iemAImpl_xadd_u8
1301
1302BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1303 PROLOGUE_3_ARGS
1304 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1305 mov T0_16, [A1]
1306 xadd [A0], T0_16
1307 mov [A1], T0_16
1308 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1309 EPILOGUE_3_ARGS
1310ENDPROC iemAImpl_xadd_u16
1311
1312BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1313 PROLOGUE_3_ARGS
1314 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1315 mov T0_32, [A1]
1316 xadd [A0], T0_32
1317 mov [A1], T0_32
1318 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1319 EPILOGUE_3_ARGS
1320ENDPROC iemAImpl_xadd_u32
1321
1322%ifdef RT_ARCH_AMD64
1323BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1324 PROLOGUE_3_ARGS
1325 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1326 mov T0, [A1]
1327 xadd [A0], T0
1328 mov [A1], T0
1329 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1330 EPILOGUE_3_ARGS
1331ENDPROC iemAImpl_xadd_u64
1332%endif ; RT_ARCH_AMD64
1333
1334BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1335 PROLOGUE_3_ARGS
1336 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1337 mov T0_8, [A1]
1338 lock xadd [A0], T0_8
1339 mov [A1], T0_8
1340 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1341 EPILOGUE_3_ARGS
1342ENDPROC iemAImpl_xadd_u8_locked
1343
1344BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1345 PROLOGUE_3_ARGS
1346 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1347 mov T0_16, [A1]
1348 lock xadd [A0], T0_16
1349 mov [A1], T0_16
1350 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1351 EPILOGUE_3_ARGS
1352ENDPROC iemAImpl_xadd_u16_locked
1353
1354BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1355 PROLOGUE_3_ARGS
1356 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1357 mov T0_32, [A1]
1358 lock xadd [A0], T0_32
1359 mov [A1], T0_32
1360 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1361 EPILOGUE_3_ARGS
1362ENDPROC iemAImpl_xadd_u32_locked
1363
1364%ifdef RT_ARCH_AMD64
1365BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1366 PROLOGUE_3_ARGS
1367 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1368 mov T0, [A1]
1369 lock xadd [A0], T0
1370 mov [A1], T0
1371 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1372 EPILOGUE_3_ARGS
1373ENDPROC iemAImpl_xadd_u64_locked
1374%endif ; RT_ARCH_AMD64
1375
1376
1377;
1378; CMPXCHG8B.
1379;
1380; These are tricky register wise, so the code is duplicated for each calling
1381; convention.
1382;
1383; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1384;
1385; C-proto:
1386; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1387; uint32_t *pEFlags));
1388;
1389; Note! Identical to iemAImpl_cmpxchg16b.
1390;
1391BEGINCODE
1392BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1393%ifdef RT_ARCH_AMD64
1394 %ifdef ASM_CALL64_MSC
1395 push rbx
1396
1397 mov r11, rdx ; pu64EaxEdx (is also T1)
1398 mov r10, rcx ; pu64Dst
1399
1400 mov ebx, [r8]
1401 mov ecx, [r8 + 4]
1402 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1403 mov eax, [r11]
1404 mov edx, [r11 + 4]
1405
1406 lock cmpxchg8b [r10]
1407
1408 mov [r11], eax
1409 mov [r11 + 4], edx
1410 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1411
1412 pop rbx
1413 ret
1414 %else
1415 push rbx
1416
1417 mov r10, rcx ; pEFlags
1418 mov r11, rdx ; pu64EbxEcx (is also T1)
1419
1420 mov ebx, [r11]
1421 mov ecx, [r11 + 4]
1422 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1423 mov eax, [rsi]
1424 mov edx, [rsi + 4]
1425
1426 lock cmpxchg8b [rdi]
1427
1428 mov [rsi], eax
1429 mov [rsi + 4], edx
1430 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1431
1432 pop rbx
1433 ret
1434
1435 %endif
1436%else
1437 push esi
1438 push edi
1439 push ebx
1440 push ebp
1441
1442 mov edi, ecx ; pu64Dst
1443 mov esi, edx ; pu64EaxEdx
1444 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1445 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1446
1447 mov ebx, [ecx]
1448 mov ecx, [ecx + 4]
1449 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1450 mov eax, [esi]
1451 mov edx, [esi + 4]
1452
1453 lock cmpxchg8b [edi]
1454
1455 mov [esi], eax
1456 mov [esi + 4], edx
1457 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1458
1459 pop ebp
1460 pop ebx
1461 pop edi
1462 pop esi
1463 ret 8
1464%endif
1465ENDPROC iemAImpl_cmpxchg8b
1466
1467BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1468 ; Lazy bird always lock prefixes cmpxchg8b.
1469 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1470ENDPROC iemAImpl_cmpxchg8b_locked
1471
1472%ifdef RT_ARCH_AMD64
1473
1474;
1475; CMPXCHG16B.
1476;
1477; These are tricky register wise, so the code is duplicated for each calling
1478; convention.
1479;
1480; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1481;
1482; C-proto:
1483; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1484; uint32_t *pEFlags));
1485;
1486; Note! Identical to iemAImpl_cmpxchg8b.
1487;
1488BEGINCODE
1489BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1490 %ifdef ASM_CALL64_MSC
1491 push rbx
1492
1493 mov r11, rdx ; pu64RaxRdx (is also T1)
1494 mov r10, rcx ; pu64Dst
1495
1496 mov rbx, [r8]
1497 mov rcx, [r8 + 8]
1498 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1499 mov rax, [r11]
1500 mov rdx, [r11 + 8]
1501
1502 lock cmpxchg16b [r10]
1503
1504 mov [r11], rax
1505 mov [r11 + 8], rdx
1506 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1507
1508 pop rbx
1509 ret
1510 %else
1511 push rbx
1512
1513 mov r10, rcx ; pEFlags
1514 mov r11, rdx ; pu64RbxRcx (is also T1)
1515
1516 mov rbx, [r11]
1517 mov rcx, [r11 + 8]
1518 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1519 mov rax, [rsi]
1520 mov rdx, [rsi + 8]
1521
1522 lock cmpxchg16b [rdi]
1523
1524 mov [rsi], rax
1525 mov [rsi + 8], rdx
1526 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1527
1528 pop rbx
1529 ret
1530
1531 %endif
1532ENDPROC iemAImpl_cmpxchg16b
1533
1534BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1535 ; Lazy bird always lock prefixes cmpxchg16b.
1536 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1537ENDPROC iemAImpl_cmpxchg16b_locked
1538
1539%endif ; RT_ARCH_AMD64
1540
1541
1542;
1543; CMPXCHG.
1544;
1545; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1546;
1547; C-proto:
1548; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1549;
1550BEGINCODE
1551%macro IEMIMPL_CMPXCHG 2
1552BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1553 PROLOGUE_4_ARGS
1554 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1555 mov al, [A1]
1556 %1 cmpxchg [A0], A2_8
1557 mov [A1], al
1558 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1559 EPILOGUE_4_ARGS
1560ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1561
1562BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1563 PROLOGUE_4_ARGS
1564 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1565 mov ax, [A1]
1566 %1 cmpxchg [A0], A2_16
1567 mov [A1], ax
1568 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1569 EPILOGUE_4_ARGS
1570ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1571
1572BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1573 PROLOGUE_4_ARGS
1574 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1575 mov eax, [A1]
1576 %1 cmpxchg [A0], A2_32
1577 mov [A1], eax
1578 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1579 EPILOGUE_4_ARGS
1580ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1581
1582BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1583%ifdef RT_ARCH_AMD64
1584 PROLOGUE_4_ARGS
1585 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1586 mov rax, [A1]
1587 %1 cmpxchg [A0], A2
1588 mov [A1], rax
1589 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1590 EPILOGUE_4_ARGS
1591%else
1592 ;
1593 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1594 ;
1595 push esi
1596 push edi
1597 push ebx
1598 push ebp
1599
1600 mov edi, ecx ; pu64Dst
1601 mov esi, edx ; pu64Rax
1602 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1603 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1604
1605 mov ebx, [ecx]
1606 mov ecx, [ecx + 4]
1607 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1608 mov eax, [esi]
1609 mov edx, [esi + 4]
1610
1611 lock cmpxchg8b [edi]
1612
1613 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1614 jz .cmpxchg8b_not_equal
1615 cmp eax, eax ; just set the other flags.
1616.store:
1617 mov [esi], eax
1618 mov [esi + 4], edx
1619 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1620
1621 pop ebp
1622 pop ebx
1623 pop edi
1624 pop esi
1625 ret 8
1626
1627.cmpxchg8b_not_equal:
1628 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1629 jne .store
1630 cmp [esi], eax
1631 jmp .store
1632
1633%endif
1634ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1635%endmacro ; IEMIMPL_CMPXCHG
1636
1637IEMIMPL_CMPXCHG , ,
1638IEMIMPL_CMPXCHG lock, _locked
1639
1640;;
1641; Macro for implementing a unary operator.
1642;
1643; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1644; variants, except on 32-bit systems where the 64-bit accesses require hand
1645; coding.
1646;
1647; All the functions take a pointer to the destination memory operand in A0
1648; and a pointer to eflags in A1.
1649;
1650; @param 1 The instruction mnemonic.
1651; @param 2 The modified flags.
1652; @param 3 The undefined flags.
1653;
1654%macro IEMIMPL_UNARY_OP 3
1655BEGINCODE
1656BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1657 PROLOGUE_2_ARGS
1658 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1659 %1 byte [A0]
1660 IEM_SAVE_FLAGS A1, %2, %3
1661 EPILOGUE_2_ARGS
1662ENDPROC iemAImpl_ %+ %1 %+ _u8
1663
1664BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1665 PROLOGUE_2_ARGS
1666 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1667 lock %1 byte [A0]
1668 IEM_SAVE_FLAGS A1, %2, %3
1669 EPILOGUE_2_ARGS
1670ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1671
1672BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1673 PROLOGUE_2_ARGS
1674 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1675 %1 word [A0]
1676 IEM_SAVE_FLAGS A1, %2, %3
1677 EPILOGUE_2_ARGS
1678ENDPROC iemAImpl_ %+ %1 %+ _u16
1679
1680BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1681 PROLOGUE_2_ARGS
1682 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1683 lock %1 word [A0]
1684 IEM_SAVE_FLAGS A1, %2, %3
1685 EPILOGUE_2_ARGS
1686ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1687
1688BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1689 PROLOGUE_2_ARGS
1690 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1691 %1 dword [A0]
1692 IEM_SAVE_FLAGS A1, %2, %3
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_ %+ %1 %+ _u32
1695
1696BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1697 PROLOGUE_2_ARGS
1698 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1699 lock %1 dword [A0]
1700 IEM_SAVE_FLAGS A1, %2, %3
1701 EPILOGUE_2_ARGS
1702ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1703
1704 %ifdef RT_ARCH_AMD64
1705BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1706 PROLOGUE_2_ARGS
1707 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1708 %1 qword [A0]
1709 IEM_SAVE_FLAGS A1, %2, %3
1710 EPILOGUE_2_ARGS
1711ENDPROC iemAImpl_ %+ %1 %+ _u64
1712
1713BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1714 PROLOGUE_2_ARGS
1715 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1716 lock %1 qword [A0]
1717 IEM_SAVE_FLAGS A1, %2, %3
1718 EPILOGUE_2_ARGS
1719ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1720 %endif ; RT_ARCH_AMD64
1721
1722%endmacro
1723
1724IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1725IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1726IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1727IEMIMPL_UNARY_OP not, 0, 0
1728
1729
1730;
1731; BSWAP. No flag changes.
1732;
1733; Each function takes one argument, pointer to the value to bswap
1734; (input/output). They all return void.
1735;
1736BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1737 PROLOGUE_1_ARGS
1738 mov T0_32, [A0] ; just in case any of the upper bits are used.
1739 db 66h
1740 bswap T0_32
1741 mov [A0], T0_32
1742 EPILOGUE_1_ARGS
1743ENDPROC iemAImpl_bswap_u16
1744
1745BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1746 PROLOGUE_1_ARGS
1747 mov T0_32, [A0]
1748 bswap T0_32
1749 mov [A0], T0_32
1750 EPILOGUE_1_ARGS
1751ENDPROC iemAImpl_bswap_u32
1752
1753BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1754%ifdef RT_ARCH_AMD64
1755 PROLOGUE_1_ARGS
1756 mov T0, [A0]
1757 bswap T0
1758 mov [A0], T0
1759 EPILOGUE_1_ARGS
1760%else
1761 PROLOGUE_1_ARGS
1762 mov T0, [A0]
1763 mov T1, [A0 + 4]
1764 bswap T0
1765 bswap T1
1766 mov [A0 + 4], T0
1767 mov [A0], T1
1768 EPILOGUE_1_ARGS
1769%endif
1770ENDPROC iemAImpl_bswap_u64
1771
1772
1773;;
1774; Macro for implementing a shift operation.
1775;
1776; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1777; 32-bit systems where the 64-bit accesses require hand coding.
1778;
1779; All the functions take a pointer to the destination memory operand in A0,
1780; the shift count in A1 and a pointer to eflags in A2.
1781;
1782; @param 1 The instruction mnemonic.
1783; @param 2 The modified flags.
1784; @param 3 The undefined flags.
1785;
1786; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1787;
1788; @note the _intel and _amd variants are implemented in C.
1789;
1790%macro IEMIMPL_SHIFT_OP 3
1791BEGINCODE
1792BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1793 PROLOGUE_3_ARGS
1794 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1795 %ifdef ASM_CALL64_GCC
1796 mov cl, A1_8
1797 %1 byte [A0], cl
1798 %else
1799 xchg A1, A0
1800 %1 byte [A1], cl
1801 %endif
1802 IEM_SAVE_FLAGS A2, %2, %3
1803 EPILOGUE_3_ARGS
1804ENDPROC iemAImpl_ %+ %1 %+ _u8
1805
1806BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1807 PROLOGUE_3_ARGS
1808 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1809 %ifdef ASM_CALL64_GCC
1810 mov cl, A1_8
1811 %1 word [A0], cl
1812 %else
1813 xchg A1, A0
1814 %1 word [A1], cl
1815 %endif
1816 IEM_SAVE_FLAGS A2, %2, %3
1817 EPILOGUE_3_ARGS
1818ENDPROC iemAImpl_ %+ %1 %+ _u16
1819
1820BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1821 PROLOGUE_3_ARGS
1822 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1823 %ifdef ASM_CALL64_GCC
1824 mov cl, A1_8
1825 %1 dword [A0], cl
1826 %else
1827 xchg A1, A0
1828 %1 dword [A1], cl
1829 %endif
1830 IEM_SAVE_FLAGS A2, %2, %3
1831 EPILOGUE_3_ARGS
1832ENDPROC iemAImpl_ %+ %1 %+ _u32
1833
1834 %ifdef RT_ARCH_AMD64
1835BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1836 PROLOGUE_3_ARGS
1837 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1838 %ifdef ASM_CALL64_GCC
1839 mov cl, A1_8
1840 %1 qword [A0], cl
1841 %else
1842 xchg A1, A0
1843 %1 qword [A1], cl
1844 %endif
1845 IEM_SAVE_FLAGS A2, %2, %3
1846 EPILOGUE_3_ARGS
1847ENDPROC iemAImpl_ %+ %1 %+ _u64
1848 %endif ; RT_ARCH_AMD64
1849
1850%endmacro
1851
1852IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1853IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1854IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1855IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1856IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1857IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1858IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1859
1860
1861;;
1862; Macro for implementing a double precision shift operation.
1863;
1864; This will generate code for the 16, 32 and 64 bit accesses, except on
1865; 32-bit systems where the 64-bit accesses require hand coding.
1866;
1867; The functions take the destination operand (r/m) in A0, the source (reg) in
1868; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1869;
1870; @param 1 The instruction mnemonic.
1871; @param 2 The modified flags.
1872; @param 3 The undefined flags.
1873;
1874; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1875;
1876; @note the _intel and _amd variants are implemented in C.
1877;
1878%macro IEMIMPL_SHIFT_DBL_OP 3
1879BEGINCODE
1880BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1881 PROLOGUE_4_ARGS
1882 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1883 %ifdef ASM_CALL64_GCC
1884 xchg A3, A2
1885 %1 [A0], A1_16, cl
1886 xchg A3, A2
1887 %else
1888 xchg A0, A2
1889 %1 [A2], A1_16, cl
1890 %endif
1891 IEM_SAVE_FLAGS A3, %2, %3
1892 EPILOGUE_4_ARGS
1893ENDPROC iemAImpl_ %+ %1 %+ _u16
1894
1895BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1896 PROLOGUE_4_ARGS
1897 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1898 %ifdef ASM_CALL64_GCC
1899 xchg A3, A2
1900 %1 [A0], A1_32, cl
1901 xchg A3, A2
1902 %else
1903 xchg A0, A2
1904 %1 [A2], A1_32, cl
1905 %endif
1906 IEM_SAVE_FLAGS A3, %2, %3
1907 EPILOGUE_4_ARGS
1908ENDPROC iemAImpl_ %+ %1 %+ _u32
1909
1910 %ifdef RT_ARCH_AMD64
1911BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1912 PROLOGUE_4_ARGS
1913 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1914 %ifdef ASM_CALL64_GCC
1915 xchg A3, A2
1916 %1 [A0], A1, cl
1917 xchg A3, A2
1918 %else
1919 xchg A0, A2
1920 %1 [A2], A1, cl
1921 %endif
1922 IEM_SAVE_FLAGS A3, %2, %3
1923 EPILOGUE_4_ARGS_EX 12
1924ENDPROC iemAImpl_ %+ %1 %+ _u64
1925 %endif ; RT_ARCH_AMD64
1926
1927%endmacro
1928
1929IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1930IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1931
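; A note on the register juggling in IEMIMPL_SHIFT_DBL_OP: SHLD/SHRD only take
; the shift count in CL, but it arrives in A2.  In the GCC/AMD64 branch the
; macro therefore exchanges A2 with A3 (which has to alias RCX there for this
; to work), does the shift, and swaps back so IEM_SAVE_FLAGS still finds the
; eflags pointer in A3; sketched for the u16 case:
;
;               xchg    A3, A2                  ; count -> CL, eflags ptr parked in A2
;               shld    [A0], A1_16, cl
;               xchg    A3, A2                  ; restore A3 for IEM_SAVE_FLAGS
;
; In the other branch A0 aliases RCX, so A0 and A2 are exchanged instead and
; the destination is then addressed through A2.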
1932
1933;;
1934; Macro for implementing multiplication operations.
1935;
1936; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1937; 32-bit systems where the 64-bit accesses require hand coding.
1938;
1939; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1940; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1941; pointer to eflags in A3.
1942;
1943; The functions all return 0 so that the same caller code can be used for
1944; div/idiv as well as for the mul/imul implementation.
1945;
1946; @param 1 The instruction mnemonic.
1947; @param 2 The modified flags.
1948; @param 3 The undefined flags.
1949; @param 4 Name suffix.
1950; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1951;
1952; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1953;
1954%macro IEMIMPL_MUL_OP 5
1955BEGINCODE
1956BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1957 PROLOGUE_3_ARGS
1958 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1959 mov al, [A0]
1960 %1 A1_8
1961 mov [A0], ax
1962 %if %5 != 1
1963 IEM_SAVE_FLAGS A2, %2, %3
1964 %else
1965 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1966 %endif
1967 xor eax, eax
1968 EPILOGUE_3_ARGS
1969ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1970
1971BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1972 PROLOGUE_4_ARGS
1973 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1974 mov ax, [A0]
1975 %ifdef ASM_CALL64_GCC
1976 %1 A2_16
1977 mov [A0], ax
1978 mov [A1], dx
1979 %else
1980 mov T1, A1
1981 %1 A2_16
1982 mov [A0], ax
1983 mov [T1], dx
1984 %endif
1985 %if %5 != 1
1986 IEM_SAVE_FLAGS A3, %2, %3
1987 %else
1988 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1989 %endif
1990 xor eax, eax
1991 EPILOGUE_4_ARGS
1992ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1993
1994BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1995 PROLOGUE_4_ARGS
1996 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1997 mov eax, [A0]
1998 %ifdef ASM_CALL64_GCC
1999 %1 A2_32
2000 mov [A0], eax
2001 mov [A1], edx
2002 %else
2003 mov T1, A1
2004 %1 A2_32
2005 mov [A0], eax
2006 mov [T1], edx
2007 %endif
2008 %if %5 != 1
2009 IEM_SAVE_FLAGS A3, %2, %3
2010 %else
2011 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2012 %endif
2013 xor eax, eax
2014 EPILOGUE_4_ARGS
2015ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2016
2017 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2018BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2019 PROLOGUE_4_ARGS
2020 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2021 mov rax, [A0]
2022 %ifdef ASM_CALL64_GCC
2023 %1 A2
2024 mov [A0], rax
2025 mov [A1], rdx
2026 %else
2027 mov T1, A1
2028 %1 A2
2029 mov [A0], rax
2030 mov [T1], rdx
2031 %endif
2032 %if %5 != 1
2033 IEM_SAVE_FLAGS A3, %2, %3
2034 %else
2035 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2036 %endif
2037 xor eax, eax
2038 EPILOGUE_4_ARGS_EX 12
2039ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2040 %endif ; !RT_ARCH_AMD64
2041
2042%endmacro
2043
2044IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2045IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2046IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2047IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2048IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2049IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2050
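; Illustration (not assembled): the native u8 instantiation for 'mul' operates
; on the whole AX word through the single A0 pointer and roughly expands to:
;
;       BEGINPROC_FASTCALL iemAImpl_mul_u8, 12
;               PROLOGUE_3_ARGS
;               IEM_MAYBE_LOAD_FLAGS   A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
;               mov     al, [A0]                ; fetch the 8-bit multiplicand from *puAX
;               mul     A1_8                    ; AX = AL * operand
;               mov     [A0], ax                ; store the 16-bit product back into *puAX
;               IEM_SAVE_FLAGS         A2, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
;               xor     eax, eax                ; return 0 (shared convention with div/idiv)
;               EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_mul_u8
;
; The _intel variants use IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF instead, so the
; formally undefined flags come out the way Intel CPUs produce them.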
2051
2052BEGINCODE
2053;;
2054; Worker function for negating the 64-bit value held in the 32-bit register pair T1:T0.
2055; @uses None (T0,T1)
2056BEGINPROC iemAImpl_negate_T0_T1_u32
2057 push 0
2058 push 0
2059 xchg T0_32, [xSP]
2060 xchg T1_32, [xSP + xCB]
2061 sub T0_32, [xSP]
2062 sbb T1_32, [xSP + xCB]
2063 add xSP, xCB*2
2064 ret
2065ENDPROC iemAImpl_negate_T0_T1_u32
2066
2067%ifdef RT_ARCH_AMD64
2068;;
2069; Worker function for negating the 128-bit value held in the 64-bit register pair T1:T0.
2070; @uses None (T0,T1)
2071BEGINPROC iemAImpl_negate_T0_T1_u64
2072 push 0
2073 push 0
2074 xchg T0, [xSP]
2075 xchg T1, [xSP + xCB]
2076 sub T0, [xSP]
2077 sbb T1, [xSP + xCB]
2078 add xSP, xCB*2
2079 ret
2080ENDPROC iemAImpl_negate_T0_T1_u64
2081%endif
2082
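; Both helpers above compute the two's complement of the double-width value
; held in the T1:T0 register pair (T1 = high half, T0 = low half) as 0 - value,
; letting SBB carry the borrow from the low half into the high half.  The same
; idea without the stack shuffling would be (scratch register names purely
; illustrative - the real workers use the stack, presumably because no further
; scratch registers are free at the call sites):
;
;               xor     scratch_lo, scratch_lo
;               xor     scratch_hi, scratch_hi
;               sub     scratch_lo, T0          ; low  = 0 - low
;               sbb     scratch_hi, T1          ; high = 0 - high - borrow
;               mov     T0, scratch_lo
;               mov     T1, scratch_hi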
2083
2084;;
2085; Macro for implementing division operations.
2086;
2087; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2088; 32-bit systems where the 64-bit accesses require hand coding.
2089;
2090; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2091; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2092; pointer to eflags in A3.
2093;
2094; The functions all return 0 on success and -1 if a divide error should be
2095; raised by the caller.
2096;
2097; @param 1 The instruction mnemonic.
2098; @param 2 The modified flags.
2099; @param 3 The undefined flags.
2100; @param 4 1 if signed, 0 if unsigned.
2101; @param 5 Function suffix.
2102; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2103; 2 for AMD (set AF, clear PF, ZF and SF).
2104;
2105; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2106;
2107%macro IEMIMPL_DIV_OP 6
2108BEGINCODE
2109BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2110 PROLOGUE_3_ARGS
2111
2112 ; div by chainsaw check.
2113 test A1_8, A1_8
2114 jz .div_zero
2115
2116 ; Overflow check - unsigned division is simple to verify, but we haven't
2117 ; found a simple way to check signed division yet, unfortunately.
2118 %if %4 == 0
2119 cmp [A0 + 1], A1_8
2120 jae .div_overflow
2121 %else
2122 mov T0_16, [A0] ; T0 = dividend
2123 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2124 test A1_8, A1_8
2125 js .divisor_negative
2126 test T0_16, T0_16
2127 jns .both_positive
2128 neg T0_16
2129.one_of_each: ; OK range for |dividend| is |divisor| * 2^(result-width - 1) + (divisor - 1).
2130 push T0 ; Start off like unsigned below.
2131 shr T0_16, 7
2132 cmp T0_8, A1_8
2133 pop T0
2134 jb .div_no_overflow
2135 ja .div_overflow
2136 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2137 cmp T0_8, A1_8
2138 jae .div_overflow
2139 jmp .div_no_overflow
2140
2141.divisor_negative:
2142 neg A1_8
2143 test T0_16, T0_16
2144 jns .one_of_each
2145 neg T0_16
2146.both_positive: ; Same as unsigned shifted by sign indicator bit.
2147 shr T0_16, 7
2148 cmp T0_8, A1_8
2149 jae .div_overflow
2150.div_no_overflow:
2151 mov A1, T1 ; restore divisor
2152 %endif
2153
2154 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2155 mov ax, [A0]
2156 %1 A1_8
2157 mov [A0], ax
2158 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2159 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2160 %else
2161 IEM_SAVE_FLAGS A2, %2, %3
2162 %endif
2163 xor eax, eax
2164
2165.return:
2166 EPILOGUE_3_ARGS
2167
2168.div_zero:
2169.div_overflow:
2170 mov eax, -1
2171 jmp .return
2172ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2173
2174BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2175 PROLOGUE_4_ARGS
2176
2177 ; div by chainsaw check.
2178 test A2_16, A2_16
2179 jz .div_zero
2180
2181 ; Overflow check - unsigned division is simple to verify, but we haven't
2182 ; found a simple way to check signed division yet, unfortunately.
2183 %if %4 == 0
2184 cmp [A1], A2_16
2185 jae .div_overflow
2186 %else
2187 mov T0_16, [A1]
2188 shl T0_32, 16
2189 mov T0_16, [A0] ; T0 = dividend
2190 mov T1, A2 ; T1 = divisor
2191 test T1_16, T1_16
2192 js .divisor_negative
2193 test T0_32, T0_32
2194 jns .both_positive
2195 neg T0_32
2196.one_of_each: ; OK range for |dividend| is |divisor| * 2^(result-width - 1) + (divisor - 1).
2197 push T0 ; Start off like unsigned below.
2198 shr T0_32, 15
2199 cmp T0_16, T1_16
2200 pop T0
2201 jb .div_no_overflow
2202 ja .div_overflow
2203 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2204 cmp T0_16, T1_16
2205 jae .div_overflow
2206 jmp .div_no_overflow
2207
2208.divisor_negative:
2209 neg T1_16
2210 test T0_32, T0_32
2211 jns .one_of_each
2212 neg T0_32
2213.both_positive: ; Same as unsigned shifted by sign indicator bit.
2214 shr T0_32, 15
2215 cmp T0_16, T1_16
2216 jae .div_overflow
2217.div_no_overflow:
2218 %endif
2219
2220 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2221 %ifdef ASM_CALL64_GCC
2222 mov T1, A2
2223 mov ax, [A0]
2224 mov dx, [A1]
2225 %1 T1_16
2226 mov [A0], ax
2227 mov [A1], dx
2228 %else
2229 mov T1, A1
2230 mov ax, [A0]
2231 mov dx, [T1]
2232 %1 A2_16
2233 mov [A0], ax
2234 mov [T1], dx
2235 %endif
2236 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2237 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2238 %else
2239 IEM_SAVE_FLAGS A3, %2, %3
2240 %endif
2241 xor eax, eax
2242
2243.return:
2244 EPILOGUE_4_ARGS
2245
2246.div_zero:
2247.div_overflow:
2248 mov eax, -1
2249 jmp .return
2250ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2251
2252BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2253 PROLOGUE_4_ARGS
2254
2255 ; div by chainsaw check.
2256 test A2_32, A2_32
2257 jz .div_zero
2258
2259 ; Overflow check - unsigned division is simple to verify, but we haven't
2260 ; found a simple way to check signed division yet, unfortunately.
2261 %if %4 == 0
2262 cmp [A1], A2_32
2263 jae .div_overflow
2264 %else
2265 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2266 mov T0_32, [A0] ; T0 = dividend low
2267 mov T1_32, [A1] ; T1 = dividend high
2268 test A2_32, A2_32
2269 js .divisor_negative
2270 test T1_32, T1_32
2271 jns .both_positive
2272 call NAME(iemAImpl_negate_T0_T1_u32)
2273.one_of_each: ; OK range for |dividend| is |divisor| * 2^(result-width - 1) + (divisor - 1).
2274 push T0 ; Start off like unsigned below.
2275 shl T1_32, 1
2276 shr T0_32, 31
2277 or T1_32, T0_32
2278 cmp T1_32, A2_32
2279 pop T0
2280 jb .div_no_overflow
2281 ja .div_overflow
2282 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2283 cmp T0_32, A2_32
2284 jae .div_overflow
2285 jmp .div_no_overflow
2286
2287.divisor_negative:
2288 neg A2_32
2289 test T1_32, T1_32
2290 jns .one_of_each
2291 call NAME(iemAImpl_negate_T0_T1_u32)
2292.both_positive: ; Same as unsigned shifted by sign indicator bit.
2293 shl T1_32, 1
2294 shr T0_32, 31
2295 or T1_32, T0_32
2296 cmp T1_32, A2_32
2297 jae .div_overflow
2298.div_no_overflow:
2299 pop A2
2300 %endif
2301
2302 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2303 mov eax, [A0]
2304 %ifdef ASM_CALL64_GCC
2305 mov T1, A2
2306 mov eax, [A0]
2307 mov edx, [A1]
2308 %1 T1_32
2309 mov [A0], eax
2310 mov [A1], edx
2311 %else
2312 mov T1, A1
2313 mov eax, [A0]
2314 mov edx, [T1]
2315 %1 A2_32
2316 mov [A0], eax
2317 mov [T1], edx
2318 %endif
2319 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2320 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2321 %else
2322 IEM_SAVE_FLAGS A3, %2, %3
2323 %endif
2324 xor eax, eax
2325
2326.return:
2327 EPILOGUE_4_ARGS
2328
2329.div_overflow:
2330 %if %4 != 0
2331 pop A2
2332 %endif
2333.div_zero:
2334 mov eax, -1
2335 jmp .return
2336ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2337
2338 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2339BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2340 PROLOGUE_4_ARGS
2341
2342 test A2, A2
2343 jz .div_zero
2344 %if %4 == 0
2345 cmp [A1], A2
2346 jae .div_overflow
2347 %else
2348 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2349 mov T0, [A0] ; T0 = dividend low
2350 mov T1, [A1] ; T1 = dividend high
2351 test A2, A2
2352 js .divisor_negative
2353 test T1, T1
2354 jns .both_positive
2355 call NAME(iemAImpl_negate_T0_T1_u64)
2356.one_of_each: ; OK range for |dividend| is |divisor| * 2^(result-width - 1) + (divisor - 1).
2357 push T0 ; Start off like unsigned below.
2358 shl T1, 1
2359 shr T0, 63
2360 or T1, T0
2361 cmp T1, A2
2362 pop T0
2363 jb .div_no_overflow
2364 ja .div_overflow
2365 mov T1, 0x7fffffffffffffff
2366 and T0, T1 ; Special case for covering (divisor - 1).
2367 cmp T0, A2
2368 jae .div_overflow
2369 jmp .div_no_overflow
2370
2371.divisor_negative:
2372 neg A2
2373 test T1, T1
2374 jns .one_of_each
2375 call NAME(iemAImpl_negate_T0_T1_u64)
2376.both_positive: ; Same as unsigned shifted by sign indicator bit.
2377 shl T1, 1
2378 shr T0, 63
2379 or T1, T0
2380 cmp T1, A2
2381 jae .div_overflow
2382.div_no_overflow:
2383 pop A2
2384 %endif
2385
2386 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2387 mov rax, [A0]
2388 %ifdef ASM_CALL64_GCC
2389 mov T1, A2
2390 mov rax, [A0]
2391 mov rdx, [A1]
2392 %1 T1
2393 mov [A0], rax
2394 mov [A1], rdx
2395 %else
2396 mov T1, A1
2397 mov rax, [A0]
2398 mov rdx, [T1]
2399 %1 A2
2400 mov [A0], rax
2401 mov [T1], rdx
2402 %endif
2403 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2404 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2405 %else
2406 IEM_SAVE_FLAGS A3, %2, %3
2407 %endif
2408 xor eax, eax
2409
2410.return:
2411 EPILOGUE_4_ARGS_EX 12
2412
2413.div_overflow:
2414 %if %4 != 0
2415 pop A2
2416 %endif
2417.div_zero:
2418 mov eax, -1
2419 jmp .return
2420ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2421 %endif ; !RT_ARCH_AMD64
2422
2423%endmacro
2424
2425IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
2426IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
2427IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
2428IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
2429IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
2430IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2431
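; A note on the signed (idiv) overflow checks in IEMIMPL_DIV_OP, since they are
; rather dense: #DE must be raised when the quotient does not fit the result
; width.  Working with magnitudes, the quotient fits iff
;
;       |dividend| <  |divisor| * 2^(width-1)                    same-sign operands
;       |dividend| <= |divisor| * 2^(width-1) + |divisor| - 1    mixed-sign operands
;
; (the mixed-sign case gets one extra unit because the quotient may reach
; -2^(width-1)).  The code avoids a multiplication by comparing
; |dividend| >> (width-1) against |divisor| and, in the mixed-sign case where
; the two are equal, comparing the remaining low bits against |divisor| too.
; Example for the 8-bit case with |divisor| = 2: dividend magnitudes 0..255 are
; accepted for same-sign operands (quotient at most 127) and 0..257 for
; mixed-sign operands (quotient down to -128).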
2432
2433;;
2434; Macro for implementing a memory fence operation.
2435;
2436; No return value, no operands or anything.
2437;
2438; @param 1 The instruction.
2439;
2440%macro IEMIMPL_MEM_FENCE 1
2441BEGINCODE
2442BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2443 %1
2444 ret
2445ENDPROC iemAImpl_ %+ %1
2446%endmacro
2447
2448IEMIMPL_MEM_FENCE lfence
2449IEMIMPL_MEM_FENCE sfence
2450IEMIMPL_MEM_FENCE mfence
2451
2452;;
2453; Alternative for non-SSE2 hosts.
2454;
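; An XCHG with a memory operand carries an implicit LOCK prefix, so the
; push/xchg/add sequence below acts as a full serializing barrier on CPUs that
; predate MFENCE/SFENCE/LFENCE, while leaving xAX and the stack pointer
; unchanged on return.
;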
2455BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2456 push xAX
2457 xchg xAX, [xSP]
2458 add xSP, xCB
2459 ret
2460ENDPROC iemAImpl_alt_mem_fence
2461
2462
2463;;
2464; Initialize the FPU for the actual instruction being emulated; this means
2465; loading parts of the guest's control word and status word.
2466;
2467; @uses 24 bytes of stack. T0, T1
2468; @param 1 Expression giving the address of the FXSTATE of the guest.
2469;
2470%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2471 fnstenv [xSP]
2472
2473 ; FCW - for exception, precision and rounding control.
2474 movzx T0, word [%1 + X86FXSTATE.FCW]
2475 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2476 mov [xSP + X86FSTENV32P.FCW], T0_16
2477
2478 ; FSW - for undefined C0, C1, C2, and C3.
2479 movzx T1, word [%1 + X86FXSTATE.FSW]
2480 and T1, X86_FSW_C_MASK
2481 movzx T0, word [xSP + X86FSTENV32P.FSW]
2482 and T0, X86_FSW_TOP_MASK
2483 or T0, T1
2484 mov [xSP + X86FSTENV32P.FSW], T0_16
2485
2486 fldenv [xSP]
2487%endmacro
2488
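; Usage note (a sketch - the X86FSTENV32P structure and the X86_FCW_*/X86_FSW_*
; masks come from the includes at the top of the file): the macro above merges
; guest and host state in the environment image that FNSTENV leaves at [xSP]:
;
;       FCW <- guest FCW, masked to the exception mask, precision and rounding bits
;       FSW <- (host FSW & TOP field) | (guest FSW & C0..C3)
;
; so the emulated instruction runs with the guest's control settings while the
; host's current stack TOP is preserved.  The pattern repeated by the helpers
; below is:
;
;               fninit                                  ; known-clean host FPU state
;               FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; apply guest FCW/FSW bits
;               <the emulated instruction>
;               fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture the resulting FSW
;                                                       ; (plain [A1] for FSW-only helpers)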
2489
2490;;
2491; Initialize the FPU for the actual instruction being emulated; this means
2492; loading parts of the guest's control word and status word, and updating the
2493; tag word for the top register if it's empty.
2494;
2495; ASSUMES actual TOP=7
2496;
2497; @uses 24 bytes of stack. T0, T1
2498; @param 1 Expression giving the address of the FXSTATE of the guest.
2499;
2500%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2501 fnstenv [xSP]
2502
2503 ; FCW - for exception, precision and rounding control.
2504 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2505 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2506 mov [xSP + X86FSTENV32P.FCW], T0_16
2507
2508 ; FSW - for undefined C0, C1, C2, and C3.
2509 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2510 and T1_32, X86_FSW_C_MASK
2511 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2512 and T0_32, X86_FSW_TOP_MASK
2513 or T0_32, T1_32
2514 mov [xSP + X86FSTENV32P.FSW], T0_16
2515
2516 ; FTW - Only for ST0 (in/out).
2517 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2518 shr T1_32, X86_FSW_TOP_SHIFT
2519 and T1_32, X86_FSW_TOP_SMASK
2520 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2521 jc %%st0_not_empty
2522 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2523%%st0_not_empty:
2524
2525 fldenv [xSP]
2526%endmacro
2527
2528
2529;;
2530; Need to move this somewhere better as well?
2531;
2532struc IEMFPURESULT
2533 .r80Result resw 5
2534 .FSW resw 1
2535endstruc
2536
2537
2538;;
2539; Need to move this somewhere better as well?
2540;
2541struc IEMFPURESULTTWO
2542 .r80Result1 resw 5
2543 .FSW resw 1
2544 .r80Result2 resw 5
2545endstruc
2546
2547
2548;
2549;---------------------- 16-bit signed integer operations ----------------------
2550;
2551
2552
2553;;
2554; Converts a 16-bit signed integer value to an 80-bit floating point value (fpu register).
2555;
2556; @param A0 FPU context (fxsave).
2557; @param A1 Pointer to a IEMFPURESULT for the output.
2558; @param A2 Pointer to the 16-bit signed integer value to convert.
2559;
2560BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2561 PROLOGUE_3_ARGS
2562 sub xSP, 20h
2563
2564 fninit
2565 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2566 fild word [A2]
2567
2568 fnstsw word [A1 + IEMFPURESULT.FSW]
2569 fnclex
2570 fstp tword [A1 + IEMFPURESULT.r80Result]
2571
2572 fninit
2573 add xSP, 20h
2574 EPILOGUE_3_ARGS
2575ENDPROC iemAImpl_fild_r80_from_i16
2576
2577
2578;;
2579; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
2580;
2581; @param A0 FPU context (fxsave).
2582; @param A1 Where to return the output FSW.
2583; @param A2 Where to store the 16-bit signed integer value.
2584; @param A3 Pointer to the 80-bit value.
2585;
2586BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2587 PROLOGUE_4_ARGS
2588 sub xSP, 20h
2589
2590 fninit
2591 fld tword [A3]
2592 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2593 fistp word [A2]
2594
2595 fnstsw word [A1]
2596
2597 fninit
2598 add xSP, 20h
2599 EPILOGUE_4_ARGS
2600ENDPROC iemAImpl_fist_r80_to_i16
2601
2602
2603;;
2604; Store an 80-bit floating point value (register) as a 16-bit signed integer
2605; (memory) with truncation.
2606;
2607; @param A0 FPU context (fxsave).
2608; @param A1 Where to return the output FSW.
2609; @param A2 Where to store the 16-bit signed integer value.
2610; @param A3 Pointer to the 80-bit value.
2611;
2612BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2613 PROLOGUE_4_ARGS
2614 sub xSP, 20h
2615
2616 fninit
2617 fld tword [A3]
2618 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2619 fisttp word [A2]
2620
2621 fnstsw word [A1]
2622
2623 fninit
2624 add xSP, 20h
2625 EPILOGUE_4_ARGS
2626ENDPROC iemAImpl_fistt_r80_to_i16
2627
2628
2629;;
2630; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2631;
2632; @param 1 The instruction
2633;
2634; @param A0 FPU context (fxsave).
2635; @param A1 Pointer to a IEMFPURESULT for the output.
2636; @param A2 Pointer to the 80-bit value.
2637; @param A3 Pointer to the 16-bit value.
2638;
2639%macro IEMIMPL_FPU_R80_BY_I16 1
2640BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2641 PROLOGUE_4_ARGS
2642 sub xSP, 20h
2643
2644 fninit
2645 fld tword [A2]
2646 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2647 %1 word [A3]
2648
2649 fnstsw word [A1 + IEMFPURESULT.FSW]
2650 fnclex
2651 fstp tword [A1 + IEMFPURESULT.r80Result]
2652
2653 fninit
2654 add xSP, 20h
2655 EPILOGUE_4_ARGS
2656ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2657%endmacro
2658
2659IEMIMPL_FPU_R80_BY_I16 fiadd
2660IEMIMPL_FPU_R80_BY_I16 fimul
2661IEMIMPL_FPU_R80_BY_I16 fisub
2662IEMIMPL_FPU_R80_BY_I16 fisubr
2663IEMIMPL_FPU_R80_BY_I16 fidiv
2664IEMIMPL_FPU_R80_BY_I16 fidivr
2665
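; Illustration (not assembled): IEMIMPL_FPU_R80_BY_I16 fiadd generates roughly
;
;       BEGINPROC_FASTCALL iemAImpl_fiadd_r80_by_i16, 16
;               PROLOGUE_4_ARGS
;               sub     xSP, 20h
;               fninit                                  ; clean host FPU state
;               fld     tword [A2]                      ; ST0 = the 80-bit operand
;               FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0      ; apply guest FCW/FSW bits
;               fiadd   word [A3]                       ; ST0 += 16-bit integer operand
;               fnstsw  word [A1 + IEMFPURESULT.FSW]    ; capture FSW
;               fnclex                                  ; don't trip on pending exceptions
;               fstp    tword [A1 + IEMFPURESULT.r80Result]
;               fninit
;               add     xSP, 20h
;               EPILOGUE_4_ARGS
;       ENDPROC iemAImpl_fiadd_r80_by_i16
;
; The same load / execute / store-FSW / store-result pattern is used by all of
; the FPU helpers that follow.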
2666
2667;;
2668; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2669; only returning FSW.
2670;
2671; @param 1 The instruction
2672;
2673; @param A0 FPU context (fxsave).
2674; @param A1 Where to store the output FSW.
2675; @param A2 Pointer to the 80-bit value.
2676; @param A3 Pointer to the 16-bit value.
2677;
2678%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2679BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2680 PROLOGUE_4_ARGS
2681 sub xSP, 20h
2682
2683 fninit
2684 fld tword [A2]
2685 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2686 %1 word [A3]
2687
2688 fnstsw word [A1]
2689
2690 fninit
2691 add xSP, 20h
2692 EPILOGUE_4_ARGS
2693ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2694%endmacro
2695
2696IEMIMPL_FPU_R80_BY_I16_FSW ficom
2697
2698
2699
2700;
2701;---------------------- 32-bit signed integer operations ----------------------
2702;
2703
2704
2705;;
2706; Converts a 32-bit signed integer value to an 80-bit floating point value (fpu register).
2707;
2708; @param A0 FPU context (fxsave).
2709; @param A1 Pointer to a IEMFPURESULT for the output.
2710; @param A2 Pointer to the 32-bit signed integer value to convert.
2711;
2712BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2713 PROLOGUE_3_ARGS
2714 sub xSP, 20h
2715
2716 fninit
2717 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2718 fild dword [A2]
2719
2720 fnstsw word [A1 + IEMFPURESULT.FSW]
2721 fnclex
2722 fstp tword [A1 + IEMFPURESULT.r80Result]
2723
2724 fninit
2725 add xSP, 20h
2726 EPILOGUE_3_ARGS
2727ENDPROC iemAImpl_fild_r80_from_i32
2728
2729
2730;;
2731; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
2732;
2733; @param A0 FPU context (fxsave).
2734; @param A1 Where to return the output FSW.
2735; @param A2 Where to store the 32-bit signed integer value.
2736; @param A3 Pointer to the 80-bit value.
2737;
2738BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2739 PROLOGUE_4_ARGS
2740 sub xSP, 20h
2741
2742 fninit
2743 fld tword [A3]
2744 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2745 fistp dword [A2]
2746
2747 fnstsw word [A1]
2748
2749 fninit
2750 add xSP, 20h
2751 EPILOGUE_4_ARGS
2752ENDPROC iemAImpl_fist_r80_to_i32
2753
2754
2755;;
2756; Store an 80-bit floating point value (register) as a 32-bit signed integer
2757; (memory) with truncation.
2758;
2759; @param A0 FPU context (fxsave).
2760; @param A1 Where to return the output FSW.
2761; @param A2 Where to store the 32-bit signed integer value.
2762; @param A3 Pointer to the 80-bit value.
2763;
2764BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2765 PROLOGUE_4_ARGS
2766 sub xSP, 20h
2767
2768 fninit
2769 fld tword [A3]
2770 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2771 fisttp dword [A2]
2772
2773 fnstsw word [A1]
2774
2775 fninit
2776 add xSP, 20h
2777 EPILOGUE_4_ARGS
2778ENDPROC iemAImpl_fistt_r80_to_i32
2779
2780
2781;;
2782; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2783;
2784; @param 1 The instruction
2785;
2786; @param A0 FPU context (fxsave).
2787; @param A1 Pointer to a IEMFPURESULT for the output.
2788; @param A2 Pointer to the 80-bit value.
2789; @param A3 Pointer to the 32-bit value.
2790;
2791%macro IEMIMPL_FPU_R80_BY_I32 1
2792BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2793 PROLOGUE_4_ARGS
2794 sub xSP, 20h
2795
2796 fninit
2797 fld tword [A2]
2798 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2799 %1 dword [A3]
2800
2801 fnstsw word [A1 + IEMFPURESULT.FSW]
2802 fnclex
2803 fstp tword [A1 + IEMFPURESULT.r80Result]
2804
2805 fninit
2806 add xSP, 20h
2807 EPILOGUE_4_ARGS
2808ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2809%endmacro
2810
2811IEMIMPL_FPU_R80_BY_I32 fiadd
2812IEMIMPL_FPU_R80_BY_I32 fimul
2813IEMIMPL_FPU_R80_BY_I32 fisub
2814IEMIMPL_FPU_R80_BY_I32 fisubr
2815IEMIMPL_FPU_R80_BY_I32 fidiv
2816IEMIMPL_FPU_R80_BY_I32 fidivr
2817
2818
2819;;
2820; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2821; only returning FSW.
2822;
2823; @param 1 The instruction
2824;
2825; @param A0 FPU context (fxsave).
2826; @param A1 Where to store the output FSW.
2827; @param A2 Pointer to the 80-bit value.
2828; @param A3 Pointer to the 32-bit value.
2829;
2830%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2831BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2832 PROLOGUE_4_ARGS
2833 sub xSP, 20h
2834
2835 fninit
2836 fld tword [A2]
2837 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2838 %1 dword [A3]
2839
2840 fnstsw word [A1]
2841
2842 fninit
2843 add xSP, 20h
2844 EPILOGUE_4_ARGS
2845ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2846%endmacro
2847
2848IEMIMPL_FPU_R80_BY_I32_FSW ficom
2849
2850
2851
2852;
2853;---------------------- 64-bit signed integer operations ----------------------
2854;
2855
2856
2857;;
2858; Converts a 64-bit signed integer value to an 80-bit floating point value (fpu register).
2859;
2860; @param A0 FPU context (fxsave).
2861; @param A1 Pointer to a IEMFPURESULT for the output.
2862; @param A2 Pointer to the 64-bit signed integer value to convert.
2863;
2864BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2865 PROLOGUE_3_ARGS
2866 sub xSP, 20h
2867
2868 fninit
2869 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2870 fild qword [A2]
2871
2872 fnstsw word [A1 + IEMFPURESULT.FSW]
2873 fnclex
2874 fstp tword [A1 + IEMFPURESULT.r80Result]
2875
2876 fninit
2877 add xSP, 20h
2878 EPILOGUE_3_ARGS
2879ENDPROC iemAImpl_fild_r80_from_i64
2880
2881
2882;;
2883; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
2884;
2885; @param A0 FPU context (fxsave).
2886; @param A1 Where to return the output FSW.
2887; @param A2 Where to store the 64-bit signed integer value.
2888; @param A3 Pointer to the 80-bit value.
2889;
2890BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2891 PROLOGUE_4_ARGS
2892 sub xSP, 20h
2893
2894 fninit
2895 fld tword [A3]
2896 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2897 fistp qword [A2]
2898
2899 fnstsw word [A1]
2900
2901 fninit
2902 add xSP, 20h
2903 EPILOGUE_4_ARGS
2904ENDPROC iemAImpl_fist_r80_to_i64
2905
2906
2907;;
2908; Store an 80-bit floating point value (register) as a 64-bit signed integer
2909; (memory) with truncation.
2910;
2911; @param A0 FPU context (fxsave).
2912; @param A1 Where to return the output FSW.
2913; @param A2 Where to store the 64-bit signed integer value.
2914; @param A3 Pointer to the 80-bit value.
2915;
2916BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2917 PROLOGUE_4_ARGS
2918 sub xSP, 20h
2919
2920 fninit
2921 fld tword [A3]
2922 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2923 fisttp qword [A2]
2924
2925 fnstsw word [A1]
2926
2927 fninit
2928 add xSP, 20h
2929 EPILOGUE_4_ARGS
2930ENDPROC iemAImpl_fistt_r80_to_i64
2931
2932
2933
2934;
2935;---------------------- 32-bit floating point operations ----------------------
2936;
2937
2938;;
2939; Converts a 32-bit floating point value to an 80-bit one (fpu register).
2940;
2941; @param A0 FPU context (fxsave).
2942; @param A1 Pointer to a IEMFPURESULT for the output.
2943; @param A2 Pointer to the 32-bit floating point value to convert.
2944;
2945BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
2946 PROLOGUE_3_ARGS
2947 sub xSP, 20h
2948
2949 fninit
2950 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2951 fld dword [A2]
2952
2953 fnstsw word [A1 + IEMFPURESULT.FSW]
2954 fnclex
2955 fstp tword [A1 + IEMFPURESULT.r80Result]
2956
2957 fninit
2958 add xSP, 20h
2959 EPILOGUE_3_ARGS
2960ENDPROC iemAImpl_fld_r80_from_r32
2961
2962
2963;;
2964; Store an 80-bit floating point value (register) as a 32-bit one (memory).
2965;
2966; @param A0 FPU context (fxsave).
2967; @param A1 Where to return the output FSW.
2968; @param A2 Where to store the 32-bit value.
2969; @param A3 Pointer to the 80-bit value.
2970;
2971BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2972 PROLOGUE_4_ARGS
2973 sub xSP, 20h
2974
2975 fninit
2976 fld tword [A3]
2977 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2978 fst dword [A2]
2979
2980 fnstsw word [A1]
2981
2982 fninit
2983 add xSP, 20h
2984 EPILOGUE_4_ARGS
2985ENDPROC iemAImpl_fst_r80_to_r32
2986
2987
2988;;
2989; FPU instruction working on one 80-bit and one 32-bit floating point value.
2990;
2991; @param 1 The instruction
2992;
2993; @param A0 FPU context (fxsave).
2994; @param A1 Pointer to a IEMFPURESULT for the output.
2995; @param A2 Pointer to the 80-bit value.
2996; @param A3 Pointer to the 32-bit value.
2997;
2998%macro IEMIMPL_FPU_R80_BY_R32 1
2999BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3000 PROLOGUE_4_ARGS
3001 sub xSP, 20h
3002
3003 fninit
3004 fld tword [A2]
3005 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3006 %1 dword [A3]
3007
3008 fnstsw word [A1 + IEMFPURESULT.FSW]
3009 fnclex
3010 fstp tword [A1 + IEMFPURESULT.r80Result]
3011
3012 fninit
3013 add xSP, 20h
3014 EPILOGUE_4_ARGS
3015ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3016%endmacro
3017
3018IEMIMPL_FPU_R80_BY_R32 fadd
3019IEMIMPL_FPU_R80_BY_R32 fmul
3020IEMIMPL_FPU_R80_BY_R32 fsub
3021IEMIMPL_FPU_R80_BY_R32 fsubr
3022IEMIMPL_FPU_R80_BY_R32 fdiv
3023IEMIMPL_FPU_R80_BY_R32 fdivr
3024
3025
3026;;
3027; FPU instruction working on one 80-bit and one 32-bit floating point value,
3028; only returning FSW.
3029;
3030; @param 1 The instruction
3031;
3032; @param A0 FPU context (fxsave).
3033; @param A1 Where to store the output FSW.
3034; @param A2 Pointer to the 80-bit value.
3035; @param A3 Pointer to the 32-bit value.
3036;
3037%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3038BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3039 PROLOGUE_4_ARGS
3040 sub xSP, 20h
3041
3042 fninit
3043 fld tword [A2]
3044 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3045 %1 dword [A3]
3046
3047 fnstsw word [A1]
3048
3049 fninit
3050 add xSP, 20h
3051 EPILOGUE_4_ARGS
3052ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3053%endmacro
3054
3055IEMIMPL_FPU_R80_BY_R32_FSW fcom
3056
3057
3058
3059;
3060;---------------------- 64-bit floating point operations ----------------------
3061;
3062
3063;;
3064; Converts a 64-bit floating point value to an 80-bit one (fpu register).
3065;
3066; @param A0 FPU context (fxsave).
3067; @param A1 Pointer to a IEMFPURESULT for the output.
3068; @param A2 Pointer to the 64-bit floating point value to convert.
3069;
3070BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3071 PROLOGUE_3_ARGS
3072 sub xSP, 20h
3073
3074 fninit
3075 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3076 fld qword [A2]
3077
3078 fnstsw word [A1 + IEMFPURESULT.FSW]
3079 fnclex
3080 fstp tword [A1 + IEMFPURESULT.r80Result]
3081
3082 fninit
3083 add xSP, 20h
3084 EPILOGUE_3_ARGS
3085ENDPROC iemAImpl_fld_r80_from_r64
3086
3087
3088;;
3089; Store an 80-bit floating point value (register) as a 64-bit one (memory).
3090;
3091; @param A0 FPU context (fxsave).
3092; @param A1 Where to return the output FSW.
3093; @param A2 Where to store the 64-bit value.
3094; @param A3 Pointer to the 80-bit value.
3095;
3096BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3097 PROLOGUE_4_ARGS
3098 sub xSP, 20h
3099
3100 fninit
3101 fld tword [A3]
3102 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3103 fst qword [A2]
3104
3105 fnstsw word [A1]
3106
3107 fninit
3108 add xSP, 20h
3109 EPILOGUE_4_ARGS
3110ENDPROC iemAImpl_fst_r80_to_r64
3111
3112
3113;;
3114; FPU instruction working on one 80-bit and one 64-bit floating point value.
3115;
3116; @param 1 The instruction
3117;
3118; @param A0 FPU context (fxsave).
3119; @param A1 Pointer to a IEMFPURESULT for the output.
3120; @param A2 Pointer to the 80-bit value.
3121; @param A3 Pointer to the 64-bit value.
3122;
3123%macro IEMIMPL_FPU_R80_BY_R64 1
3124BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3125 PROLOGUE_4_ARGS
3126 sub xSP, 20h
3127
3128 fninit
3129 fld tword [A2]
3130 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3131 %1 qword [A3]
3132
3133 fnstsw word [A1 + IEMFPURESULT.FSW]
3134 fnclex
3135 fstp tword [A1 + IEMFPURESULT.r80Result]
3136
3137 fninit
3138 add xSP, 20h
3139 EPILOGUE_4_ARGS
3140ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3141%endmacro
3142
3143IEMIMPL_FPU_R80_BY_R64 fadd
3144IEMIMPL_FPU_R80_BY_R64 fmul
3145IEMIMPL_FPU_R80_BY_R64 fsub
3146IEMIMPL_FPU_R80_BY_R64 fsubr
3147IEMIMPL_FPU_R80_BY_R64 fdiv
3148IEMIMPL_FPU_R80_BY_R64 fdivr
3149
3150;;
3151; FPU instruction working on one 80-bit and one 64-bit floating point value,
3152; only returning FSW.
3153;
3154; @param 1 The instruction
3155;
3156; @param A0 FPU context (fxsave).
3157; @param A1 Where to store the output FSW.
3158; @param A2 Pointer to the 80-bit value.
3159; @param A3 Pointer to the 64-bit value.
3160;
3161%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3162BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3163 PROLOGUE_4_ARGS
3164 sub xSP, 20h
3165
3166 fninit
3167 fld tword [A2]
3168 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3169 %1 qword [A3]
3170
3171 fnstsw word [A1]
3172
3173 fninit
3174 add xSP, 20h
3175 EPILOGUE_4_ARGS
3176ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3177%endmacro
3178
3179IEMIMPL_FPU_R80_BY_R64_FSW fcom
3180
3181
3182
3183;
3184;---------------------- 80-bit floating point operations ----------------------
3185;
3186
3187;;
3188; Loads an 80-bit floating point register value from memory.
3189;
3190; @param A0 FPU context (fxsave).
3191; @param A1 Pointer to a IEMFPURESULT for the output.
3192; @param A2 Pointer to the 80-bit floating point value to load.
3193;
3194BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3195 PROLOGUE_3_ARGS
3196 sub xSP, 20h
3197
3198 fninit
3199 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3200 fld tword [A2]
3201
3202 fnstsw word [A1 + IEMFPURESULT.FSW]
3203 fnclex
3204 fstp tword [A1 + IEMFPURESULT.r80Result]
3205
3206 fninit
3207 add xSP, 20h
3208 EPILOGUE_3_ARGS
3209ENDPROC iemAImpl_fld_r80_from_r80
3210
3211
3212;;
3213; Store an 80-bit floating point register to memory
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 80-bit value.
3218; @param A3 Pointer to the 80-bit register value.
3219;
3220BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3221 PROLOGUE_4_ARGS
3222 sub xSP, 20h
3223
3224 fninit
3225 fld tword [A3]
3226 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3227 fstp tword [A2]
3228
3229 fnstsw word [A1]
3230
3231 fninit
3232 add xSP, 20h
3233 EPILOGUE_4_ARGS
3234ENDPROC iemAImpl_fst_r80_to_r80
3235
3236
3237;;
3238; Loads an 80-bit packed BCD value from memory, converting it into a floating point register value.
3239;
3240; @param A0 FPU context (fxsave).
3241; @param A1 Pointer to a IEMFPURESULT for the output.
3242; @param A2 Pointer to the 80-bit BCD value to load.
3243;
3244BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3245 PROLOGUE_3_ARGS
3246 sub xSP, 20h
3247
3248 fninit
3249 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3250 fbld tword [A2]
3251
3252 fnstsw word [A1 + IEMFPURESULT.FSW]
3253 fnclex
3254 fstp tword [A1 + IEMFPURESULT.r80Result]
3255
3256 fninit
3257 add xSP, 20h
3258 EPILOGUE_3_ARGS
3259ENDPROC iemAImpl_fld_r80_from_d80
3260
3261
3262;;
3263; Store an 80-bit floating point register to memory as BCD
3264;
3265; @param A0 FPU context (fxsave).
3266; @param A1 Where to return the output FSW.
3267; @param A2 Where to store the 80-bit BCD value.
3268; @param A3 Pointer to the 80-bit register value.
3269;
3270BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3271 PROLOGUE_4_ARGS
3272 sub xSP, 20h
3273
3274 fninit
3275 fld tword [A3]
3276 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3277 fbstp tword [A2]
3278
3279 fnstsw word [A1]
3280
3281 fninit
3282 add xSP, 20h
3283 EPILOGUE_4_ARGS
3284ENDPROC iemAImpl_fst_r80_to_d80
3285
3286
3287;;
3288; FPU instruction working on two 80-bit floating point values.
3289;
3290; @param 1 The instruction
; @param 2 The operand list ({st0, st1} for the explicit forms, {} for instructions that implicitly use ST0/ST1).
3291;
3292; @param A0 FPU context (fxsave).
3293; @param A1 Pointer to a IEMFPURESULT for the output.
3294; @param A2 Pointer to the first 80-bit value (ST0)
3295; @param A3 Pointer to the second 80-bit value (STn).
3296;
3297%macro IEMIMPL_FPU_R80_BY_R80 2
3298BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3299 PROLOGUE_4_ARGS
3300 sub xSP, 20h
3301
3302 fninit
3303 fld tword [A3]
3304 fld tword [A2]
3305 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3306 %1 %2
3307
3308 fnstsw word [A1 + IEMFPURESULT.FSW]
3309 fnclex
3310 fstp tword [A1 + IEMFPURESULT.r80Result]
3311
3312 fninit
3313 add xSP, 20h
3314 EPILOGUE_4_ARGS
3315ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3316%endmacro
3317
3318IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3319IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3320IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3321IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3322IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3323IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3324IEMIMPL_FPU_R80_BY_R80 fprem, {}
3325IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3326IEMIMPL_FPU_R80_BY_R80 fscale, {}
3327
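; Note: the second macro argument is the operand list, so fadd..fdivr above are
; emitted as explicit two-register forms working on ST0 (loaded from A2) and
; ST1 (loaded from A3), roughly:
;
;               fld     tword [A3]              ; becomes ST1 after the next load
;               fld     tword [A2]              ; ST0
;               fadd    st0, st1                ; ST0 += ST1
;
; while fprem, fprem1 and fscale pass an empty list and are emitted bare, since
; those instructions implicitly operate on ST0 and ST1 (e.g. fscale computes
; ST0 = ST0 * 2^trunc(ST1)).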
3328
3329;;
3330; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3331; storing the result in ST1 and popping the stack.
3332;
3333; @param 1 The instruction
3334;
3335; @param A0 FPU context (fxsave).
3336; @param A1 Pointer to a IEMFPURESULT for the output.
3337; @param A2 Pointer to the first 80-bit value (ST1).
3338; @param A3 Pointer to the second 80-bit value (ST0).
3339;
3340%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3341BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3342 PROLOGUE_4_ARGS
3343 sub xSP, 20h
3344
3345 fninit
3346 fld tword [A2]
3347 fld tword [A3]
3348 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3349 %1
3350
3351 fnstsw word [A1 + IEMFPURESULT.FSW]
3352 fnclex
3353 fstp tword [A1 + IEMFPURESULT.r80Result]
3354
3355 fninit
3356 add xSP, 20h
3357 EPILOGUE_4_ARGS
3358ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3359%endmacro
3360
3361IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3362IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3363IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3364
3365
3366;;
3367; FPU instruction working on two 80-bit floating point values, only
3368; returning FSW.
3369;
3370; @param 1 The instruction
3371;
3372; @param A0 FPU context (fxsave).
3373; @param A1 Pointer to a uint16_t for the resulting FSW.
3374; @param A2 Pointer to the first 80-bit value.
3375; @param A3 Pointer to the second 80-bit value.
3376;
3377%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3379 PROLOGUE_4_ARGS
3380 sub xSP, 20h
3381
3382 fninit
3383 fld tword [A3]
3384 fld tword [A2]
3385 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3386 %1 st0, st1
3387
3388 fnstsw word [A1]
3389
3390 fninit
3391 add xSP, 20h
3392 EPILOGUE_4_ARGS
3393ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3394%endmacro
3395
3396IEMIMPL_FPU_R80_BY_R80_FSW fcom
3397IEMIMPL_FPU_R80_BY_R80_FSW fucom
3398
3399
3400;;
3401; FPU instruction working on two 80-bit floating point values,
3402; returning FSW and EFLAGS (eax).
3403;
3404; @param 1 The instruction
3405;
3406; @returns EFLAGS in EAX.
3407; @param A0 FPU context (fxsave).
3408; @param A1 Pointer to a uint16_t for the resulting FSW.
3409; @param A2 Pointer to the first 80-bit value.
3410; @param A3 Pointer to the second 80-bit value.
3411;
3412%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3413BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3414 PROLOGUE_4_ARGS
3415 sub xSP, 20h
3416
3417 fninit
3418 fld tword [A3]
3419 fld tword [A2]
3420 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3421 %1 st1
3422
3423 fnstsw word [A1]
3424 pushf
3425 pop xAX
3426
3427 fninit
3428 add xSP, 20h
3429 EPILOGUE_4_ARGS
3430ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3431%endmacro
3432
3433IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3434IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3435
3436
3437;;
3438; FPU instruction working on one 80-bit floating point value.
3439;
3440; @param 1 The instruction
3441;
3442; @param A0 FPU context (fxsave).
3443; @param A1 Pointer to a IEMFPURESULT for the output.
3444; @param A2 Pointer to the 80-bit value.
3445;
3446%macro IEMIMPL_FPU_R80 1
3447BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3448 PROLOGUE_3_ARGS
3449 sub xSP, 20h
3450
3451 fninit
3452 fld tword [A2]
3453 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3454 %1
3455
3456 fnstsw word [A1 + IEMFPURESULT.FSW]
3457 fnclex
3458 fstp tword [A1 + IEMFPURESULT.r80Result]
3459
3460 fninit
3461 add xSP, 20h
3462 EPILOGUE_3_ARGS
3463ENDPROC iemAImpl_ %+ %1 %+ _r80
3464%endmacro
3465
3466IEMIMPL_FPU_R80 fchs
3467IEMIMPL_FPU_R80 fabs
3468IEMIMPL_FPU_R80 f2xm1
3469IEMIMPL_FPU_R80 fsqrt
3470IEMIMPL_FPU_R80 frndint
3471IEMIMPL_FPU_R80 fsin
3472IEMIMPL_FPU_R80 fcos
3473
3474
3475;;
3476; FPU instruction working on one 80-bit floating point value, only
3477; returning FSW.
3478;
3479; @param 1 The instruction
3480; @param 2 Non-zero to also restore FTW.
3481;
3482; @param A0 FPU context (fxsave).
3483; @param A1 Pointer to a uint16_t for the resulting FSW.
3484; @param A2 Pointer to the 80-bit value.
3485;
3486%macro IEMIMPL_FPU_R80_FSW 2
3487BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3488 PROLOGUE_3_ARGS
3489 sub xSP, 20h
3490
3491 fninit
3492 fld tword [A2]
3493%if %2 != 0
3494 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
3495%else
3496 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3497%endif
3498 %1
3499
3500 fnstsw word [A1]
3501
3502 fninit
3503 add xSP, 20h
3504 EPILOGUE_3_ARGS
3505ENDPROC iemAImpl_ %+ %1 %+ _r80
3506%endmacro
3507
3508IEMIMPL_FPU_R80_FSW ftst, 0
3509IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3510
3511
3512
3513;;
3514; FPU instruction loading an 80-bit floating point constant.
3515;
3516; @param 1 The instruction
3517;
3518; @param A0 FPU context (fxsave).
3519; @param A1 Pointer to a IEMFPURESULT for the output.
3520;
3521%macro IEMIMPL_FPU_R80_CONST 1
3522BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3523 PROLOGUE_2_ARGS
3524 sub xSP, 20h
3525
3526 fninit
3527 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3528 %1
3529
3530 fnstsw word [A1 + IEMFPURESULT.FSW]
3531 fnclex
3532 fstp tword [A1 + IEMFPURESULT.r80Result]
3533
3534 fninit
3535 add xSP, 20h
3536 EPILOGUE_2_ARGS
3537ENDPROC iemAImpl_ %+ %1 %+
3538%endmacro
3539
3540IEMIMPL_FPU_R80_CONST fld1
3541IEMIMPL_FPU_R80_CONST fldl2t
3542IEMIMPL_FPU_R80_CONST fldl2e
3543IEMIMPL_FPU_R80_CONST fldpi
3544IEMIMPL_FPU_R80_CONST fldlg2
3545IEMIMPL_FPU_R80_CONST fldln2
3546IEMIMPL_FPU_R80_CONST fldz
3547
3548
3549;;
3550; FPU instruction working on one 80-bit floating point value, outputting two.
3551;
3552; @param 1 The instruction
3553;
3554; @param A0 FPU context (fxsave).
3555; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3556; @param A2 Pointer to the 80-bit value.
3557;
3558%macro IEMIMPL_FPU_R80_R80 1
3559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3560 PROLOGUE_3_ARGS
3561 sub xSP, 20h
3562
3563 fninit
3564 fld tword [A2]
3565 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3566 %1
3567
3568 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
3569 fnclex
3570 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
3571 fnclex
3572 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
3573
3574 fninit
3575 add xSP, 20h
3576 EPILOGUE_3_ARGS
3577ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3578%endmacro
3579
3580IEMIMPL_FPU_R80_R80 fptan
3581IEMIMPL_FPU_R80_R80 fxtract
3582IEMIMPL_FPU_R80_R80 fsincos
3583
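; Note: fptan, fxtract and fsincos push a second value onto the FPU stack
; (1.0, the significand, and the cosine respectively), which is why the macro
; stores the new top of stack as r80Result2 and the value beneath it as
; r80Result1 in the IEMFPURESULTTWO structure.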
3584
3585
3586
3587;---------------------- SSE and MMX Operations ----------------------
3588
3589;; @todo what do we need to do for MMX?
3590%macro IEMIMPL_MMX_PROLOGUE 0
3591%endmacro
3592%macro IEMIMPL_MMX_EPILOGUE 0
3593%endmacro
3594
3595;; @todo what do we need to do for SSE?
3596%macro IEMIMPL_SSE_PROLOGUE 0
3597%endmacro
3598%macro IEMIMPL_SSE_EPILOGUE 0
3599%endmacro
3600
3601;; @todo what do we need to do for AVX?
3602%macro IEMIMPL_AVX_PROLOGUE 0
3603%endmacro
3604%macro IEMIMPL_AVX_EPILOGUE 0
3605%endmacro
3606
3607
3608;;
3609; Media instruction working on two full sized registers.
3610;
3611; @param 1 The instruction
3612; @param 2 Whether there is an MMX variant (1) or not (0).
3613;
3614; @param A0 FPU context (fxsave).
3615; @param A1 Pointer to the first media register size operand (input/output).
3616; @param A2 Pointer to the second media register size operand (input).
3617;
3618%macro IEMIMPL_MEDIA_F2 2
3619%if %2 != 0
3620BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3621 PROLOGUE_3_ARGS
3622 IEMIMPL_MMX_PROLOGUE
3623
3624 movq mm0, [A1]
3625 movq mm1, [A2]
3626 %1 mm0, mm1
3627 movq [A1], mm0
3628
3629 IEMIMPL_MMX_EPILOGUE
3630 EPILOGUE_3_ARGS
3631ENDPROC iemAImpl_ %+ %1 %+ _u64
3632%endif
3633
3634BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3635 PROLOGUE_3_ARGS
3636 IEMIMPL_SSE_PROLOGUE
3637
3638 movdqu xmm0, [A1]
3639 movdqu xmm1, [A2]
3640 %1 xmm0, xmm1
3641 movdqu [A1], xmm0
3642
3643 IEMIMPL_SSE_EPILOGUE
3644 EPILOGUE_3_ARGS
3645ENDPROC iemAImpl_ %+ %1 %+ _u128
3646%endmacro
3647
3648IEMIMPL_MEDIA_F2 pshufb, 1
3649IEMIMPL_MEDIA_F2 pand, 1
3650IEMIMPL_MEDIA_F2 pandn, 1
3651IEMIMPL_MEDIA_F2 por, 1
3652IEMIMPL_MEDIA_F2 pxor, 1
3653IEMIMPL_MEDIA_F2 pcmpeqb, 1
3654IEMIMPL_MEDIA_F2 pcmpeqw, 1
3655IEMIMPL_MEDIA_F2 pcmpeqd, 1
3656IEMIMPL_MEDIA_F2 pcmpeqq, 0
3657IEMIMPL_MEDIA_F2 pcmpgtb, 1
3658IEMIMPL_MEDIA_F2 pcmpgtw, 1
3659IEMIMPL_MEDIA_F2 pcmpgtd, 1
3660IEMIMPL_MEDIA_F2 pcmpgtq, 0
3661IEMIMPL_MEDIA_F2 paddb, 1
3662IEMIMPL_MEDIA_F2 paddw, 1
3663IEMIMPL_MEDIA_F2 paddd, 1
3664IEMIMPL_MEDIA_F2 paddq, 1
3665IEMIMPL_MEDIA_F2 paddsb, 1
3666IEMIMPL_MEDIA_F2 paddsw, 1
3667IEMIMPL_MEDIA_F2 paddusb, 1
3668IEMIMPL_MEDIA_F2 paddusw, 1
3669IEMIMPL_MEDIA_F2 psubb, 1
3670IEMIMPL_MEDIA_F2 psubw, 1
3671IEMIMPL_MEDIA_F2 psubd, 1
3672IEMIMPL_MEDIA_F2 psubq, 1
3673IEMIMPL_MEDIA_F2 psubsb, 1
3674IEMIMPL_MEDIA_F2 psubsw, 1
3675IEMIMPL_MEDIA_F2 psubusb, 1
3676IEMIMPL_MEDIA_F2 psubusw, 1
3677IEMIMPL_MEDIA_F2 pmullw, 1
3678IEMIMPL_MEDIA_F2 pmulld, 0
3679IEMIMPL_MEDIA_F2 pmulhw, 1
3680IEMIMPL_MEDIA_F2 pmaddwd, 1
3681IEMIMPL_MEDIA_F2 pminub, 1
3682IEMIMPL_MEDIA_F2 pminuw, 0
3683IEMIMPL_MEDIA_F2 pminud, 0
3684IEMIMPL_MEDIA_F2 pminsb, 0
3685IEMIMPL_MEDIA_F2 pminsw, 1
3686IEMIMPL_MEDIA_F2 pminsd, 0
3687IEMIMPL_MEDIA_F2 pmaxub, 1
3688IEMIMPL_MEDIA_F2 pmaxuw, 0
3689IEMIMPL_MEDIA_F2 pmaxud, 0
3690IEMIMPL_MEDIA_F2 pmaxsb, 0
3691IEMIMPL_MEDIA_F2 pmaxsw, 1
3692IEMIMPL_MEDIA_F2 pmaxsd, 0
3693IEMIMPL_MEDIA_F2 pabsb, 1
3694IEMIMPL_MEDIA_F2 pabsw, 1
3695IEMIMPL_MEDIA_F2 pabsd, 1
3696IEMIMPL_MEDIA_F2 psignb, 1
3697IEMIMPL_MEDIA_F2 psignw, 1
3698IEMIMPL_MEDIA_F2 psignd, 1
3699IEMIMPL_MEDIA_F2 phaddw, 1
3700IEMIMPL_MEDIA_F2 phaddd, 1
3701IEMIMPL_MEDIA_F2 phsubw, 1
3702IEMIMPL_MEDIA_F2 phsubd, 1
3703IEMIMPL_MEDIA_F2 phaddsw, 1
3704IEMIMPL_MEDIA_F2 phsubsw, 1
3705IEMIMPL_MEDIA_F2 pmaddubsw, 1
3706IEMIMPL_MEDIA_F2 pmulhrsw, 1
3707IEMIMPL_MEDIA_F2 pmuludq, 1
3708
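; Illustration (not assembled): IEMIMPL_MEDIA_F2 pxor, 1 yields both an MMX and
; an SSE worker; the SSE one is essentially
;
;       BEGINPROC_FASTCALL iemAImpl_pxor_u128, 12
;               PROLOGUE_3_ARGS
;               movdqu  xmm0, [A1]              ; load the destination operand
;               movdqu  xmm1, [A2]              ; load the source operand
;               pxor    xmm0, xmm1
;               movdqu  [A1], xmm0              ; write the result back
;               EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_pxor_u128
;
; A0 (the FXSAVE state pointer) is not touched here since the SSE/MMX
; prologue/epilogue macros are still empty placeholders (see the @todo notes
; above).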
3709
3710;;
3711; Media instruction working on two full sized registers, but no FXSAVE state argument.
3712;
3713; @param 1 The instruction
3714; @param 2 Whether there is an MMX variant (1) or not (0).
3715;
3716; @param A0 Pointer to the first media register size operand (input/output).
3717; @param A1 Pointer to the second media register size operand (input).
3718;
3719%macro IEMIMPL_MEDIA_OPT_F2 2
3720%if %2 != 0
3721BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3722 PROLOGUE_2_ARGS
3723 IEMIMPL_MMX_PROLOGUE
3724
3725 movq mm0, [A0]
3726 movq mm1, [A1]
3727 %1 mm0, mm1
3728 movq [A0], mm0
3729
3730 IEMIMPL_MMX_EPILOGUE
3731 EPILOGUE_2_ARGS
3732ENDPROC iemAImpl_ %+ %1 %+ _u64
3733%endif
3734
3735BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3736 PROLOGUE_2_ARGS
3737 IEMIMPL_SSE_PROLOGUE
3738
3739 movdqu xmm0, [A0]
3740 movdqu xmm1, [A1]
3741 %1 xmm0, xmm1
3742 movdqu [A0], xmm0
3743
3744 IEMIMPL_SSE_EPILOGUE
3745 EPILOGUE_2_ARGS
3746ENDPROC iemAImpl_ %+ %1 %+ _u128
3747%endmacro
3748
3749IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3750IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3751IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3752IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3753IEMIMPL_MEDIA_OPT_F2 psllw, 1
3754IEMIMPL_MEDIA_OPT_F2 pslld, 1
3755IEMIMPL_MEDIA_OPT_F2 psllq, 1
3756IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3757IEMIMPL_MEDIA_OPT_F2 psrld, 1
3758IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3759IEMIMPL_MEDIA_OPT_F2 psraw, 1
3760IEMIMPL_MEDIA_OPT_F2 psrad, 1
3761IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3762IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3763IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3764IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3765IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3766IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3767IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3768IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3769IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3770IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3771IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3772IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3773IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3774IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3775IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3776IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3777IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3778IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3779IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3780IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3781
3782;;
3783; Media instruction working on one full sized and one half sized register (lower half).
3784;
3785; @param 1 The instruction
3786; @param 2 1 if MMX is included, 0 if not.
3787;
3788; @param A0 Pointer to the first full sized media register operand (input/output).
3789; @param A1 Pointer to the second half sized media register operand (input).
3790;
3791%macro IEMIMPL_MEDIA_F1L1 2
3792 %if %2 != 0
3793BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3794 PROLOGUE_2_ARGS
3795 IEMIMPL_MMX_PROLOGUE
3796
3797 movq mm0, [A0]
3798 movq mm1, [A1]
3799 %1 mm0, mm1
3800 movq [A0], mm0
3801
3802 IEMIMPL_MMX_EPILOGUE
3803 EPILOGUE_2_ARGS
3804ENDPROC iemAImpl_ %+ %1 %+ _u64
3805 %endif
3806
3807BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3808 PROLOGUE_2_ARGS
3809 IEMIMPL_SSE_PROLOGUE
3810
3811 movdqu xmm0, [A0]
3812 movdqu xmm1, [A1]
3813 %1 xmm0, xmm1
3814 movdqu [A0], xmm0
3815
3816 IEMIMPL_SSE_EPILOGUE
3817 EPILOGUE_2_ARGS
3818ENDPROC iemAImpl_ %+ %1 %+ _u128
3819%endmacro
3820
3821IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3822IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3823IEMIMPL_MEDIA_F1L1 punpckldq, 1
3824IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3825
3826
3827;;
3828; Media instruction working on two half sized input registers (lower half) and a full sized
3829; destination register (vpunpckl*).
3830;
3831; @param 1 The instruction
3832;
3833; @param A0 Pointer to the destination register (full sized, output only).
3834; @param A1 Pointer to the first full sized media source register operand, where we
3835; will only use the lower half as input - but we'll be loading it in full.
3836; @param A2 Pointer to the second full sized media source register operand, where we
3837; will only use the lower half as input - but we'll be loading it in full.
3838;
3839%macro IEMIMPL_MEDIA_F1L1L1 1
3840BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3841 PROLOGUE_3_ARGS
3842 IEMIMPL_AVX_PROLOGUE
3843
3844 vmovdqu xmm0, [A1]
3845 vmovdqu xmm1, [A2]
3846 %1 xmm0, xmm0, xmm1
3847 vmovdqu [A0], xmm0
3848
3849 IEMIMPL_AVX_EPILOGUE
3850 EPILOGUE_3_ARGS
3851ENDPROC iemAImpl_ %+ %1 %+ _u128
3852
3853BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3854 PROLOGUE_3_ARGS
3855 IEMIMPL_AVX_PROLOGUE
3856
3857 vmovdqu ymm0, [A1]
3858 vmovdqu ymm1, [A2]
3859 %1 ymm0, ymm0, ymm1
3860 vmovdqu [A0], ymm0
3861
3862 IEMIMPL_AVX_EPILOGUE
3863 EPILOGUE_3_ARGS
3864ENDPROC iemAImpl_ %+ %1 %+ _u256
3865%endmacro
3866
3867IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3868IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3869IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3870IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3871
3872
3873;;
3874; Media instruction working on one full sized and one half sized register (high half).
3875;
3876; @param 1 The instruction
3877; @param 2 1 if MMX is included, 0 if not.
3878;
3879; @param A0 Pointer to the first full sized media register operand (input/output).
3880; @param A1 Pointer to the second full sized media register operand, where we
3881; will only use the upper half as input - but we'll load it in full.
3882;
3883%macro IEMIMPL_MEDIA_F1H1 2
3884IEMIMPL_MEDIA_F1L1 %1, %2
3885%endmacro
3886
3887IEMIMPL_MEDIA_F1L1 punpckhbw, 1
3888IEMIMPL_MEDIA_F1L1 punpckhwd, 1
3889IEMIMPL_MEDIA_F1L1 punpckhdq, 1
3890IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3891
3892
3893;;
3894; Media instruction working on two half sized input registers (high half) and a full sized
3895; destination register (vpunpckh*).
3896;
3897; @param 1 The instruction
3898;
3899; @param A0 Pointer to the destination register (full sized, output only).
3900; @param A1 Pointer to the first full sized media source register operand, where we
3901; will only use the upper half as input - but we'll be loading it in full.
3902; @param A2 Pointer to the second full sized media source register operand, where we
3903; will only use the upper half as input - but we'll be loading it in full.
3904;
3905%macro IEMIMPL_MEDIA_F1H1H1 1
3906IEMIMPL_MEDIA_F1L1L1 %1
3907%endmacro
3908
3909IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
3910IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
3911IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
3912IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3913
3914
3915;
3916; Shufflers with evil 8-bit immediates.
3917;
3918
3919BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
3920 PROLOGUE_3_ARGS
3921 IEMIMPL_MMX_PROLOGUE
3922
3923 movq mm1, [A1]
3924 movq mm0, mm1 ; paranoia!
3925 lea T1, [.imm0 xWrtRIP]
3926 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3927 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
3928 %else
3929 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
3930 %endif
3931 lea T1, [T1 + T0]
3932 IBT_NOTRACK
3933 call T1
3934 movq [A0], mm0
3935
3936 IEMIMPL_MMX_EPILOGUE
3937 EPILOGUE_3_ARGS
3938%assign bImm 0
3939%rep 256
3940.imm %+ bImm:
3941 IBT_ENDBRxx_WITHOUT_NOTRACK
3942 pshufw mm0, mm1, bImm
3943 ret
3944 %assign bImm bImm + 1
3945%endrep
3946.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
3947ENDPROC iemAImpl_pshufw_u64
3948
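; How the immediate dispatch above works: the shuffle order is an 8-bit
; immediate encoded in the instruction itself, so it cannot be supplied at
; runtime.  The %rep block therefore emits 256 small stubs, one per possible
; immediate value:
;
;       .imm0:  pshufw  mm0, mm1, 0
;               ret
;       .imm1:  pshufw  mm0, mm1, 1
;               ret
;       ...
;
; and the dispatcher computes the stub address as .imm0 + bImm * cbStub, where
; cbStub is 5 bytes (pshufw + ret) or 9 bytes when IBT branch protection adds
; an ENDBR to every stub - the two LEA forms above are simply that multiply.
; The SSE/AVX shuffle and shift helpers below use the same pattern with 6 or
; 10 byte stubs.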
3949
3950%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3951BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
3952 PROLOGUE_3_ARGS
3953 IEMIMPL_SSE_PROLOGUE
3954
3955 movdqu xmm1, [A1]
3956 movdqu xmm0, xmm1 ; paranoia!
3957 lea T1, [.imm0 xWrtRIP]
3958 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3959 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
3960 %else
3961 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
3962 %endif
3963 lea T1, [T1 + T0*2]
3964 IBT_NOTRACK
3965 call T1
3966 movdqu [A0], xmm0
3967
3968 IEMIMPL_SSE_EPILOGUE
3969 EPILOGUE_3_ARGS
3970
3971 %assign bImm 0
3972 %rep 256
3973.imm %+ bImm:
3974 IBT_ENDBRxx_WITHOUT_NOTRACK
3975 %1 xmm0, xmm1, bImm
3976 ret
3977 %assign bImm bImm + 1
3978 %endrep
3979.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
3980ENDPROC iemAImpl_ %+ %1 %+ _u128
3981%endmacro
3982
3983IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3984IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3985IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3986
3987
3988%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
3989BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
3990 PROLOGUE_3_ARGS
3991 IEMIMPL_SSE_PROLOGUE
3992
3993 vmovdqu ymm1, [A1]
3994 vmovdqu ymm0, ymm1 ; paranoia!
3995 lea T1, [.imm0 xWrtRIP]
3996 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3997 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
3998 %else
3999 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4000 %endif
4001 lea T1, [T1 + T0*2]
4002 IBT_NOTRACK
4003 call T1
4004 vmovdqu [A0], ymm0
4005
4006 IEMIMPL_SSE_EPILOGUE
4007 EPILOGUE_3_ARGS
4008 %assign bImm 0
4009 %rep 256
4010.imm %+ bImm:
4011 IBT_ENDBRxx_WITHOUT_NOTRACK
4012 %1 ymm0, ymm1, bImm
4013 ret
4014 %assign bImm bImm + 1
4015 %endrep
4016.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4017ENDPROC iemAImpl_ %+ %1 %+ _u256
4018%endmacro
4019
4020IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4021IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4022IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4023
4024
4025;
4026; Shifts with evil 8-bit immediates.
4027;
4028
4029%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4031 PROLOGUE_2_ARGS
4032 IEMIMPL_MMX_PROLOGUE
4033
4034 movq mm0, [A0]
4035 lea T1, [.imm0 xWrtRIP]
4036 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4037 lea T0, [A1 + A1*8] ; sizeof(psXX+ret) == 9
4038 %else
4039 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4040 %endif
4041 lea T1, [T1 + T0]
4042 IBT_NOTRACK
4043 call T1
4044 movq [A0], mm0
4045
4046 IEMIMPL_MMX_EPILOGUE
4047 EPILOGUE_2_ARGS
4048%assign bImm 0
4049%rep 256
4050.imm %+ bImm:
4051 IBT_ENDBRxx_WITHOUT_NOTRACK
4052 %1 mm0, bImm
4053 ret
4054 %assign bImm bImm + 1
4055%endrep
4056.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4057ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4058%endmacro
4059
4060IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4061IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4062IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4063IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4064IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4065IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4066IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4067IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4068
4069
4070%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4071BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4072 PROLOGUE_2_ARGS
4073 IEMIMPL_SSE_PROLOGUE
4074
4075 movdqu xmm0, [A0]
4076 lea T1, [.imm0 xWrtRIP]
4077 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4078 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
4079 %else
4080 lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
4081 %endif
4082 lea T1, [T1 + T0*2]
4083 IBT_NOTRACK
4084 call T1
4085 movdqu [A0], xmm0
4086
4087 IEMIMPL_SSE_EPILOGUE
4088 EPILOGUE_2_ARGS
4089 %assign bImm 0
4090 %rep 256
4091.imm %+ bImm:
4092 IBT_ENDBRxx_WITHOUT_NOTRACK
4093 %1 xmm0, bImm
4094 ret
4095 %assign bImm bImm + 1
4096 %endrep
4097.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4098ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4099%endmacro
4100
4101IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4102IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4103IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4104IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4105IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4106IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4107IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4108IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4109IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4110IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4111
4112
4113;
4114; Move byte mask.
4115;
4116
4117BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4118 PROLOGUE_2_ARGS
4119 IEMIMPL_MMX_PROLOGUE
4120
4121 movq mm1, [A1]
4122 pmovmskb T0, mm1
4123 mov [A0], T0
4124%ifdef RT_ARCH_X86
4125 mov dword [A0 + 4], 0
4126%endif
4127 IEMIMPL_MMX_EPILOGUE
4128 EPILOGUE_2_ARGS
4129ENDPROC iemAImpl_pmovmskb_u64
4130
4131BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4132 PROLOGUE_2_ARGS
4133 IEMIMPL_SSE_PROLOGUE
4134
4135 movdqu xmm1, [A1]
4136 pmovmskb T0, xmm1
4137 mov [A0], T0
4138%ifdef RT_ARCH_X86
4139 mov dword [A0 + 4], 0
4140%endif
4141 IEMIMPL_SSE_EPILOGUE
4142 EPILOGUE_2_ARGS
4143ENDPROC iemAImpl_pmovmskb_u128
4144
4145BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4146 PROLOGUE_2_ARGS
4147 IEMIMPL_AVX_PROLOGUE
4148
4149 vmovdqu ymm1, [A1]
4150 vpmovmskb T0, ymm1
4151 mov [A0], T0
4152%ifdef RT_ARCH_X86
4153 mov dword [A0 + 4], 0
4154%endif
4155 IEMIMPL_AVX_EPILOGUE
4156 EPILOGUE_2_ARGS
4157ENDPROC iemAImpl_vpmovmskb_u256
4158
4159
4160;;
4161; Media instruction working on two full sized source registers and one destination (AVX).
4162;
4163; @param 1 The instruction
4164;
4165; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4166; @param A1 Pointer to the destination media register size operand (output).
4167; @param A2 Pointer to the first source media register size operand (input).
4168; @param A3 Pointer to the second source media register size operand (input).
4169;
4170%macro IEMIMPL_MEDIA_F3 1
4171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4172 PROLOGUE_4_ARGS
4173 IEMIMPL_AVX_PROLOGUE
4174
4175 vmovdqu xmm0, [A2]
4176 vmovdqu xmm1, [A3]
4177 %1 xmm0, xmm0, xmm1
4178 vmovdqu [A1], xmm0
4179
4180 IEMIMPL_AVX_EPILOGUE
4181 EPILOGUE_4_ARGS
4182ENDPROC iemAImpl_ %+ %1 %+ _u128
4183
4184BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4185 PROLOGUE_4_ARGS
4186 IEMIMPL_AVX_PROLOGUE
4187
4188 vmovdqu ymm0, [A2]
4189 vmovdqu ymm1, [A3]
4190 %1 ymm0, ymm0, ymm1
4191 vmovdqu [A1], ymm0
4192
4193 IEMIMPL_AVX_EPILOGUE
4194 EPILOGUE_4_ARGS
4195ENDPROC iemAImpl_ %+ %1 %+ _u256
4196%endmacro
4197
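;
; For illustration: an instantiation like 'IEMIMPL_MEDIA_F3 vpand' below
; expands to the workers iemAImpl_vpand_u128 and iemAImpl_vpand_u256, which
; load the sources from [A2] and [A3], execute e.g. 'vpand xmm0, xmm0, xmm1',
; and store the result to [A1]; the A0 state pointer is not needed by these
; particular instructions.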
4198IEMIMPL_MEDIA_F3 vpshufb
4199IEMIMPL_MEDIA_F3 vpand
4200IEMIMPL_MEDIA_F3 vpminub
4201IEMIMPL_MEDIA_F3 vpminuw
4202IEMIMPL_MEDIA_F3 vpminud
4203IEMIMPL_MEDIA_F3 vpminsb
4204IEMIMPL_MEDIA_F3 vpminsw
4205IEMIMPL_MEDIA_F3 vpminsd
4206IEMIMPL_MEDIA_F3 vpmaxub
4207IEMIMPL_MEDIA_F3 vpmaxuw
4208IEMIMPL_MEDIA_F3 vpmaxud
4209IEMIMPL_MEDIA_F3 vpmaxsb
4210IEMIMPL_MEDIA_F3 vpmaxsw
4211IEMIMPL_MEDIA_F3 vpmaxsd
4212IEMIMPL_MEDIA_F3 vpandn
4213IEMIMPL_MEDIA_F3 vpor
4214IEMIMPL_MEDIA_F3 vpxor
4215IEMIMPL_MEDIA_F3 vpcmpeqb
4216IEMIMPL_MEDIA_F3 vpcmpeqw
4217IEMIMPL_MEDIA_F3 vpcmpeqd
4218IEMIMPL_MEDIA_F3 vpcmpeqq
4219IEMIMPL_MEDIA_F3 vpcmpgtb
4220IEMIMPL_MEDIA_F3 vpcmpgtw
4221IEMIMPL_MEDIA_F3 vpcmpgtd
4222IEMIMPL_MEDIA_F3 vpcmpgtq
4223IEMIMPL_MEDIA_F3 vpaddb
4224IEMIMPL_MEDIA_F3 vpaddw
4225IEMIMPL_MEDIA_F3 vpaddd
4226IEMIMPL_MEDIA_F3 vpaddq
4227IEMIMPL_MEDIA_F3 vpsubb
4228IEMIMPL_MEDIA_F3 vpsubw
4229IEMIMPL_MEDIA_F3 vpsubd
4230IEMIMPL_MEDIA_F3 vpsubq
4231
4232
4233;;
4234; Media instruction working on two full sized source registers and one destination (AVX),
4235; but no XSAVE state pointer argument.
4236;
4237; @param 1 The instruction
4238;
4239; @param A0 Pointer to the destination media register size operand (output).
4240; @param A1 Pointer to the first source media register size operand (input).
4241; @param A2 Pointer to the second source media register size operand (input).
4242;
4243%macro IEMIMPL_MEDIA_OPT_F3 1
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_AVX_PROLOGUE
4247
4248 vmovdqu xmm0, [A1]
4249 vmovdqu xmm1, [A2]
4250 %1 xmm0, xmm0, xmm1
4251 vmovdqu [A0], xmm0
4252
4253 IEMIMPL_AVX_EPILOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256
4257BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4258 PROLOGUE_3_ARGS
4259 IEMIMPL_AVX_PROLOGUE
4260
4261 vmovdqu ymm0, [A1]
4262 vmovdqu ymm1, [A2]
4263 %1 ymm0, ymm0, ymm1
4264 vmovdqu [A0], ymm0
4265
4266 IEMIMPL_AVX_EPILOGUE
4267 EPILOGUE_3_ARGS
4268ENDPROC iemAImpl_ %+ %1 %+ _u256
4269%endmacro
4270
4271IEMIMPL_MEDIA_OPT_F3 vpacksswb
4272IEMIMPL_MEDIA_OPT_F3 vpackssdw
4273IEMIMPL_MEDIA_OPT_F3 vpackuswb
4274IEMIMPL_MEDIA_OPT_F3 vpackusdw
4275IEMIMPL_MEDIA_OPT_F3 vpmullw
4276IEMIMPL_MEDIA_OPT_F3 vpmulld
4277IEMIMPL_MEDIA_OPT_F3 vpmulhw
4278IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4279IEMIMPL_MEDIA_OPT_F3 vpavgb
4280IEMIMPL_MEDIA_OPT_F3 vpavgw
4281IEMIMPL_MEDIA_OPT_F3 vpsignb
4282IEMIMPL_MEDIA_OPT_F3 vpsignw
4283IEMIMPL_MEDIA_OPT_F3 vpsignd
4284IEMIMPL_MEDIA_OPT_F3 vphaddw
4285IEMIMPL_MEDIA_OPT_F3 vphaddd
4286IEMIMPL_MEDIA_OPT_F3 vphsubw
4287IEMIMPL_MEDIA_OPT_F3 vphsubd
4288IEMIMPL_MEDIA_OPT_F3 vphaddsw
4289IEMIMPL_MEDIA_OPT_F3 vphsubsw
4290IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4291IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4292IEMIMPL_MEDIA_OPT_F3 vpsadbw
4293IEMIMPL_MEDIA_OPT_F3 vpmuldq
4294IEMIMPL_MEDIA_OPT_F3 vpmuludq
4295IEMIMPL_MEDIA_OPT_F3 vunpcklps
4296IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4297IEMIMPL_MEDIA_OPT_F3 vunpckhps
4298IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4299
4300;;
4301; Media instruction working on one full sized source register and one destination (AVX),
4302; but no XSAVE state pointer argument.
4303;
4304; @param 1 The instruction
4305; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4306;
4307; @param A0 Pointer to the destination media register size operand (output).
4308; @param A1 Pointer to the source media register size operand (input).
4309;
4310%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4311BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4312 PROLOGUE_2_ARGS
4313 IEMIMPL_AVX_PROLOGUE
4314
4315 vmovdqu xmm0, [A1]
4316 %1 xmm0, xmm0
4317 vmovdqu [A0], xmm0
4318
4319 IEMIMPL_AVX_EPILOGUE
4320 EPILOGUE_2_ARGS
4321ENDPROC iemAImpl_ %+ %1 %+ _u128
4322
4323 %if %2 == 1
4324BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4325 PROLOGUE_2_ARGS
4326 IEMIMPL_AVX_PROLOGUE
4327
4328 vmovdqu ymm0, [A1]
4329 %1 ymm0, ymm0
4330 vmovdqu [A0], ymm0
4331
4332 IEMIMPL_AVX_EPILOGUE
4333 EPILOGUE_2_ARGS
4334ENDPROC iemAImpl_ %+ %1 %+ _u256
4335 %endif
4336%endmacro
4337
4338IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4339IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4340IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4341IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4342
4343
4344;
4345; The SSE 4.2 crc32
4346;
4347; @param A0 Pointer to the 32-bit destination (the CRC32 accumulator).
4348; @param A1 The source operand, sized according to the suffix.
4349;
4350BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4351 PROLOGUE_2_ARGS
4352
4353 mov T0_32, [A0]
4354 crc32 T0_32, A1_8
4355 mov [A0], T0_32
4356
4357 EPILOGUE_2_ARGS
4358ENDPROC iemAImpl_crc32_u8
4359
4360BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
4361 PROLOGUE_2_ARGS
4362
4363 mov T0_32, [A0]
4364 crc32 T0_32, A1_16
4365 mov [A0], T0_32
4366
4367 EPILOGUE_2_ARGS
4368ENDPROC iemAImpl_crc32_u16
4369
4370BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
4371 PROLOGUE_2_ARGS
4372
4373 mov T0_32, [A0]
4374 crc32 T0_32, A1_32
4375 mov [A0], T0_32
4376
4377 EPILOGUE_2_ARGS
4378ENDPROC iemAImpl_crc32_u32
4379
4380%ifdef RT_ARCH_AMD64
4381BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4382 PROLOGUE_2_ARGS
4383
4384 mov T0_32, [A0]
4385 crc32 T0, A1
4386 mov [A0], T0_32
4387
4388 EPILOGUE_2_ARGS
4389ENDPROC iemAImpl_crc32_u64
4390%endif
4391
4392
4393;
4394; PTEST (SSE 4.1)
4395;
4396; @param A0 Pointer to the first source operand (aka readonly destination).
4397; @param A1 Pointer to the second source operand.
4398; @param A2 Pointer to the EFLAGS register.
4399;
4400BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4401 PROLOGUE_3_ARGS
4402 IEMIMPL_SSE_PROLOGUE
4403
4404 movdqu xmm0, [A0]
4405 movdqu xmm1, [A1]
4406 ptest xmm0, xmm1
4407 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4408
4409 IEMIMPL_SSE_EPILOGUE
4410 EPILOGUE_3_ARGS
4411ENDPROC iemAImpl_ptest_u128
4412
4413BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
4414 PROLOGUE_3_ARGS
4415 IEMIMPL_SSE_PROLOGUE
4416
4417 vmovdqu ymm0, [A0]
4418 vmovdqu ymm1, [A1]
4419 vptest ymm0, ymm1
4420 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4421
4422 IEMIMPL_SSE_EPILOGUE
4423 EPILOGUE_3_ARGS
4424ENDPROC iemAImpl_vptest_u256
4425
4426
4427;;
4428; Template for the [v]pmov{s,z}x* instructions
4429;
4430; @param 1 The instruction
4431;
4432; @param A0 Pointer to the destination media register size operand (output).
4433; @param A1 The source operand value for the 128-bit variants, or pointer to it for the 256-bit variant (input).
4434;
4435%macro IEMIMPL_V_PMOV_SZ_X 1
4436BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4437 PROLOGUE_2_ARGS
4438 IEMIMPL_SSE_PROLOGUE
4439
4440 movd xmm0, A1
4441 %1 xmm0, xmm0
4442 vmovdqu [A0], xmm0
4443
4444 IEMIMPL_SSE_EPILOGUE
4445 EPILOGUE_2_ARGS
4446ENDPROC iemAImpl_ %+ %1 %+ _u128
4447
4448BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4449 PROLOGUE_2_ARGS
4450 IEMIMPL_AVX_PROLOGUE
4451
4452 movd xmm0, A1
4453 v %+ %1 xmm0, xmm0
4454 vmovdqu [A0], xmm0
4455
4456 IEMIMPL_AVX_EPILOGUE
4457 EPILOGUE_2_ARGS
4458ENDPROC iemAImpl_v %+ %1 %+ _u128
4459
4460BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4461 PROLOGUE_2_ARGS
4462 IEMIMPL_AVX_PROLOGUE
4463
4464 movdqu xmm0, [A1]
4465 v %+ %1 ymm0, xmm0
4466 vmovdqu [A0], ymm0
4467
4468 IEMIMPL_AVX_EPILOGUE
4469 EPILOGUE_2_ARGS
4470ENDPROC iemAImpl_v %+ %1 %+ _u256
4471%endmacro
4472
4473IEMIMPL_V_PMOV_SZ_X pmovsxbw
4474IEMIMPL_V_PMOV_SZ_X pmovsxbd
4475IEMIMPL_V_PMOV_SZ_X pmovsxbq
4476IEMIMPL_V_PMOV_SZ_X pmovsxwd
4477IEMIMPL_V_PMOV_SZ_X pmovsxwq
4478IEMIMPL_V_PMOV_SZ_X pmovsxdq
4479
4480IEMIMPL_V_PMOV_SZ_X pmovzxbw
4481IEMIMPL_V_PMOV_SZ_X pmovzxbd
4482IEMIMPL_V_PMOV_SZ_X pmovzxbq
4483IEMIMPL_V_PMOV_SZ_X pmovzxwd
4484IEMIMPL_V_PMOV_SZ_X pmovzxwq
4485IEMIMPL_V_PMOV_SZ_X pmovzxdq
4486
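;
; For example, 'IEMIMPL_V_PMOV_SZ_X pmovsxbw' yields iemAImpl_pmovsxbw_u128,
; iemAImpl_vpmovsxbw_u128 and iemAImpl_vpmovsxbw_u256.  The 128-bit workers
; receive the source value directly in A1, while the 256-bit worker loads a
; 128-bit source from [A1] and widens it (vpmovsxbw ymm0, xmm0 sign extends
; 16 bytes into 16 words, for instance).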
4487
4488;;
4489; Need to move this as well somewhere better?
4490;
4491struc IEMSSERESULT
4492 .uResult resd 4
4493 .MXCSR resd 1
4494endstruc
4495
4496
4497;;
4498; Need to move this as well somewhere better?
4499;
4500struc IEMAVX128RESULT
4501 .uResult resd 4
4502 .MXCSR resd 1
4503endstruc
4504
4505
4506;;
4507; Need to move this as well somewhere better?
4508;
4509struc IEMAVX256RESULT
4510 .uResult resd 8
4511 .MXCSR resd 1
4512endstruc
4513
4514
4515;;
4516; Initialize the SSE MXCSR register using the guest value partially to
4517; account for rounding mode.
4518;
4519; @uses 4 bytes of stack to save the original value, T0.
4520; @param 1 Expression giving the address of the FXSTATE of the guest.
4521;
4522%macro SSE_LD_FXSTATE_MXCSR 1
4523 sub xSP, 4
4524
4525 stmxcsr [xSP]
4526 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4527 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4528 or T0_32, X86_MXCSR_XCPT_MASK
4529 sub xSP, 4
4530 mov [xSP], T0_32
4531 ldmxcsr [xSP]
4532 add xSP, 4
4533%endmacro
4534
4535
4536;;
4537; Restores the SSE MXCSR register with the original value.
4538;
4539; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4540; @param 1 Expression giving the address where to return the MXCSR value.
4541; @param 2 Expression giving the address of the FXSTATE of the guest.
4542;
4543; @note Restores the stack pointer.
4544;
4545%macro SSE_ST_FXSTATE_MXCSR 2
4546 sub xSP, 4
4547 stmxcsr [xSP]
4548 mov T0_32, [xSP]
4549 add xSP, 4
4550 ; Merge the status bits into the original MXCSR value.
4551 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4552 and T0_32, X86_MXCSR_XCPT_FLAGS
4553 or T0_32, T1_32
4554 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4555
4556 ldmxcsr [xSP]
4557 add xSP, 4
4558%endmacro
4559
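;
; Taken together, SSE_LD_FXSTATE_MXCSR and SSE_ST_FXSTATE_MXCSR sandbox MXCSR
; around the emulated instruction: the host MXCSR is saved on the stack, the
; guest rounding mode / FZ / DAZ bits are loaded with all exceptions masked,
; and afterwards the newly raised exception flags are merged back into the
; guest value, roughly:
;
;       result.MXCSR = guest.MXCSR | (host MXCSR afterwards & X86_MXCSR_XCPT_FLAGS)
;
; before the saved host MXCSR is restored again.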
4560
4561;;
4562; Initialize the SSE MXCSR register using the guest value partially to
4563; account for rounding mode.
4564;
4565; @uses 4 bytes of stack to save the original value.
4566; @param 1 Expression giving the address of the FXSTATE of the guest.
4567;
4568%macro AVX_LD_XSAVEAREA_MXCSR 1
4569 sub xSP, 4
4570
4571 stmxcsr [xSP]
4572 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4573 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4574 sub xSP, 4
4575 mov [xSP], T0_32
4576 ldmxcsr [xSP]
4577 add xSP, 4
4578%endmacro
4579
4580
4581;;
4582; Restores the AVX128 MXCSR register with the original value.
4583;
4584; @param 1 Expression giving the address where to return the MXCSR value.
4585;
4586; @note Restores the stack pointer.
4587;
4588%macro AVX128_ST_XSAVEAREA_MXCSR 1
4589 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4590
4591 ldmxcsr [xSP]
4592 add xSP, 4
4593%endmacro
4594
4595
4596;;
4597; Restores the AVX256 MXCSR register with the original value.
4598;
4599; @param 1 Expression giving the address where to return the MXCSR value.
4600;
4601; @note Restores the stack pointer.
4602;
4603%macro AVX256_ST_XSAVEAREA_MXCSR 1
4604 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4605
4606 ldmxcsr [xSP]
4607 add xSP, 4
4608%endmacro
4609
4610
4611;;
4612; Floating point instruction working on two full sized registers.
4613;
4614; @param 1 The instruction
4615; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4616;
4617; @param A0 FPU context (FXSTATE or XSAVEAREA).
4618; @param A1 Where to return the result including the MXCSR value.
4619; @param A2 Pointer to the first media register size operand (input/output).
4620; @param A3 Pointer to the second media register size operand (input).
4621;
4622%macro IEMIMPL_FP_F2 2
4623BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4624 PROLOGUE_4_ARGS
4625 IEMIMPL_SSE_PROLOGUE
4626 SSE_LD_FXSTATE_MXCSR A0
4627
4628 movdqu xmm0, [A2]
4629 movdqu xmm1, [A3]
4630 %1 xmm0, xmm1
4631 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4632
4633 SSE_ST_FXSTATE_MXCSR A1, A0
4634 IEMIMPL_SSE_EPILOGUE
4635 EPILOGUE_4_ARGS
4636ENDPROC iemAImpl_ %+ %1 %+ _u128
4637
4638 %if %2 == 3
4639BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4640 PROLOGUE_4_ARGS
4641 IEMIMPL_AVX_PROLOGUE
4642 AVX_LD_XSAVEAREA_MXCSR A0
4643
4644 vmovdqu xmm0, [A2]
4645 vmovdqu xmm1, [A3]
4646 v %+ %1 xmm0, xmm0, xmm1
4647 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4648
4649 AVX128_ST_XSAVEAREA_MXCSR A1
4650 IEMIMPL_AVX_EPILOGUE
4651 EPILOGUE_4_ARGS
4652ENDPROC iemAImpl_v %+ %1 %+ _u128
4653
4654BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4655 PROLOGUE_4_ARGS
4656 IEMIMPL_AVX_PROLOGUE
4657 AVX_LD_XSAVEAREA_MXCSR A0
4658
4659 vmovdqu ymm0, [A2]
4660 vmovdqu ymm1, [A3]
4661 v %+ %1 ymm0, ymm0, ymm1
4662 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4663
4664 AVX256_ST_XSAVEAREA_MXCSR A1
4665 IEMIMPL_AVX_EPILOGUE
4666 EPILOGUE_4_ARGS
4667ENDPROC iemAImpl_v %+ %1 %+ _u256
4668 %elif %2 == 2
4669BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4670 PROLOGUE_4_ARGS
4671 IEMIMPL_AVX_PROLOGUE
4672 AVX_LD_XSAVEAREA_MXCSR A0
4673
4674 vmovdqu xmm0, [A2]
4675 vmovdqu xmm1, [A3]
4676 v %+ %1 xmm0, xmm1
4677 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4678
4679 AVX128_ST_XSAVEAREA_MXCSR A1
4680 IEMIMPL_AVX_EPILOGUE
4681 EPILOGUE_4_ARGS
4682ENDPROC iemAImpl_v %+ %1 %+ _u128
4683
4684BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4685 PROLOGUE_4_ARGS
4686 IEMIMPL_AVX_PROLOGUE
4687 AVX_LD_XSAVEAREA_MXCSR A0
4688
4689 vmovdqu ymm0, [A2]
4690 vmovdqu ymm1, [A3]
4691 v %+ %1 ymm0, ymm1
4692 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4693
4694 AVX256_ST_XSAVEAREA_MXCSR A1
4695 IEMIMPL_AVX_EPILOGUE
4696 EPILOGUE_4_ARGS
4697ENDPROC iemAImpl_v %+ %1 %+ _u256
4698 %endif
4699%endmacro
4700
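;
; The second macro parameter selects which AVX form gets generated: e.g.
; 'IEMIMPL_FP_F2 addps, 3' below emits the three operand form
; (vaddps xmm0, xmm0, xmm1), 'IEMIMPL_FP_F2 sqrtps, 2' emits the two operand
; form (vsqrtps xmm0, xmm1), and 0 suppresses the AVX variants entirely.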
4701IEMIMPL_FP_F2 addps, 3
4702IEMIMPL_FP_F2 addpd, 3
4703IEMIMPL_FP_F2 mulps, 3
4704IEMIMPL_FP_F2 mulpd, 3
4705IEMIMPL_FP_F2 subps, 3
4706IEMIMPL_FP_F2 subpd, 3
4707IEMIMPL_FP_F2 minps, 3
4708IEMIMPL_FP_F2 minpd, 3
4709IEMIMPL_FP_F2 divps, 3
4710IEMIMPL_FP_F2 divpd, 3
4711IEMIMPL_FP_F2 maxps, 3
4712IEMIMPL_FP_F2 maxpd, 3
4713IEMIMPL_FP_F2 haddps, 3
4714IEMIMPL_FP_F2 haddpd, 3
4715IEMIMPL_FP_F2 hsubps, 3
4716IEMIMPL_FP_F2 hsubpd, 3
4717IEMIMPL_FP_F2 addsubps, 3
4718IEMIMPL_FP_F2 addsubpd, 3
4719
4720
4721;;
4722; These are actually unary operations, but to keep it simple we treat
4723; them as binary for now, so the output result is always in sync with
4724; the register the result might get written to.
4726IEMIMPL_FP_F2 sqrtps, 2
4727IEMIMPL_FP_F2 rsqrtps, 2
4728IEMIMPL_FP_F2 sqrtpd, 2
4729IEMIMPL_FP_F2 cvtdq2ps, 2
4730IEMIMPL_FP_F2 cvtps2dq, 2
4731IEMIMPL_FP_F2 cvttps2dq, 2
4732IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants missing right now due to register size differences
4733IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants missing right now due to register size differences
4734IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants missing right now due to register size differences
4735
4736
4737;;
4738; Floating point instruction working on a full sized register and a single precision operand.
4739;
4740; @param 1 The instruction
4741;
4742; @param A0 FPU context (FXSTATE or XSAVEAREA).
4743; @param A1 Where to return the result including the MXCSR value.
4744; @param A2 Pointer to the first media register size operand (input/output).
4745; @param A3 Pointer to the second single precision floating point value (input).
4746;
4747%macro IEMIMPL_FP_F2_R32 1
4748BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
4749 PROLOGUE_4_ARGS
4750 IEMIMPL_SSE_PROLOGUE
4751 SSE_LD_FXSTATE_MXCSR A0
4752
4753 movdqu xmm0, [A2]
4754 movd xmm1, [A3]
4755 %1 xmm0, xmm1
4756 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4757
4758 SSE_ST_FXSTATE_MXCSR A1, A0
4759 IEMIMPL_SSE_EPILOGUE
4760 EPILOGUE_4_ARGS
4761ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
4762
4763BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
4764 PROLOGUE_4_ARGS
4765 IEMIMPL_AVX_PROLOGUE
4766 AVX_LD_XSAVEAREA_MXCSR A0
4767
4768 vmovdqu xmm0, [A2]
4769 vmovd xmm1, [A3]
4770 v %+ %1 xmm0, xmm0, xmm1
4771 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4772
4773 AVX128_ST_XSAVEAREA_MXCSR A1
4774 IEMIMPL_AVX_EPILOGUE
4775 EPILOGUE_4_ARGS
4776ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
4777%endmacro
4778
4779IEMIMPL_FP_F2_R32 addss
4780IEMIMPL_FP_F2_R32 mulss
4781IEMIMPL_FP_F2_R32 subss
4782IEMIMPL_FP_F2_R32 minss
4783IEMIMPL_FP_F2_R32 divss
4784IEMIMPL_FP_F2_R32 maxss
4785IEMIMPL_FP_F2_R32 cvtss2sd
4786IEMIMPL_FP_F2_R32 sqrtss
4787IEMIMPL_FP_F2_R32 rsqrtss
4788
4789
4790;;
4791; Floating point instruction working on a full sized register and a double precision operand.
4792;
4793; @param 1 The instruction
4794;
4795; @param A0 FPU context (FXSTATE or XSAVEAREA).
4796; @param A1 Where to return the result including the MXCSR value.
4797; @param A2 Pointer to the first media register size operand (input/output).
4798; @param A3 Pointer to the second double precision floating point value (input).
4799;
4800%macro IEMIMPL_FP_F2_R64 1
4801BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4802 PROLOGUE_4_ARGS
4803 IEMIMPL_SSE_PROLOGUE
4804 SSE_LD_FXSTATE_MXCSR A0
4805
4806 movdqu xmm0, [A2]
4807 movq xmm1, [A3]
4808 %1 xmm0, xmm1
4809 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4810
4811 SSE_ST_FXSTATE_MXCSR A1, A0
4812 IEMIMPL_SSE_EPILOGUE
4813 EPILOGUE_4_ARGS
4814ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
4815
4816BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
4817 PROLOGUE_4_ARGS
4818 IEMIMPL_AVX_PROLOGUE
4819 AVX_LD_XSAVEAREA_MXCSR A0
4820
4821 vmovdqu xmm0, [A2]
4822 vmovq xmm1, [A3]
4823 v %+ %1 xmm0, xmm0, xmm1
4824 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4825
4826 AVX128_ST_XSAVEAREA_MXCSR A1
4827 IEMIMPL_AVX_EPILOGUE
4828 EPILOGUE_4_ARGS
4829ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
4830%endmacro
4831
4832IEMIMPL_FP_F2_R64 addsd
4833IEMIMPL_FP_F2_R64 mulsd
4834IEMIMPL_FP_F2_R64 subsd
4835IEMIMPL_FP_F2_R64 minsd
4836IEMIMPL_FP_F2_R64 divsd
4837IEMIMPL_FP_F2_R64 maxsd
4838IEMIMPL_FP_F2_R64 cvtsd2ss
4839IEMIMPL_FP_F2_R64 sqrtsd
4840
4841
4842;;
4843; Macro for the cvtpd2ps/cvtps2pd instructions.
4844;
4845; @param 1 The instruction name.
4846; @param 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4847;
4848; @param A0 FPU context (FXSTATE or XSAVEAREA).
4849; @param A1 Where to return the result including the MXCSR value.
4850; @param A2 Pointer to the first media register size operand (input/output).
4851; @param A3 Pointer to the second media register size operand (input).
4852;
4853%macro IEMIMPL_CVT_F2 2
4854BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4855 PROLOGUE_4_ARGS
4856 IEMIMPL_SSE_PROLOGUE
4857 SSE_LD_FXSTATE_MXCSR A0
4858
4859 movdqu xmm0, [A2]
4860 movdqu xmm1, [A3]
4861 %1 xmm0, xmm1
4862 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4863
4864 SSE_ST_FXSTATE_MXCSR A1, A0
4865 IEMIMPL_SSE_EPILOGUE
4866 EPILOGUE_4_ARGS
4867ENDPROC iemAImpl_ %+ %1 %+ _u128
4868
4869BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
4870 PROLOGUE_4_ARGS
4871 IEMIMPL_AVX_PROLOGUE
4872 AVX_LD_XSAVEAREA_MXCSR A0
4873
4874 vmovdqu xmm0, [A2]
4875 vmovdqu xmm1, [A3]
4876 v %+ %1 xmm0, xmm1
4877 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4878
4879 AVX128_ST_XSAVEAREA_MXCSR A1
4880 IEMIMPL_AVX_EPILOGUE
4881 EPILOGUE_4_ARGS
4882ENDPROC iemAImpl_v %+ %1 %+ _u128
4883
4884BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
4885 PROLOGUE_4_ARGS
4886 IEMIMPL_AVX_PROLOGUE
4887 AVX_LD_XSAVEAREA_MXCSR A0
4888
4889 vmovdqu ymm0, [A2]
4890 vmovdqu ymm1, [A3]
4891 %if %2 == 0
4892 v %+ %1 xmm0, ymm1
4893 %else
4894 v %+ %1 ymm0, xmm1
4895 %endif
4896 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4897
4898 AVX256_ST_XSAVEAREA_MXCSR A1
4899 IEMIMPL_AVX_EPILOGUE
4900 EPILOGUE_4_ARGS
4901ENDPROC iemAImpl_v %+ %1 %+ _u256
4902%endmacro
4903
4904IEMIMPL_CVT_F2 cvtpd2ps, 0
4905IEMIMPL_CVT_F2 cvtps2pd, 1
4906
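;
; The second parameter above encodes the asymmetric operand widths of the
; 256-bit AVX forms: cvtpd2ps narrows, so its wrapper executes
; 'vcvtpd2ps xmm0, ymm1', while cvtps2pd widens and executes
; 'vcvtps2pd ymm0, xmm1'.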
4907
4908;;
4909; shufps instructions with 8-bit immediates.
4910;
4911; @param A0 Pointer to the destination media register size operand (input/output).
4912; @param A1 Pointer to the first source media register size operand (input).
4913; @param A2 The 8-bit immediate
4914;
4915BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
4916 PROLOGUE_3_ARGS
4917 IEMIMPL_SSE_PROLOGUE
4918
4919 movdqu xmm0, [A0]
4920 movdqu xmm1, [A1]
4921 lea T1, [.imm0 xWrtRIP]
4922 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4923 lea T0, [A2 + A2*4] ; sizeof(endbrxx+shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
4924 %else
4925 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
4926 %endif
4927 lea T1, [T1 + T0*2]
4928 IBT_NOTRACK
4929 call T1
4930 movdqu [A0], xmm0
4931
4932 IEMIMPL_SSE_EPILOGUE
4933 EPILOGUE_3_ARGS
4934 %assign bImm 0
4935 %rep 256
4936.imm %+ bImm:
4937 IBT_ENDBRxx_WITHOUT_NOTRACK
4938 shufps xmm0, xmm1, bImm
4939 ret
4940 int3
4941 %assign bImm bImm + 1
4942 %endrep
4943.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4944ENDPROC iemAImpl_shufps_u128
4945
4946
4947;;
4948; shufpd instruction with 8-bit immediates.
4949;
4950; @param A0 Pointer to the destination media register size operand (input/output).
4951; @param A1 Pointer to the first source media register size operand (input).
4952; @param A2 The 8-bit immediate
4953;
4954BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
4955 PROLOGUE_3_ARGS
4956 IEMIMPL_SSE_PROLOGUE
4957
4958 movdqu xmm0, [A0]
4959 movdqu xmm1, [A1]
4960 lea T1, [.imm0 xWrtRIP]
4961 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4962 lea T0, [A2 + A2*4] ; sizeof(endbrxx+shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4963 %else
4964 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4965 %endif
4966 lea T1, [T1 + T0*2]
4967 IBT_NOTRACK
4968 call T1
4969 movdqu [A0], xmm0
4970
4971 IEMIMPL_SSE_EPILOGUE
4972 EPILOGUE_3_ARGS
4973 %assign bImm 0
4974 %rep 256
4975.imm %+ bImm:
4976 IBT_ENDBRxx_WITHOUT_NOTRACK
4977 shufpd xmm0, xmm1, bImm
4978 ret
4979 %assign bImm bImm + 1
4980 %endrep
4981.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4982ENDPROC iemAImpl_shufpd_u128
4983
4984
4985;;
4986; vshufp{s,d} instructions with 8-bit immediates.
4987;
4988; @param 1 The instruction name.
4989;
4990; @param A0 Pointer to the destination media register size operand (output).
4991; @param A1 Pointer to the first source media register size operand (input).
4992; @param A2 Pointer to the second source media register size operand (input).
4993; @param A3 The 8-bit immediate
4994;
4995%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
4996BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4997 PROLOGUE_4_ARGS
4998 IEMIMPL_AVX_PROLOGUE
4999
5000 movdqu xmm0, [A1]
5001 movdqu xmm1, [A2]
5002 lea T1, [.imm0 xWrtRIP]
5003 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5004 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5005 %else
5006 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5007 %endif
5008 lea T1, [T1 + T0*2]
5009 IBT_NOTRACK
5010 call T1
5011 movdqu [A0], xmm0
5012
5013 IEMIMPL_AVX_EPILOGUE
5014 EPILOGUE_4_ARGS
5015 %assign bImm 0
5016 %rep 256
5017.imm %+ bImm:
5018 IBT_ENDBRxx_WITHOUT_NOTRACK
5019 %1 xmm0, xmm0, xmm1, bImm
5020 ret
5021 %assign bImm bImm + 1
5022 %endrep
5023.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5024ENDPROC iemAImpl_ %+ %1 %+ _u128
5025
5026BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5027 PROLOGUE_4_ARGS
5028 IEMIMPL_AVX_PROLOGUE
5029
5030 vmovdqu ymm0, [A1]
5031 vmovdqu ymm1, [A2]
5032 lea T1, [.imm0 xWrtRIP]
5033 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5034 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5035 %else
5036 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5037 %endif
5038 lea T1, [T1 + T0*2]
5039 IBT_NOTRACK
5040 call T1
5041 vmovdqu [A0], ymm0
5042
5043 IEMIMPL_AVX_EPILOGUE
5044 EPILOGUE_4_ARGS
5045 %assign bImm 0
5046 %rep 256
5047.imm %+ bImm:
5048 IBT_ENDBRxx_WITHOUT_NOTRACK
5049 %1 ymm0, ymm0, ymm1, bImm
5050 ret
5051 %assign bImm bImm + 1
5052 %endrep
5053.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5054ENDPROC iemAImpl_ %+ %1 %+ _u256
5055%endmacro
5056
5057IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5058IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5059
5060
5061;;
5062; One of the [p]blendv{b,ps,pd} variants
5063;
5064; @param 1 The instruction
5065;
5066; @param A0 Pointer to the first media register sized operand (input/output).
5067; @param A1 Pointer to the second media sized value (input).
5068; @param A2 Pointer to the media register sized mask value (input).
5069;
5070%macro IEMIMPL_P_BLEND 1
5071BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5072 PROLOGUE_3_ARGS
5073 IEMIMPL_SSE_PROLOGUE
5074
5075 movdqu xmm0, [A2] ; This is implicit
5076 movdqu xmm1, [A0]
5077 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5078 %1 xmm1, xmm2
5079 movdqu [A0], xmm1
5080
5081 IEMIMPL_SSE_EPILOGUE
5082 EPILOGUE_3_ARGS
5083ENDPROC iemAImpl_ %+ %1 %+ _u128
5084%endmacro
5085
5086IEMIMPL_P_BLEND pblendvb
5087IEMIMPL_P_BLEND blendvps
5088IEMIMPL_P_BLEND blendvpd
5089
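;
; Note: the non-VEX [p]blendv forms use XMM0 as an implicit mask operand,
; which is why the wrapper above loads the mask from [A2] into xmm0 first
; and keeps the destination/source pair in xmm1/xmm2.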
5090
5091;;
5092; One of the v[p]blendv{b,ps,pd} variants
5093;
5094; @param 1 The instruction
5095;
5096; @param A0 Pointer to the first media register sized operand (output).
5097; @param A1 Pointer to the first media register sized operand (input).
5098; @param A2 Pointer to the second media register sized operand (input).
5099; @param A3 Pointer to the media register sized mask value (input).
5100%macro IEMIMPL_AVX_P_BLEND 1
5101BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5102 PROLOGUE_4_ARGS
5103 IEMIMPL_AVX_PROLOGUE
5104
5105 vmovdqu xmm0, [A1]
5106 vmovdqu xmm1, [A2]
5107 vmovdqu xmm2, [A3]
5108 %1 xmm0, xmm0, xmm1, xmm2
5109 vmovdqu [A0], xmm0
5110
5111 IEMIMPL_AVX_EPILOGUE
5112 EPILOGUE_4_ARGS
5113ENDPROC iemAImpl_ %+ %1 %+ _u128
5114
5115BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5116 PROLOGUE_4_ARGS
5117 IEMIMPL_AVX_PROLOGUE
5118
5119 vmovdqu ymm0, [A1]
5120 vmovdqu ymm1, [A2]
5121 vmovdqu ymm2, [A3]
5122 %1 ymm0, ymm0, ymm1, ymm2
5123 vmovdqu [A0], ymm0
5124
5125 IEMIMPL_AVX_EPILOGUE
5126 EPILOGUE_4_ARGS
5127ENDPROC iemAImpl_ %+ %1 %+ _u256
5128%endmacro
5129
5130IEMIMPL_AVX_P_BLEND vpblendvb
5131IEMIMPL_AVX_P_BLEND vblendvps
5132IEMIMPL_AVX_P_BLEND vblendvpd
5133
5134
5135;;
5136; palignr mm1, mm2/m64 instruction.
5137;
5138; @param A0 Pointer to the first media register sized operand (output).
5139; @param A1 The second register sized operand (input).
5140; @param A2 The 8-bit immediate.
5141BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5142 PROLOGUE_3_ARGS
5143 IEMIMPL_MMX_PROLOGUE
5144
5145 movq mm0, [A0]
5146 movq mm1, A1
5147 lea T1, [.imm0 xWrtRIP]
5148 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5149 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5150 %else
5151 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5152 %endif
5153 lea T1, [T1 + T0*2]
5154 IBT_NOTRACK
5155 call T1
5156 movq [A0], mm0
5157
5158 IEMIMPL_MMX_EPILOGUE
5159 EPILOGUE_3_ARGS
5160 %assign bImm 0
5161 %rep 256
5162.imm %+ bImm:
5163 IBT_ENDBRxx_WITHOUT_NOTRACK
5164 palignr mm0, mm1, bImm
5165 ret
5166 %assign bImm bImm + 1
5167 %endrep
5168.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5169ENDPROC iemAImpl_palignr_u64
5170
5171
5172;;
5173; SSE instructions with 8-bit immediates of the form
5174; xxx xmm1, xmm2, imm8,
5175; where the instruction encoding takes up 6 bytes.
5176;
5177; @param 1 The instruction name.
5178;
5179; @param A0 Pointer to the first media register size operand (input/output).
5180; @param A1 Pointer to the second source media register size operand (input).
5181; @param A2 The 8-bit immediate
5182;
5183%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5184BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5185 PROLOGUE_3_ARGS
5186 IEMIMPL_SSE_PROLOGUE
5187
5188 movdqu xmm0, [A0]
5189 movdqu xmm1, [A1]
5190 lea T1, [.imm0 xWrtRIP]
5191 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5192 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5193 lea T1, [T1 + T0*4]
5194 %else
5195 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5196 %endif
5197 IBT_NOTRACK
5198 call T1
5199 movdqu [A0], xmm0
5200
5201 IEMIMPL_SSE_EPILOGUE
5202 EPILOGUE_3_ARGS
5203 %assign bImm 0
5204 %rep 256
5205.imm %+ bImm:
5206 IBT_ENDBRxx_WITHOUT_NOTRACK
5207 %1 xmm0, xmm1, bImm
5208 ret
5209 int3
5210 %assign bImm bImm + 1
5211 %endrep
5212.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5213ENDPROC iemAImpl_ %+ %1 %+ _u128
5214%endmacro
5215
5216IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5217IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5218IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5219IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5220IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5221IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5222IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5223
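;
; Each stub above is 8 bytes (6 byte instruction + ret + int3), or 12 bytes
; with an ENDBR prepended, hence the A2*8 respectively (A2*3)*4 offset
; calculations; e.g. imm8 = 0x10 lands at .imm0 + 0x80 in the non-IBT build.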
5224
5225;;
5226; AVX instructions with 8-bit immediates of the form
5227; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8,
5228; where the instruction encoding takes up 6 bytes.
5229;
5230; @param 1 The instruction name.
5231; @param 2 Whether the instruction has a 256-bit variant (1) or not (0).
5232;
5233; @param A0 Pointer to the destination media register size operand (output).
5234; @param A1 Pointer to the first source media register size operand (input).
5235; @param A2 Pointer to the second source media register size operand (input).
5236; @param A3 The 8-bit immediate
5237;
5238%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
5239BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5240 PROLOGUE_4_ARGS
5241 IEMIMPL_AVX_PROLOGUE
5242
5243 movdqu xmm0, [A1]
5244 movdqu xmm1, [A2]
5245 lea T1, [.imm0 xWrtRIP]
5246 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5247 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5248 lea T1, [T1 + T0*4]
5249 %else
5250 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5251 %endif
5252 IBT_NOTRACK
5253 call T1
5254 movdqu [A0], xmm0
5255
5256 IEMIMPL_AVX_EPILOGUE
5257 EPILOGUE_4_ARGS
5258 %assign bImm 0
5259 %rep 256
5260.imm %+ bImm:
5261 IBT_ENDBRxx_WITHOUT_NOTRACK
5262 %1 xmm0, xmm0, xmm1, bImm
5263 ret
5264 int3
5265 %assign bImm bImm + 1
5266 %endrep
5267.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5268ENDPROC iemAImpl_ %+ %1 %+ _u128
5269
5270 %if %2 == 1
5271BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5272 PROLOGUE_4_ARGS
5273 IEMIMPL_AVX_PROLOGUE
5274
5275 vmovdqu ymm0, [A1]
5276 vmovdqu ymm1, [A2]
5277 lea T1, [.imm0 xWrtRIP]
5278 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5279 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5280 lea T1, [T1 + T0*4]
5281 %else
5282 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5283 %endif
5284 IBT_NOTRACK
5285 call T1
5286 vmovdqu [A0], ymm0
5287
5288 IEMIMPL_AVX_EPILOGUE
5289 EPILOGUE_4_ARGS
5290 %assign bImm 0
5291 %rep 256
5292.imm %+ bImm:
5293 IBT_ENDBRxx_WITHOUT_NOTRACK
5294 %1 ymm0, ymm0, ymm1, bImm
5295 ret
5296 int3
5297 %assign bImm bImm + 1
5298 %endrep
5299.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5300ENDPROC iemAImpl_ %+ %1 %+ _u256
5301 %endif
5302%endmacro
5303
5304IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
5305IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
5306IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
5307IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
5308IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5309
5310
5311;;
5312; Need to move this as well somewhere better?
5313;
5314struc IEMPCMPISTRXSRC
5315 .uSrc1 resd 4
5316 .uSrc2 resd 4
5317endstruc
5318
5319struc IEMPCMPESTRXSRC
5320 .uSrc1 resd 4
5321 .uSrc2 resd 4
5322 .u64Rax resd 2
5323 .u64Rdx resd 2
5324endstruc
5325
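;
; The pcmpestri/pcmpestrm instructions take their string lengths from RAX and
; RDX.  Those registers cannot be handed over directly by the C caller, so
; IEMPCMPESTRXSRC carries the guest RAX/RDX values and the workers below swap
; them into the real registers around the call into the imm8 stub table.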
5326;;
5327; The pcmpistri instruction.
5328;
5329; @param A0 Pointer to the ECX register to store the result to (output).
5330; @param A1 Pointer to the EFLAGS register.
5331; @param A2 Pointer to the structure containing the source operands (input).
5332; @param A3 The 8-bit immediate
5333;
5334BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5335 PROLOGUE_4_ARGS
5336 IEMIMPL_SSE_PROLOGUE
5337
5338 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5339 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5340 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5341 lea T1, [.imm0 xWrtRIP]
5342 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5343 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5344 lea T1, [T1 + T0*4]
5345 %else
5346 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5347 %endif
5348 IBT_NOTRACK
5349 call T1
5350
5351 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5352 mov [T2], ecx
5353
5354 IEMIMPL_SSE_EPILOGUE
5355 EPILOGUE_4_ARGS
5356 %assign bImm 0
5357 %rep 256
5358.imm %+ bImm:
5359 IBT_ENDBRxx_WITHOUT_NOTRACK
5360 pcmpistri xmm0, xmm1, bImm
5361 ret
5362 int3
5363 %assign bImm bImm + 1
5364 %endrep
5365.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5366ENDPROC iemAImpl_pcmpistri_u128
5367
5368;;
5369; The pcmpestri instruction.
5370;
5371; @param A0 Pointer to the ECX register to store the result to (output).
5372; @param A1 Pointer to the EFLAGS register.
5373; @param A2 Pointer to the structure containing the source operands (input).
5374; @param A3 The 8-bit immediate
5375;
5376BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5377 PROLOGUE_4_ARGS
5378 IEMIMPL_SSE_PROLOGUE
5379
5380 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5381 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5382 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5383 lea T1, [.imm0 xWrtRIP]
5384 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5385 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5386 lea T1, [T1 + T0*4]
5387 %else
5388 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5389 %endif
5390 push xDX ; xDX can be A1 or A2 depending on the calling convention
5391 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5392 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5393 IBT_NOTRACK
5394 call T1
5395
5396 pop xDX
5397 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5398 mov [T2], ecx
5399
5400 IEMIMPL_SSE_EPILOGUE
5401 EPILOGUE_4_ARGS
5402 %assign bImm 0
5403 %rep 256
5404.imm %+ bImm:
5405 IBT_ENDBRxx_WITHOUT_NOTRACK
5406 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5407 pcmpestri xmm0, xmm1, bImm
5408 ret
5409 %assign bImm bImm + 1
5410 %endrep
5411.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5412ENDPROC iemAImpl_pcmpestri_u128
5413
5414;;
5415; The pcmpistrm instruction template.
5416;
5417; @param A0 Pointer to the XMM0 register to store the result to (output).
5418; @param A1 Pointer to the EFLAGS register.
5419; @param A2 Pointer to the structure containing the source operands (input).
5420; @param A3 The 8-bit immediate
5421;
5422BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5423 PROLOGUE_4_ARGS
5424 IEMIMPL_SSE_PROLOGUE
5425
5426 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5427 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5428 lea T1, [.imm0 xWrtRIP]
5429 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5430 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5431 lea T1, [T1 + T0*4]
5432 %else
5433 lea T1, [T1 + A3*8] ; sizeof(pcmpistrm+ret+int3) == 8: A3 * 8
5434 %endif
5435 IBT_NOTRACK
5436 call T1
5437
5438 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5439 movdqu [A0], xmm0
5440
5441 IEMIMPL_SSE_EPILOGUE
5442 EPILOGUE_4_ARGS
5443 %assign bImm 0
5444 %rep 256
5445.imm %+ bImm:
5446 IBT_ENDBRxx_WITHOUT_NOTRACK
5447 pcmpistrm xmm1, xmm2, bImm
5448 ret
5449 int3
5450 %assign bImm bImm + 1
5451 %endrep
5452.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5453ENDPROC iemAImpl_pcmpistrm_u128
5454
5455;;
5456; The pcmpestrm instruction template.
5457;
5458; @param A0 Pointer to the XMM0 register to store the result to (output).
5459; @param A1 Pointer to the EFLAGS register.
5460; @param A2 Pointer to the structure containing the source operands (input).
5461; @param A3 The 8-bit immediate
5462;
5463BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5464 PROLOGUE_4_ARGS
5465 IEMIMPL_SSE_PROLOGUE
5466
5467 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5468 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5469 lea T1, [.imm0 xWrtRIP]
5470 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5471 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5472 lea T1, [T1 + T0*4]
5473 %else
5474 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5475 %endif
5476 push xDX ; xDX can be A1 or A2 depending on the calling convention
5477 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5478 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5479 IBT_NOTRACK
5480 call T1
5481
5482 pop xDX
5483 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5484 movdqu [A0], xmm0
5485
5486 IEMIMPL_SSE_EPILOGUE
5487 EPILOGUE_4_ARGS
5488 %assign bImm 0
5489 %rep 256
5490.imm %+ bImm:
5491 IBT_ENDBRxx_WITHOUT_NOTRACK
5492 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5493 pcmpestrm xmm1, xmm2, bImm
5494 ret
5495 %assign bImm bImm + 1
5496 %endrep
5497.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5498ENDPROC iemAImpl_pcmpestrm_u128
5499
5500
5501;;
5502; pinsrw instruction.
5503;
5504; @param A0 Pointer to the first media register size operand (input/output).
5505; @param A1 The 16 bit input operand (input).
5506; @param A2 The 8-bit immediate
5507;
5508BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5509 PROLOGUE_3_ARGS
5510 IEMIMPL_SSE_PROLOGUE
5511
5512 movq mm0, [A0]
5513 lea T1, [.imm0 xWrtRIP]
5514 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5515 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5516 %else
5517 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5518 %endif
5519 lea T1, [T1 + T0]
5520 IBT_NOTRACK
5521 call T1
5522 movq [A0], mm0
5523
5524 IEMIMPL_SSE_EPILOGUE
5525 EPILOGUE_3_ARGS
5526 %assign bImm 0
5527 %rep 256
5528.imm %+ bImm:
5529 IBT_ENDBRxx_WITHOUT_NOTRACK
5530 pinsrw mm0, A1_32, bImm
5531 ret
5532 %assign bImm bImm + 1
5533 %endrep
5534.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5535ENDPROC iemAImpl_pinsrw_u64
5536
5537BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
5538 PROLOGUE_3_ARGS
5539 IEMIMPL_SSE_PROLOGUE
5540
5541 movdqu xmm0, [A0]
5542 lea T1, [.imm0 xWrtRIP]
5543 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5544 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5545 %else
5546 lea T0, [A2 + A2*2] ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5547 %endif
5548 lea T1, [T1 + T0*2]
5549 IBT_NOTRACK
5550 call T1
5551 movdqu [A0], xmm0
5552
5553 IEMIMPL_SSE_EPILOGUE
5554 EPILOGUE_3_ARGS
5555 %assign bImm 0
5556 %rep 256
5557.imm %+ bImm:
5558 IBT_ENDBRxx_WITHOUT_NOTRACK
5559 pinsrw xmm0, A1_32, bImm
5560 ret
5561 %assign bImm bImm + 1
5562 %endrep
5563.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5564ENDPROC iemAImpl_pinsrw_u128
5565
5566;;
5567; vpinsrw instruction.
5568;
5569; @param A0 Pointer to the first media register size operand (output).
5570; @param A1 Pointer to the source media register size operand (input).
5571; @param A2 The 16 bit input operand (input).
5572; @param A3 The 8-bit immediate
5573;
5574BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5575 PROLOGUE_4_ARGS
5576 IEMIMPL_SSE_PROLOGUE
5577
5578 movdqu xmm0, [A1]
5579 lea T1, [.imm0 xWrtRIP]
5580 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5581 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5582 %else
5583 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5584 %endif
5585 lea T1, [T1 + T0*2]
5586 mov A1, A2 ; A2 requires longer encoding on Windows
5587 IBT_NOTRACK
5588 call T1
5589 movdqu [A0], xmm0
5590
5591 IEMIMPL_SSE_EPILOGUE
5592 EPILOGUE_4_ARGS
5593 %assign bImm 0
5594 %rep 256
5595.imm %+ bImm:
5596 IBT_ENDBRxx_WITHOUT_NOTRACK
5597 vpinsrw xmm0, xmm0, A1_32, bImm
5598 ret
5599 %assign bImm bImm + 1
5600 %endrep
5601.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5602ENDPROC iemAImpl_vpinsrw_u128
5603
5604
5605;;
5606; pextrw instruction.
5607;
5608; @param A0 Pointer to the 16bit output operand (output).
5609; @param A1 The 64-bit source value for the u64 variant, or pointer to the media register size operand for the u128 variant (input).
5610; @param A2 The 8-bit immediate
5611;
5612BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5613 PROLOGUE_3_ARGS
5614 IEMIMPL_SSE_PROLOGUE
5615
5616 movq mm0, A1
5617 lea T1, [.imm0 xWrtRIP]
5618 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5619 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5620 %else
5621 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5622 %endif
5623 lea T1, [T1 + T0]
5624 IBT_NOTRACK
5625 call T1
5626 mov word [A0], T0_16
5627
5628 IEMIMPL_SSE_EPILOGUE
5629 EPILOGUE_3_ARGS
5630 %assign bImm 0
5631 %rep 256
5632.imm %+ bImm:
5633 IBT_ENDBRxx_WITHOUT_NOTRACK
5634 pextrw T0_32, mm0, bImm
5635 ret
5636 %assign bImm bImm + 1
5637 %endrep
5638.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5639ENDPROC iemAImpl_pextrw_u64
5640
5641BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
5642 PROLOGUE_3_ARGS
5643 IEMIMPL_SSE_PROLOGUE
5644
5645 movdqu xmm0, [A1]
5646 lea T1, [.imm0 xWrtRIP]
5647 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5648 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5649 %else
5650 lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5651 %endif
5652 lea T1, [T1 + T0*2]
5653 IBT_NOTRACK
5654 call T1
5655 mov word [A0], T0_16
5656
5657 IEMIMPL_SSE_EPILOGUE
5658 EPILOGUE_3_ARGS
5659 %assign bImm 0
5660 %rep 256
5661.imm %+ bImm:
5662 IBT_ENDBRxx_WITHOUT_NOTRACK
5663 pextrw T0_32, xmm0, bImm
5664 ret
5665 %assign bImm bImm + 1
5666 %endrep
5667.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5668ENDPROC iemAImpl_pextrw_u128
5669
5670;;
5671; vpextrw instruction.
5672;
5673; @param A0 Pointer to the 16bit output operand (output).
5674; @param A1 Pointer to the source media register size operand (input).
5675; @param A2 The 8-bit immediate
5676;
5677BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5678 PROLOGUE_3_ARGS
5679 IEMIMPL_SSE_PROLOGUE
5680
5681 movdqu xmm0, [A1]
5682 lea T1, [.imm0 xWrtRIP]
5683 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5684 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5685 %else
5686 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5687 %endif
5688 lea T1, [T1 + T0*2]
5689 IBT_NOTRACK
5690 call T1
5691 mov word [A0], T0_16
5692
5693 IEMIMPL_SSE_EPILOGUE
5694 EPILOGUE_3_ARGS
5695 %assign bImm 0
5696 %rep 256
5697.imm %+ bImm:
5698 IBT_ENDBRxx_WITHOUT_NOTRACK
5699 vpextrw T0_32, xmm0, bImm
5700 ret
5701 %assign bImm bImm + 1
5702 %endrep
5703.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5704ENDPROC iemAImpl_vpextrw_u128
5705
5706
5707;;
5708; movmskp{s,d} SSE instruction template
5709;
5710; @param 1 The SSE instruction name.
5711; @param 2 The AVX instruction name.
5712;
5713; @param A0 Pointer to the output register (output/byte sized).
5714; @param A1 Pointer to the source media register size operand (input).
5715;
5716%macro IEMIMPL_MEDIA_MOVMSK_P 2
5717BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5718 PROLOGUE_2_ARGS
5719 IEMIMPL_SSE_PROLOGUE
5720
5721 movdqu xmm0, [A1]
5722 %1 T0, xmm0
5723 mov byte [A0], T0_8
5724
5725 IEMIMPL_SSE_EPILOGUE
5726 EPILOGUE_2_ARGS
5727ENDPROC iemAImpl_ %+ %1 %+ _u128
5728
5729BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
5730 PROLOGUE_2_ARGS
5731 IEMIMPL_AVX_PROLOGUE
5732
5733 movdqu xmm0, [A1]
5734 %2 T0, xmm0
5735 mov byte [A0], T0_8
5736
5737 IEMIMPL_AVX_EPILOGUE
5738 EPILOGUE_2_ARGS
5739ENDPROC iemAImpl_ %+ %2 %+ _u128
5740
5741BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
5742 PROLOGUE_2_ARGS
5743 IEMIMPL_AVX_PROLOGUE
5744
5745 vmovdqu ymm0, [A1]
5746 %2 T0, ymm0
5747 mov byte [A0], T0_8
5748
5749 IEMIMPL_AVX_EPILOGUE
5750 EPILOGUE_2_ARGS
5751ENDPROC iemAImpl_ %+ %2 %+ _u256
5752%endmacro
5753
5754IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
5755IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5756
5757
5758;;
5759; Restores the SSE MXCSR register with the original value.
5760;
5761; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5762; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5763; @param 2 Expression giving the address of the FXSTATE of the guest.
5764;
5765; @note Restores the stack pointer.
5766;
5767%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
5768 sub xSP, 4
5769 stmxcsr [xSP]
5770 mov T0_32, [xSP]
5771 add xSP, 4
5772 ; Merge the status bits into the original MXCSR value.
5773 mov T1_32, [%2 + X86FXSTATE.MXCSR]
5774 and T0_32, X86_MXCSR_XCPT_FLAGS
5775 or T0_32, T1_32
5776 mov [%1], T0_32
5777
5778 ldmxcsr [xSP]
5779 add xSP, 4
5780%endmacro
5781
5782
5783;;
5784; cvttsd2si instruction - 32-bit variant.
5785;
5786; @param A0 FPU context (FXSTATE or XSAVEAREA).
5787; @param A1 Where to return the MXCSR value.
5788; @param A2 Pointer to the result operand (output).
5789; @param A3 Pointer to the second operand (input).
5790;
5791BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
5792 PROLOGUE_4_ARGS
5793 IEMIMPL_SSE_PROLOGUE
5794 SSE_LD_FXSTATE_MXCSR A0
5795
5796 cvttsd2si T0_32, [A3]
5797 mov dword [A2], T0_32
5798
5799 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5800 IEMIMPL_SSE_EPILOGUE
5801 EPILOGUE_4_ARGS
5802ENDPROC iemAImpl_cvttsd2si_i32_r64
5803
5804;;
5805; cvttsd2si instruction - 64-bit variant.
5806;
5807; @param A0 FPU context (FXSTATE or XSAVEAREA).
5808; @param A1 Where to return the MXCSR value.
5809; @param A2 Pointer to the result operand (output).
5810; @param A3 Pointer to the second operand (input).
5811;
5812BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
5813 PROLOGUE_4_ARGS
5814 IEMIMPL_SSE_PROLOGUE
5815 SSE_LD_FXSTATE_MXCSR A0
5816
5817 cvttsd2si T0, [A3]
5818 mov qword [A2], T0
5819
5820 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5821 IEMIMPL_SSE_EPILOGUE
5822 EPILOGUE_4_ARGS
5823ENDPROC iemAImpl_cvttsd2si_i64_r64
5824
5825
5826;;
5827; cvtsd2si instruction - 32-bit variant.
5828;
5829; @param A0 FPU context (FXSTATE or XSAVEAREA).
5830; @param A1 Where to return the MXCSR value.
5831; @param A2 Pointer to the result operand (output).
5832; @param A3 Pointer to the second operand (input).
5833;
5834BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
5835 PROLOGUE_4_ARGS
5836 IEMIMPL_SSE_PROLOGUE
5837 SSE_LD_FXSTATE_MXCSR A0
5838
5839 cvtsd2si T0_32, [A3]
5840 mov dword [A2], T0_32
5841
5842 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5843 IEMIMPL_SSE_EPILOGUE
5844 EPILOGUE_4_ARGS
5845ENDPROC iemAImpl_cvtsd2si_i32_r64
5846
5847;;
5848; cvtsd2si instruction - 64-bit variant.
5849;
5850; @param A0 FPU context (FXSTATE or XSAVEAREA).
5851; @param A1 Where to return the MXCSR value.
5852; @param A2 Pointer to the result operand (output).
5853; @param A3 Pointer to the second operand (input).
5854;
5855BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
5856 PROLOGUE_4_ARGS
5857 IEMIMPL_SSE_PROLOGUE
5858 SSE_LD_FXSTATE_MXCSR A0
5859
5860 cvtsd2si T0, [A3]
5861 mov qword [A2], T0
5862
5863 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5864 IEMIMPL_SSE_EPILOGUE
5865 EPILOGUE_4_ARGS
5866ENDPROC iemAImpl_cvtsd2si_i64_r64
5867
5868
5869;;
5870; cvttss2si instruction - 32-bit variant.
5871;
5872; @param A0 FPU context (FXSTATE or XSAVEAREA).
5873; @param A1 Where to return the MXCSR value.
5874; @param A2 Pointer to the result operand (output).
5875; @param A3 Pointer to the second operand (input).
5876;
5877BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
5878 PROLOGUE_4_ARGS
5879 IEMIMPL_SSE_PROLOGUE
5880 SSE_LD_FXSTATE_MXCSR A0
5881
5882 cvttss2si T0_32, [A3]
5883 mov dword [A2], T0_32
5884
5885 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5886 IEMIMPL_SSE_EPILOGUE
5887 EPILOGUE_4_ARGS
5888ENDPROC iemAImpl_cvttss2si_i32_r32
5889
5890;;
5891; cvttss2si instruction - 64-bit variant.
5892;
5893; @param A0 FPU context (FXSTATE or XSAVEAREA).
5894; @param A1 Where to return the MXCSR value.
5895; @param A2 Pointer to the result operand (output).
5896; @param A3 Pointer to the second operand (input).
5897;
5898BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
5899 PROLOGUE_4_ARGS
5900 IEMIMPL_SSE_PROLOGUE
5901 SSE_LD_FXSTATE_MXCSR A0
5902
5903 cvttss2si T0, [A3]
5904 mov qword [A2], T0
5905
5906 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5907 IEMIMPL_SSE_EPILOGUE
5908 EPILOGUE_4_ARGS
5909ENDPROC iemAImpl_cvttss2si_i64_r32
5910
5911
5912;;
5913; cvtss2si instruction - 32-bit variant.
5914;
5915; @param A0 FPU context (FXSTATE or XSAVEAREA).
5916; @param A1 Where to return the MXCSR value.
5917; @param A2 Pointer to the result operand (output).
5918; @param A3 Pointer to the second operand (input).
5919;
5920BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
5921 PROLOGUE_4_ARGS
5922 IEMIMPL_SSE_PROLOGUE
5923 SSE_LD_FXSTATE_MXCSR A0
5924
5925 cvtss2si T0_32, [A3]
5926 mov dword [A2], T0_32
5927
5928 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5929 IEMIMPL_SSE_EPILOGUE
5930 EPILOGUE_4_ARGS
5931ENDPROC iemAImpl_cvtss2si_i32_r32
5932
5933;;
5934; cvtss2si instruction - 64-bit variant.
5935;
5936; @param A0 FPU context (FXSTATE or XSAVEAREA).
5937; @param A1 Where to return the MXCSR value.
5938; @param A2 Pointer to the result operand (output).
5939; @param A3 Pointer to the second operand (input).
5940;
5941BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
5942 PROLOGUE_4_ARGS
5943 IEMIMPL_SSE_PROLOGUE
5944 SSE_LD_FXSTATE_MXCSR A0
5945
5946 cvtss2si T0, [A3]
5947 mov qword [A2], T0
5948
5949 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5950 IEMIMPL_SSE_EPILOGUE
5951 EPILOGUE_4_ARGS
5952ENDPROC iemAImpl_cvtss2si_i64_r32
5953
5954
5955;;
5956; cvtsi2ss instruction - 32-bit variant.
5957;
5958; @param A0 FPU context (FXSTATE or XSAVEAREA).
5959; @param A1 Where to return the MXCSR value.
5960; @param A2 Pointer to the result operand (output).
5961; @param A3 Pointer to the second operand (input).
5962;
5963BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
5964 PROLOGUE_4_ARGS
5965 IEMIMPL_SSE_PROLOGUE
5966 SSE_LD_FXSTATE_MXCSR A0
5967
5968 cvtsi2ss xmm0, dword [A3]
5969 movd dword [A2], xmm0
5970
5971 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5972 IEMIMPL_SSE_EPILOGUE
5973 EPILOGUE_4_ARGS
5974ENDPROC iemAImpl_cvtsi2ss_r32_i32
5975
5976;;
5977; cvtsi2ss instruction - 64-bit variant.
5978;
5979; @param A0 FPU context (FXSTATE or XSAVEAREA).
5980; @param A1 Where to return the MXCSR value.
5981; @param A2 Pointer to the result operand (output).
5982; @param A3 Pointer to the second operand (input).
5983;
5984BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
5985 PROLOGUE_4_ARGS
5986 IEMIMPL_SSE_PROLOGUE
5987 SSE_LD_FXSTATE_MXCSR A0
5988
5989 cvtsi2ss xmm0, qword [A3]
5990 movd dword [A2], xmm0
5991
5992 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5993 IEMIMPL_SSE_EPILOGUE
5994 EPILOGUE_4_ARGS
5995ENDPROC iemAImpl_cvtsi2ss_r32_i64
5996
5997
5998;;
5999; cvtsi2sd instruction - 32-bit variant.
6000;
6001; @param A0 FPU context (FXSTATE or XSAVEAREA).
6002; @param A1 Where to return the MXCSR value.
6003; @param A2 Pointer to the result operand (output).
6004; @param A3 Pointer to the second operand (input).
6005;
6006BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6007 PROLOGUE_4_ARGS
6008 IEMIMPL_SSE_PROLOGUE
6009 SSE_LD_FXSTATE_MXCSR A0
6010
6011 cvtsi2sd xmm0, dword [A3]
6012 movq [A2], xmm0
6013
6014 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6015 IEMIMPL_SSE_EPILOGUE
6016 EPILOGUE_4_ARGS
6017ENDPROC iemAImpl_cvtsi2sd_r64_i32
6018
6019;;
6020; cvtsi2sd instruction - 64-bit variant.
6021;
6022; @param A0 FPU context (FXSTATE or XSAVEAREA).
6023; @param A1 Where to return the MXCSR value.
6024; @param A2 Pointer to the result operand (output).
6025; @param A3 Pointer to the second operand (input).
6026;
6027BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6028 PROLOGUE_4_ARGS
6029 IEMIMPL_SSE_PROLOGUE
6030 SSE_LD_FXSTATE_MXCSR A0
6031
6032 cvtsi2sd xmm0, qword [A3]
6033 movq [A2], xmm0
6034
6035 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6036 IEMIMPL_SSE_EPILOGUE
6037 EPILOGUE_4_ARGS
6038ENDPROC iemAImpl_cvtsi2sd_r64_i64
6039
6040
6041;;
6042; Initialize the SSE MXCSR register using the guest value partially to
6043; account for rounding mode.
6044;
6045; @uses 4 bytes of stack to save the original value, T0.
6046; @param 1 Expression giving the address of the MXCSR register of the guest.
6047;
6048%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6049 sub xSP, 4
6050
6051 stmxcsr [xSP]
6052 mov T0_32, [%1]
6053 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
6054 or T0_32, X86_MXCSR_XCPT_MASK
6055 sub xSP, 4
6056 mov [xSP], T0_32
6057 ldmxcsr [xSP]
6058 add xSP, 4
6059%endmacro
6060
6061
6062;;
6063; Restores the SSE MXCSR register with the original value.
6064;
6065; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6066; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6067;
6068; @note Restores the stack pointer.
6069;
6070%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6071 sub xSP, 4
6072 stmxcsr [xSP]
6073 mov T0_32, [xSP]
6074 add xSP, 4
6075 ; Merge the status bits into the original MXCSR value.
6076 mov T1_32, [%1]
6077 and T0_32, X86_MXCSR_XCPT_FLAGS
6078 or T0_32, T1_32
6079 mov [%1], T0_32
6080
6081 ldmxcsr [xSP]
6082 add xSP, 4
6083%endmacro
6084
6085
6086;
6087; UCOMISS (SSE)
6088;
6089; @param A0 Pointer to the MXCSR value (input/output).
6090; @param A1 Pointer to the EFLAGS value (input/output).
6091; @param A2 Pointer to the first source operand (aka readonly destination).
6092; @param A3 Pointer to the second source operand.
6093;
6094BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6095 PROLOGUE_4_ARGS
6096 IEMIMPL_SSE_PROLOGUE
6097 SSE_LD_FXSTATE_MXCSR_ONLY A0
6098
6099 movdqu xmm0, [A2]
6100 movdqu xmm1, [A3]
6101 ucomiss xmm0, xmm1
6102 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6103
6104 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6105 IEMIMPL_SSE_EPILOGUE
6106 EPILOGUE_4_ARGS
6107ENDPROC iemAImpl_ucomiss_u128
6108
6109BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6110 PROLOGUE_4_ARGS
6111 IEMIMPL_SSE_PROLOGUE
6112 SSE_LD_FXSTATE_MXCSR_ONLY A0
6113
6114 movdqu xmm0, [A2]
6115 movdqu xmm1, [A3]
6116 vucomiss xmm0, xmm1
6117 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6118
6119 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6120 IEMIMPL_SSE_EPILOGUE
6121 EPILOGUE_4_ARGS
6122ENDPROC iemAImpl_vucomiss_u128
6123
6124
6125;
6126; UCOMISD (SSE)
6127;
6128; @param A0 Pointer to the MXCSR value (input/output).
6129; @param A1 Pointer to the EFLAGS value (input/output).
6130; @param A2 Pointer to the first source operand (aka readonly destination).
6131; @param A3 Pointer to the second source operand.
6132;
6133BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6134 PROLOGUE_4_ARGS
6135 IEMIMPL_SSE_PROLOGUE
6136 SSE_LD_FXSTATE_MXCSR_ONLY A0
6137
6138 movdqu xmm0, [A2]
6139 movdqu xmm1, [A3]
6140 ucomisd xmm0, xmm1
6141 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6142
6143 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6144 IEMIMPL_SSE_EPILOGUE
6145 EPILOGUE_4_ARGS
6146ENDPROC iemAImpl_ucomisd_u128
6147
6148BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6149 PROLOGUE_4_ARGS
6150 IEMIMPL_SSE_PROLOGUE
6151 SSE_LD_FXSTATE_MXCSR_ONLY A0
6152
6153 movdqu xmm0, [A2]
6154 movdqu xmm1, [A3]
6155 vucomisd xmm0, xmm1
6156 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6157
6158 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6159 IEMIMPL_SSE_EPILOGUE
6160 EPILOGUE_4_ARGS
6161ENDPROC iemAImpl_vucomisd_u128
6162
6163;
6164; COMISS (SSE)
6165;
6166; @param A0 Pointer to the MXCSR value (input/output).
6167; @param A1 Pointer to the EFLAGS value (input/output).
6168; @param A2 Pointer to the first source operand (aka readonly destination).
6169; @param A3 Pointer to the second source operand.
6170;
6171BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6172 PROLOGUE_4_ARGS
6173 IEMIMPL_SSE_PROLOGUE
6174 SSE_LD_FXSTATE_MXCSR_ONLY A0
6175
6176 movdqu xmm0, [A2]
6177 movdqu xmm1, [A3]
6178 comiss xmm0, xmm1
6179 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6180
6181 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6182 IEMIMPL_SSE_EPILOGUE
6183 EPILOGUE_4_ARGS
6184ENDPROC iemAImpl_comiss_u128
6185
6186BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6187 PROLOGUE_4_ARGS
6188 IEMIMPL_SSE_PROLOGUE
6189 SSE_LD_FXSTATE_MXCSR_ONLY A0
6190
6191 movdqu xmm0, [A2]
6192 movdqu xmm1, [A3]
6193 vcomiss xmm0, xmm1
6194 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6195
6196 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6197 IEMIMPL_SSE_EPILOGUE
6198 EPILOGUE_4_ARGS
6199ENDPROC iemAImpl_vcomiss_u128
6200
6201
6202;
6203; COMISD (SSE)
6204;
6205; @param A0 Pointer to the MXCSR value (input/output).
6206; @param A1 Pointer to the EFLAGS value (input/output).
6207; @param A2 Pointer to the first source operand (aka readonly destination).
6208; @param A3 Pointer to the second source operand.
6209;
6210BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6211 PROLOGUE_4_ARGS
6212 IEMIMPL_SSE_PROLOGUE
6213 SSE_LD_FXSTATE_MXCSR_ONLY A0
6214
6215 movdqu xmm0, [A2]
6216 movdqu xmm1, [A3]
6217 comisd xmm0, xmm1
6218 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6219
6220 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6221 IEMIMPL_SSE_EPILOGUE
6222 EPILOGUE_4_ARGS
6223ENDPROC iemAImpl_comisd_u128
6224
6225BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6226 PROLOGUE_4_ARGS
6227 IEMIMPL_SSE_PROLOGUE
6228 SSE_LD_FXSTATE_MXCSR_ONLY A0
6229
6230 movdqu xmm0, [A2]
6231 movdqu xmm1, [A3]
6232 vcomisd xmm0, xmm1
6233 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6234
6235 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6236 IEMIMPL_SSE_EPILOGUE
6237 EPILOGUE_4_ARGS
6238ENDPROC iemAImpl_vcomisd_u128
6239
6240
6241;;
6242; @todo Move this structure somewhere more suitable?
6243;
6244struc IEMMEDIAF2XMMSRC
6245 .uSrc1 resd 4
6246 .uSrc2 resd 4
6247endstruc
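;
; Layout note (derived from the structure above): .uSrc1 and .uSrc2 are two 16 byte
; XMM sized source operands at offsets 0 and 16, so a single pointer (A2 in the
; helpers below) hands both inputs to the 256-stub dispatchers.
;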
6248
6249
6250;
6251; CMPPS (SSE)
6252;
6253; @param A0 Pointer to the MXCSR value (input/output).
6254; @param A1 Pointer to the first media register size operand (output).
6255; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6256; @param A3 The 8-bit immediate (input).
6257;
6258BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6259 PROLOGUE_4_ARGS
6260 IEMIMPL_SSE_PROLOGUE
6261 SSE_LD_FXSTATE_MXCSR_ONLY A0
6262
6263 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6264 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6265 lea T1, [.imm0 xWrtRIP]
6266 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6267 lea T0, [A3 + A3*8] ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6268 %else
6269 lea T0, [A3 + A3*4] ; sizeof(cmpps+ret) == 5: A3 * 5
6270 %endif
6271 lea T1, [T1 + T0]
6272 IBT_NOTRACK
6273 call T1
6274 movdqu [A1], xmm0
6275
6276 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6277 IEMIMPL_SSE_EPILOGUE
6278 EPILOGUE_4_ARGS
6279 %assign bImm 0
6280 %rep 256
6281.imm %+ bImm:
6282 IBT_ENDBRxx_WITHOUT_NOTRACK
6283 cmpps xmm0, xmm1, bImm
6284 ret
6285 %assign bImm bImm + 1
6286 %endrep
6287.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6288ENDPROC iemAImpl_cmpps_u128
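;
; Dispatch note (explanatory, summarising the code above): since the imm8 has to be
; encoded into the cmpps instruction itself, 256 small stubs (.imm0 .. .imm255) are
; generated, each 'cmpps xmm0, xmm1, <imm>' plus 'ret' (5 bytes, or 9 bytes with an
; ENDBR prepended when IBT branch protection is enabled), and the stub for the
; requested immediate is reached as .imm0 + A3 * stride. E.g. with A3 = 2 and IBT
; enabled, T0 = 18 and the call lands on .imm2. IEMCHECK_256_JUMP_ARRAY_SIZE checks
; that the generated table has the expected total size.
;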
6289
6290;;
6291; SSE instructions with 8-bit immediates of the form
6292; xxx xmm1, xmm2, imm8.
6293; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6294; register.
6295;
6296; @param 1 The instruction name.
6297;
6298; @param A0 Pointer to the MXCSR value (input/output).
6299; @param A1 Pointer to the first media register size operand (output).
6300; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6301; @param A3 The 8-bit immediate (input).
6302;
6303%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6304BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6305 PROLOGUE_4_ARGS
6306 IEMIMPL_SSE_PROLOGUE
6307 SSE_LD_FXSTATE_MXCSR_ONLY A0
6308
6309 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6310 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6311 lea T1, [.imm0 xWrtRIP]
6312 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6313 lea T0, [A3 + A3*4] ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6314 %else
6315 lea T0, [A3 + A3*2] ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6316 %endif
6317 lea T1, [T1 + T0*2]
6318 IBT_NOTRACK
6319 call T1
6320 movdqu [A1], xmm0
6321
6322 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6323 IEMIMPL_SSE_EPILOGUE
6324 EPILOGUE_4_ARGS
6325 %assign bImm 0
6326 %rep 256
6327.imm %+ bImm:
6328 IBT_ENDBRxx_WITHOUT_NOTRACK
6329 %1 xmm0, xmm1, bImm
6330 ret
6331 %assign bImm bImm + 1
6332 %endrep
6333.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6334ENDPROC iemAImpl_ %+ %1 %+ _u128
6335%endmacro
6336
6337IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6338IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6339IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6340
6341;;
6342; SSE instructions with 8-bit immediates of the form
6343; xxx xmm1, xmm2, imm8.
6344; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6345; register.
6346;
6347; @param 1 The instruction name.
6348;
6349; @param A0 Pointer to the MXCSR value (input/output).
6350; @param A1 Pointer to the first media register size operand (output).
6351; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6352; @param A3 The 8-bit immediate (input).
6353;
6354%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6355BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6356 PROLOGUE_4_ARGS
6357 IEMIMPL_SSE_PROLOGUE
6358 SSE_LD_FXSTATE_MXCSR_ONLY A0
6359
6360 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6361 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6362 lea T1, [.imm0 xWrtRIP]
6363 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6364 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6365 lea T1, [T1 + T0*4]
6366 %else
6367 lea T1, [T1 + A3*8] ; sizeof(insn+ret+int3) == 8: A3 * 8
6368 %endif
6369 IBT_NOTRACK
6370 call T1
6371 movdqu [A1], xmm0
6372
6373 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6374 IEMIMPL_SSE_EPILOGUE
6375 EPILOGUE_4_ARGS
6376 %assign bImm 0
6377 %rep 256
6378.imm %+ bImm:
6379 IBT_ENDBRxx_WITHOUT_NOTRACK
6380 %1 xmm0, xmm1, bImm
6381 ret
6382 int3
6383 %assign bImm bImm + 1
6384 %endrep
6385.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6386ENDPROC iemAImpl_ %+ %1 %+ _u128
6387%endmacro
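;
; Design note (derived from the stub layout above): the trailing int3 pads each
; 'insn + ret' stub from 7 to 8 bytes (and the ENDBR variant from 11 to 12 bytes),
; which lets the non-IBT path reach the stub with a single scaled lea (A3*8)
; instead of the two-step stride arithmetic used for the 5-byte encodings.
;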
6388
6389IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6390IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6391IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6392IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6393IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6394IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6395
6396
6397;;
6398; SSE instructions of the form
6399; xxx mm, xmm.
6400; These need the MXCSR register to be loaded and saved around the instruction.
6401;
6402; @param 1 The instruction name.
6403;
6404; @param A0 Pointer to the MXCSR value (input/output).
6405; @param A1 Pointer to the first MMX register sized operand (output).
6406; @param A2 Pointer to the media register sized operand (input).
6407;
6408%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6409BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6410 PROLOGUE_3_ARGS
6411 IEMIMPL_SSE_PROLOGUE
6412 SSE_LD_FXSTATE_MXCSR_ONLY A0
6413
6414 movdqu xmm0, [A2]
6415 %1 mm0, xmm0
6416 movq [A1], mm0
6417
6418 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6419 IEMIMPL_SSE_EPILOGUE
6420 EPILOGUE_3_ARGS
6421ENDPROC iemAImpl_ %+ %1 %+ _u128
6422%endmacro
6423
6424IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6425IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6426
6427;;
6428; SSE instructions of the form
6429; xxx xmm, mm/m64.
6430; These need the MXCSR register to be loaded and saved around the instruction.
6431;
6432; @param 1 The instruction name.
6433;
6434; @param A0 Pointer to the MXCSR value (input/output).
6435; @param A1 Pointer to the first media register sized operand (input/output).
6436; @param A2 The 64-bit source value from an MMX media register (input).
6437;
6438%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6439BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6440 PROLOGUE_3_ARGS
6441 IEMIMPL_SSE_PROLOGUE
6442 SSE_LD_FXSTATE_MXCSR_ONLY A0
6443
6444 movdqu xmm0, [A1]
6445 movq mm0, A2
6446 %1 xmm0, mm0
6447 movdqu [A1], xmm0
6448
6449 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6450 IEMIMPL_SSE_EPILOGUE
6451 EPILOGUE_3_ARGS
6452ENDPROC iemAImpl_ %+ %1 %+ _u128
6453%endmacro
6454
6455IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6456IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6457
6458;;
6459; SSE instructions of the form
6460; xxx mm, xmm/m64.
6461; These need the MXCSR register to be loaded and saved around the instruction.
6462;
6463; @param 1 The instruction name.
6464;
6465; @param A0 Pointer to the MXCSR value (input/output).
6466; @param A1 Pointer to the first MMX media register sized operand (output).
6467; @param A2 The 64-bit source value (input).
6468;
6469%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6470BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6471 PROLOGUE_3_ARGS
6472 IEMIMPL_SSE_PROLOGUE
6473 SSE_LD_FXSTATE_MXCSR_ONLY A0
6474
6475 movq xmm0, A2
6476 %1 mm0, xmm0
6477 movq [A1], mm0
6478
6479 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6480 IEMIMPL_SSE_EPILOGUE
6481 EPILOGUE_3_ARGS
6482ENDPROC iemAImpl_ %+ %1 %+ _u128
6483%endmacro
6484
6485IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6486IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6487
6488;
6489; All forms of RDRAND and RDSEED
6490;
6491; @param A0 Pointer to the destination operand.
6492; @param A1 Pointer to the EFLAGS value (input/output).
6493;
6494%macro IEMIMPL_RDRAND_RDSEED 3
6495BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6496 PROLOGUE_2_ARGS
6497
6498 %1 %2
6499 mov [A0], %2
6500 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6501
6502 EPILOGUE_2_ARGS
6503ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6504%endmacro
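;
; Note (descriptive, not in the original comments): the macro parameters are the
; instruction name (%1), the register to use (%2) and the operand width in bits (%3).
; Both rdrand and rdseed report success via CF (CF=1 means a valid value was stored)
; and clear OF, SF, ZF, AF and PF, which IEM_SAVE_FLAGS then writes to the guest
; EFLAGS at [A1].
;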
6505
6506IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6507IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6508IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6509IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6510IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6511IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6512
6513
6514;;
6515; sha1rnds4 xmm1, xmm2, imm8.
6516;
6519; @param A0 Pointer to the first media register size operand (input/output).
6520; @param A1 Pointer to the second source media register size operand (input).
6521; @param A2 The 8-bit immediate (input).
6522;
6523BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6524 PROLOGUE_3_ARGS
6525 IEMIMPL_SSE_PROLOGUE
6526
6527 movdqu xmm0, [A0]
6528 movdqu xmm1, [A1]
6529 lea T1, [.imm0 xWrtRIP]
6530 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6531 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6532 %else
6533 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6534 %endif
6535 lea T1, [T1 + T0*2]
6536 IBT_NOTRACK
6537 call T1
6538 movdqu [A0], xmm0
6539
6540 IEMIMPL_SSE_EPILOGUE
6541 EPILOGUE_3_ARGS
6542 %assign bImm 0
6543 %rep 256
6544.imm %+ bImm:
6545 IBT_ENDBRxx_WITHOUT_NOTRACK
6546 sha1rnds4 xmm0, xmm1, bImm
6547 ret
6548 %assign bImm bImm + 1
6549 %endrep
6550.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6551ENDPROC iemAImpl_sha1rnds4_u128
6552
6553
6554;;
6555; sha256rnds2 xmm1, xmm2, <XMM0>.
6556;
6559; @param A0 Pointer to the first media register size operand (input/output).
6560; @param A1 Pointer to the second source media register size operand (input).
6561; @param A2 Pointer to the implicit XMM0 constants (input).
6562;
6563BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6564 PROLOGUE_3_ARGS
6565 IEMIMPL_SSE_PROLOGUE
6566
6567 movdqu xmm0, [A2]
6568 movdqu xmm1, [A0]
6569 movdqu xmm2, [A1]
6570 sha256rnds2 xmm1, xmm2
6571 movdqu [A0], xmm1
6572
6573 IEMIMPL_SSE_EPILOGUE
6574 EPILOGUE_3_ARGS
6575ENDPROC iemAImpl_sha256rnds2_u128
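;
; Note (descriptive): sha256rnds2 uses XMM0 as an implicit third operand, so the
; helper above first loads the caller-supplied constants from [A2] into the host
; xmm0 before running the instruction on xmm1/xmm2 and writing xmm1 back to [A0].
;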
6576
6577
6578;
6579; 32-bit forms of ADCX and ADOX
6580;
6581; @param A0 Pointer to the destination operand (input/output).
6582; @param A1 Pointer to the EFLAGS value (input/output).
6583; @param A2 32-bit source operand 1 (input).
6584;
6585%macro IEMIMPL_ADX_32 2
6586BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6587 PROLOGUE_4_ARGS
6588
6589 IEM_LOAD_FLAGS A1, %2, 0
6590 %1 A2_32, [A0]
6591 mov [A0], A2_32
6592 IEM_SAVE_FLAGS A1, %2, 0
6593
6594 EPILOGUE_4_ARGS
6595ENDPROC iemAImpl_ %+ %1 %+ _u32
6596%endmacro
6597
6598;
6599; 64-bit forms of ADCX and ADOX
6600;
6601; @param A0 Pointer to the destination operand (input/output).
6602; @param A1 Pointer to the EFLAGS value (input/output).
6603; @param A2 64-bit source operand 1 (input).
6604;
6605%macro IEMIMPL_ADX_64 2
6606BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6607 PROLOGUE_4_ARGS
6608
6609 IEM_LOAD_FLAGS A1, %2, 0
6610 %1 A2, [A0]
6611 mov [A0], A2
6612 IEM_SAVE_FLAGS A1, %2, 0
6613
6614 EPILOGUE_4_ARGS
6615ENDPROC iemAImpl_ %+ %1 %+ _u64
6616%endmacro
6617
6618IEMIMPL_ADX_32 adcx, X86_EFL_CF
6619IEMIMPL_ADX_64 adcx, X86_EFL_CF
6620
6621IEMIMPL_ADX_32 adox, X86_EFL_OF
6622IEMIMPL_ADX_64 adox, X86_EFL_OF
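;
; Note (descriptive): adcx only consumes and produces CF while adox only consumes and
; produces OF, which is why the instantiations above pass X86_EFL_CF respectively
; X86_EFL_OF to IEM_LOAD_FLAGS / IEM_SAVE_FLAGS - only the relevant flag is
; transferred between the guest EFLAGS at [A1] and the host flags.
;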