VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 96860

Last change on this file since 96860 was 96796, checked in by vboxsync, 20 months ago

VMM/IEM: Implement cvtdq2ps/cvtps2dq/cvttps2dq/cvttpd2dq/cvtdq2pd/cvtpd2dq instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 171.2 KB
Line 
1; $Id: IEMAllAImpl.asm 96796 2022-09-19 19:09:53Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2022 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; Return from a fastcall function.
;
; Only 32-bit Windows builds emit 'ret <imm>'; every other configuration emits
; a plain 'ret' and ignores the argument.
;
; @param        1       Number of stack bytes to release on return (32-bit Windows only).
;
%macro RET_FASTCALL 1
%ifndef RT_ARCH_X86
        ret
%else
 %ifndef RT_OS_WINDOWS
        ret
 %else
        ret     %1
 %endif
%endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Expands to plain NAME(a_Name) everywhere except on 32-bit Windows, where the
; symbol is pasted together as a_Prefix + a_Name + '@' + a_cbArgs.
;
; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;       escaping (or whatever the dollar is good for here).  Thus the ugly
;       prefix argument.
;
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %else
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
 %endif
%else
 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Emits the export/global directives selected by the object format defines
; and then the entry label itself.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
  export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
   export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
  global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
93;
; We employ some macro assembly here to hide the calling convention differences.
95;
96%ifdef RT_ARCH_AMD64
; AMD64 prologue/epilogue macros.
;
; The prologues emit nothing and every epilogue is a plain 'ret'.  The byte
; count given to an EPILOGUE_N_ARGS_EX macro is only meaningful for the 32-bit
; x86 variants and is ignored here.
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 ; Takes an optional byte count so that 'EPILOGUE_1_ARGS_EX <cb>', which the
 ; x86 variant requires, assembles on AMD64 too.  The argument-less form is
 ; still accepted.
 %macro EPILOGUE_1_ARGS_EX 0-1
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro
132
; Argument registers when ASM_CALL64_GCC is defined: rdi, rsi, rdx, rcx.
; Note that no 8-bit alias is defined for A3.
 %ifdef ASM_CALL64_GCC
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif
153
; Argument registers when ASM_CALL64_MSC is defined: rcx, rdx, r8, r9.
; Note that no 8-bit alias is defined for A3.
 %ifdef ASM_CALL64_MSC
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif
174
; Temporary registers on AMD64: T0 = rax, T1 = r11, T2 = r10.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; T2 only exists on AMD64.
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b
189
190%else
191 ; x86
; 32-bit x86 prologue/epilogue macros.
;
; A0 and A1 arrive in ecx and edx.  A2 (ebx) and A3 (esi) are loaded from the
; stack by the prologue.  edi doubles as T1, so every prologue saves it.  The
; epilogues restore the saved registers and release the stack arguments via
; 'ret <imm>'.
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; A2: skip the saved ebx and the return address.
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; A2: skip the three saved registers and the return address.
        mov     esi, [esp + 12 + 4 + 4] ; A3: the stack slot after A2.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro
252
; Argument and temporary registers on 32-bit x86.
; No 8-bit aliases are defined for A3 (esi) and T1 (edi).
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
280%endif
281
282
;;
; Loads the modified (%2) and undefined (%3) flags from [%1] into the host
; EFLAGS register, leaving all other host flags as they are.
;
; The load is currently unconditional: the check on the undefined flags (%3)
; is commented out.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                                   ; Host flags -> stack.
        mov     T0_32, [%1]                     ; T0 = guest flags.
        and     dword [xSP], ~(%2 | %3)         ; Strip the modified + undefined bits from the host copy...
        and     T0_32, (%2 | %3)                ; ...and keep only those bits of the guest copy.
        or      [xSP], T0                       ; Merge guest bits into the host copy.
        popf                                    ; Stack -> host flags.
 ;%endif
%endmacro
302
;;
; Writes the modified and undefined flags from the host EFLAGS register back
; to [%1], leaving all other bits of the stored value unchanged.
;
; Expands to nothing when both masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                              ; T1 = host flags.
        mov     T0_32, [%1]                     ; T0 = stored flags.
        and     T0_32, ~(%2 | %3)               ; Drop the bits being replaced.
        and     T1_32, (%2 | %3)                ; Keep only the replacement bits.
        or      T0_32, T1_32                    ; Merge.
        mov     [%1], T0_32                     ; Store the result.
 %endif
%endmacro
322
;;
; Updates the flags at [%1] from the host EFLAGS register, then applies fixed
; clear and set masks.
;
; Expands to nothing when all three masks are zero.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                              ; T1 = host flags.
        mov     T0_32, [%1]                     ; T0 = stored flags.
        and     T0_32, ~(%2 | %3)               ; Drop the modified and the always-cleared bits.
        and     T1_32, (%2)                     ; Keep only the modified bits of the host flags.
        or      T0_32, T1_32                    ; Merge.
  %if (%4) != 0
        or      T0_32, %4                       ; Force the always-set bits.
  %endif
        mov     [%1], T0_32                     ; Store the result.
 %endif
%endmacro
346
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; Used by MUL and IMUL, where the result (%4 & %6) sits in xAX, which is also
; T0.  The flag calculation therefore happens in T1.  The %2 flags are pulled
; out of the CPU EFLAGS via T2 on AMD64 and via a saved-and-restored T0 on x86.
;
; @remarks      Clobbers T1, T2 (AMD64 only), stack, %6, EFLAGS.  T0 is left
;               unchanged: saved and restored on x86, not touched on AMD64.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                              ; T2 = host flags.
 %else
        push    T0                              ; T0 holds the result, so park it on the stack.
        pushf
        pop     T0                              ; T0 = host flags.
 %endif
        mov     T1_32, [%1]                     ; T1 = stored flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; Drop modified, always-cleared, PF and SF bits.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)                     ; Keep only the modified bits of the host flags.
        or      T1_32, T2_32                    ; Merge.
 %else
        and     T0_32, (%2)                     ; Keep only the modified bits of the host flags.
        or      T1_32, T0_32                    ; Merge.
        pop     T0                              ; Bring the result back.
 %endif

        ; SF goes first because %4 most likely aliases %6, which gets masked below.
        bt      %4, %5 - 1                      ; CF = top bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; PF comes from the g_afParity table, indexed by the low result byte.
        and     %6, 0xff
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32                     ; Store the result.
%endmacro
400
;;
; Applies fixed clear and set masks to the flags stored at [%1].
;
; The host EFLAGS register is not consulted.  Expands to nothing when both
; masks are zero.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]                     ; T0 = stored flags.
  %if (%2) != 0
        and     T0_32, ~(%2)                    ; Drop the always-cleared bits.
  %endif
  %if (%3) != 0
        or      T0_32, %3                       ; Force the always-set bits.
  %endif
        mov     [%1], T0_32                     ; Store the result.
 %endif
%endmacro
421
;;
; Applies fixed clear and set masks to the flags stored at [%1] and sets PF
; from the g_afParity table entry selected by the low byte of %4.
;
; @remarks      Clobbers T0, T2 (AMD64 only), %4, EFLAGS.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index.  Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]                     ; T0 = stored flags.
        and     T0_32, ~(%2 | X86_EFL_PF)       ; Drop PF and the always-cleared bits.
 %if (%3) != 0
        or      T0_32, %3                       ; Force the always-set bits.
 %endif
        and     %4, 0xff                        ; Reduce %4 to a table index.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32                     ; Store the result.
%endmacro
446
447
448;*********************************************************************************************************************************
449;* External Symbols *
450;*********************************************************************************************************************************
451extern NAME(g_afParity)
452
453
;;
; Generates the workers for a binary operator with a memory destination.
;
; Produces 8, 16, 32 and (AMD64 only) 64 bit workers, plus 'lock' prefixed
; twins when requested.  On 32-bit systems the 64-bit accesses need hand
; coding.
;
; Each worker receives a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags,                                                                            undefined-flags
IEMIMPL_BIN_OP add,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,    1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,   1,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,   0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test,  0,    (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
553
554
;;
; Generates the workers for a three-operand (VEX) binary operator that
; updates EFLAGS.
;
; Produces 32 and (AMD64 only) 64 bit workers.  On 32-bit systems the 64-bit
; accesses need hand coding.
;
; Each worker receives a pointer to the destination memory operand in A0, the
; first source register operand in A1, the second source register operand in
; A2 and a pointer to eflags in A3.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS       A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                  instr, modified-flags,                                          undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
595
;;
; Macro for implementing BLSR, BLSMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; These are three-argument workers, so the 3-arg prologue/epilogue pair must
; be used.  With the 4-arg pair the 32-bit x86 epilogue would execute 'ret 8'
; although only the eflags pointer (4 bytes) was passed on the stack.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr,  modified-flags,                                          undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
637
638
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand in A2.  No eflags pointer is passed.
;
; The optional fallback workers use the legacy instruction (%2) with the count
; in CL.  When ASM_CALL64_GCC is defined, A2 is rdx and its low byte is copied
; to CL.  Otherwise A0 is xCX, so A0 and A2 are swapped: the count lands in CL
; and A2 becomes the destination pointer.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; Store the full 64-bit result.
   %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                ; Store the full 64-bit result via A2; A0 now holds the count.
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                        instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx,  sar,            1
IEMIMPL_VEX_BIN_OP_NOEFL shlx,  shl,            1
IEMIMPL_VEX_BIN_OP_NOEFL shrx,  shr,            1
IEMIMPL_VEX_BIN_OP_NOEFL pdep,  nop,            0
IEMIMPL_VEX_BIN_OP_NOEFL pext,  nop,            0
710
711
;
; RORX takes its shift count as an immediate byte, so only a fallback
; implementation using ROR with the count in CL is provided.
;
; A0 points to the destination, A1 holds the source value and A2 the count.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; A2 is rdx here, so copy its low byte to CL.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; A0 is xCX here: CL = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32
729
730 %ifdef RT_ARCH_AMD64
731BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
732 PROLOGUE_3_ARGS
733 %ifdef ASM_CALL64_GCC
734 mov cl, A2_8
735 ror A1, cl
736 mov [A0], A1_32
737 %else
738 xchg A2, A0
739 ror A1, cl
740 mov [A2], A1_32
741 %endif
742 mov [A0], A1
743 EPILOGUE_3_ARGS
744ENDPROC iemAImpl_rorx_u64
745 %endif ; RT_ARCH_AMD64
746
747
;
; MULX
;
; A0 points to the first destination (high half), A1 to the second (low half),
; A2 holds uSrc1 (which MULX takes from EDX) and A3 the other factor.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX already - perfect.
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low half first: if both pointers are the same, the high half must win.
        mov     [A0], T0_32
%else
        ; A1 is xDX, so swap A1 and A2 to get EDX=uSrc1.  A2 then points to the low half.
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low half first: if both pointers are the same, the high half must win.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32
767
768
;
; MULX fallback using the legacy MUL instruction (EDX:EAX = EAX * operand).
;
; A0 points to the first destination (high half), A1 to the second (low half),
; A2 holds uSrc1 and A3 the other factor.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32                   ; uSrc1 is in EDX (A1) after the swap; A2 now holds the low-half pointer.
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback
787
788%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        ; 64-bit MULX: A0 -> high half, A1 -> low half, A2 = uSrc1 (MULX takes it from RDX), A3 = other factor.
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX already - perfect.
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low half first: if both pointers are the same, the high half must win.
        mov     [A0], T0
%else
        ; A1 is xDX, so swap A1 and A2 to get RDX=uSrc1.  A2 then points to the low half.
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low half first: if both pointers are the same, the high half must win.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64
805
806
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        ; 64-bit MULX fallback using the legacy MUL instruction (RDX:RAX = RAX * operand).
        ; A0 -> high half, A1 -> low half, A2 = uSrc1, A3 = other factor.
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A1                      ; uSrc1 is in RDX (A1) after the swap; A2 now holds the low-half pointer.
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback
825
826%endif
827
828
;;
; Generates the workers for a bit operator with a memory destination.
;
; Produces 16, 32 and (AMD64 only) 64 bit workers, plus 'lock' prefixed twins
; when requested.  On 32-bit systems the 64-bit accesses need hand coding.
;
; Each worker receives a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS       A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags, undefined-flags
IEMIMPL_BIT_OP bt,    0,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc,   1,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts,   1,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr,   1,    (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
905
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leave it as is.  The undefined EFLAGS differ between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variations with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three workers are generated per operand size:
;   - native: loads the guest flags, runs the instruction, saves %2 and %3.
;   - _intel: computes the stored flags purely from the result (ZF from the
;             instruction, PF from the parity table).  The host EFLAGS are
;             never copied back, so no guest flags are loaded beforehand.
;   - _amd:   saves only the modified flags (%2) from the host EFLAGS.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        ; No IEM_MAYBE_LOAD_FLAGS here, matching the 16 and 32 bit _intel workers:
        ; the host EFLAGS are never saved back, so loading guest flags has no effect.
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0  ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

;               instr, modified-flags,            undefined-flags,                                                     dst unchanged on ZF
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF),              (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1057
1058
;;
; Generates the workers for POPCNT.
;
; Produces 16, 32 and (AMD64 only) 64 bit workers.  On 32-bit systems the
; 64-bit accesses need hand coding.
;
; Each worker receives a pointer to the destination memory operand in A0, the
; source register operand in A1 and a pointer to eflags in A2.  The result is
; computed in T0 and then stored through A0.
;
; ASSUMES Intel and AMD set EFLAGS the same way.
;
; ASSUMES the instruction does not support memory destination.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS       A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;               instr,  modified-flags,                                                                            undefined-flags
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1108
1109
1110;
1111; IMUL is also a similar but yet different case (no lock, no mem dst).
1112; The rDX:rAX variant of imul is handled together with mul further down.
1113;
1114BEGINCODE
1115; @param 1 EFLAGS that are modified.
1116; @param 2 Undefined EFLAGS.
1117; @param 3 Function suffix.
1118; @param 4 EFLAGS variation: 0 for native, 1 for intel (flags are saved via
1119; IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF), 2 for AMD (saved like native).
1120%macro IEMIMPL_IMUL_TWO 4
; Two-operand IMUL, 16-bit: A0 points to the destination value, A1 holds the
; other factor and A2 is the eflags pointer passed to the flag helpers.
1121BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1122 PROLOGUE_3_ARGS
1123 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1124 imul A1_16, word [A0] ; A1 = A1 * destination value
1125 mov [A0], A1_16 ; write the product back to the destination
; Only EFLAGS variation 1 takes the else-branch; variations 0 and 2 both use
; the plain IEM_SAVE_FLAGS path.
1126 %if %4 != 1
1127 IEM_SAVE_FLAGS A2, %1, %2
1128 %else
1129 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1130 %endif
1131 EPILOGUE_3_ARGS
1132ENDPROC iemAImpl_imul_two_u16 %+ %3
1133
; 32-bit variant of the same sequence.
1134BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1135 PROLOGUE_3_ARGS
1136 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1137 imul A1_32, dword [A0] ; A1 = A1 * destination value
1138 mov [A0], A1_32 ; write the product back to the destination
1139 %if %4 != 1
1140 IEM_SAVE_FLAGS A2, %1, %2
1141 %else
1142 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1143 %endif
1144 EPILOGUE_3_ARGS
1145ENDPROC iemAImpl_imul_two_u32 %+ %3
1146
1147 %ifdef RT_ARCH_AMD64
; 64-bit variant: only assembled when RT_ARCH_AMD64 is defined.
1148BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1149 PROLOGUE_3_ARGS
1150 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1151 imul A1, qword [A0] ; A1 = A1 * destination value
1152 mov [A0], A1 ; write the product back to the destination
1153 %if %4 != 1
1154 IEM_SAVE_FLAGS A2, %1, %2
1155 %else
1156 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1157 %endif
1158 EPILOGUE_3_ARGS_EX 8
1159ENDPROC iemAImpl_imul_two_u64 %+ %3
1160 %endif ; RT_ARCH_AMD64
1161%endmacro
; Three instantiations: native (no suffix), _intel and _amd. The native one
; declares SF/ZF/AF/PF as undefined, the vendor specific ones declare none.
1162IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1163IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1164IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1165
1166
1167;
1168; XCHG for memory operands. This implies locking. No flag changes.
1169;
1170; Each function takes two arguments, first the pointer to the memory,
1171; then the pointer to the register. They all return void.
1172;
1173BEGINCODE
; Swaps the byte at [A0] (memory) with the byte at [A1] (register value).
1174BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1175 PROLOGUE_2_ARGS
1176 mov T0_8, [A1] ; T0 = register value
1177 xchg [A0], T0_8 ; xchg with a memory operand is implicitly locked by the CPU
1178 mov [A1], T0_8 ; hand the old memory value back to the register
1179 EPILOGUE_2_ARGS
1180ENDPROC iemAImpl_xchg_u8_locked
1181
; Swaps the word at [A0] (memory) with the word at [A1] (register value).
1182BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1183 PROLOGUE_2_ARGS
1184 mov T0_16, [A1] ; T0 = register value
1185 xchg [A0], T0_16 ; xchg with a memory operand is implicitly locked by the CPU
1186 mov [A1], T0_16 ; hand the old memory value back to the register
1187 EPILOGUE_2_ARGS
1188ENDPROC iemAImpl_xchg_u16_locked
1189
; Swaps the dword at [A0] (memory) with the dword at [A1] (register value).
1190BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1191 PROLOGUE_2_ARGS
1192 mov T0_32, [A1] ; T0 = register value
1193 xchg [A0], T0_32 ; xchg with a memory operand is implicitly locked by the CPU
1194 mov [A1], T0_32 ; hand the old memory value back to the register
1195 EPILOGUE_2_ARGS
1196ENDPROC iemAImpl_xchg_u32_locked
1197
; 64-bit exchange; only assembled when RT_ARCH_AMD64 is defined.
1198%ifdef RT_ARCH_AMD64
1199BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1200 PROLOGUE_2_ARGS
1201 mov T0, [A1] ; T0 = register value
1202 xchg [A0], T0 ; xchg with a memory operand is implicitly locked by the CPU
1203 mov [A1], T0 ; hand the old memory value back to the register
1204 EPILOGUE_2_ARGS
1205ENDPROC iemAImpl_xchg_u64_locked
1206%endif
1207
1208; Unlocked variants for fDisregardLock mode.
1209
; Non-atomic byte swap between [A0] and [A1] using plain loads and stores.
1210BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1211 PROLOGUE_2_ARGS
1212 mov T0_8, [A1] ; T0 = register value
1213 mov T1_8, [A0] ; T1 = memory value
1214 mov [A0], T0_8 ; memory <- register value
1215 mov [A1], T1_8 ; register <- old memory value
1216 EPILOGUE_2_ARGS
1217ENDPROC iemAImpl_xchg_u8_unlocked
1218
; Non-atomic word swap between [A0] and [A1] using plain loads and stores.
1219BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1220 PROLOGUE_2_ARGS
1221 mov T0_16, [A1] ; T0 = register value
1222 mov T1_16, [A0] ; T1 = memory value
1223 mov [A0], T0_16 ; memory <- register value
1224 mov [A1], T1_16 ; register <- old memory value
1225 EPILOGUE_2_ARGS
1226ENDPROC iemAImpl_xchg_u16_unlocked
1227
; Non-atomic dword swap between [A0] and [A1] using plain loads and stores.
1228BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1229 PROLOGUE_2_ARGS
1230 mov T0_32, [A1] ; T0 = register value
1231 mov T1_32, [A0] ; T1 = memory value
1232 mov [A0], T0_32 ; memory <- register value
1233 mov [A1], T1_32 ; register <- old memory value
1234 EPILOGUE_2_ARGS
1235ENDPROC iemAImpl_xchg_u32_unlocked
1236
; Non-atomic qword swap; only assembled when RT_ARCH_AMD64 is defined.
1237%ifdef RT_ARCH_AMD64
1238BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1239 PROLOGUE_2_ARGS
1240 mov T0, [A1] ; T0 = register value
1241 mov T1, [A0] ; T1 = memory value
1242 mov [A0], T0 ; memory <- register value
1243 mov [A1], T1 ; register <- old memory value
1244 EPILOGUE_2_ARGS
1245ENDPROC iemAImpl_xchg_u64_unlocked
1246%endif
1247
1248
1249;
1250; XADD for memory operands.
1251;
1252; Each function takes three arguments, first the pointer to the
1253; memory/register, then the pointer to the register, and finally a pointer to
1254; eflags. They all return void.
1255;
1256BEGINCODE
; 8-bit XADD without lock prefix: [A0] += [A1], [A1] receives the old [A0].
1257BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1258 PROLOGUE_3_ARGS
1259 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1260 mov T0_8, [A1] ; T0 = register operand
1261 xadd [A0], T0_8 ; destination = destination + T0, T0 = old destination
1262 mov [A1], T0_8 ; register operand receives the old destination value
1263 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1264 EPILOGUE_3_ARGS
1265ENDPROC iemAImpl_xadd_u8
1266
; 16-bit XADD without lock prefix: [A0] += [A1], [A1] receives the old [A0].
1267BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1268 PROLOGUE_3_ARGS
1269 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1270 mov T0_16, [A1] ; T0 = register operand
1271 xadd [A0], T0_16 ; destination = destination + T0, T0 = old destination
1272 mov [A1], T0_16 ; register operand receives the old destination value
1273 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1274 EPILOGUE_3_ARGS
1275ENDPROC iemAImpl_xadd_u16
1276
; 32-bit XADD without lock prefix: [A0] += [A1], [A1] receives the old [A0].
1277BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1278 PROLOGUE_3_ARGS
1279 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1280 mov T0_32, [A1] ; T0 = register operand
1281 xadd [A0], T0_32 ; destination = destination + T0, T0 = old destination
1282 mov [A1], T0_32 ; register operand receives the old destination value
1283 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1284 EPILOGUE_3_ARGS
1285ENDPROC iemAImpl_xadd_u32
1286
; 64-bit XADD without lock prefix; only assembled when RT_ARCH_AMD64 is defined.
1287%ifdef RT_ARCH_AMD64
1288BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1289 PROLOGUE_3_ARGS
1290 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1291 mov T0, [A1] ; T0 = register operand
1292 xadd [A0], T0 ; destination = destination + T0, T0 = old destination
1293 mov [A1], T0 ; register operand receives the old destination value
1294 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1295 EPILOGUE_3_ARGS
1296ENDPROC iemAImpl_xadd_u64
1297%endif ; RT_ARCH_AMD64
1298
; 8-bit XADD with lock prefix: [A0] += [A1] atomically, [A1] receives the old [A0].
1299BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1300 PROLOGUE_3_ARGS
1301 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1302 mov T0_8, [A1] ; T0 = register operand
1303 lock xadd [A0], T0_8 ; atomic: destination += T0, T0 = old destination
1304 mov [A1], T0_8 ; register operand receives the old destination value
1305 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1306 EPILOGUE_3_ARGS
1307ENDPROC iemAImpl_xadd_u8_locked
1308
; 16-bit XADD with lock prefix: [A0] += [A1] atomically, [A1] receives the old [A0].
1309BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1310 PROLOGUE_3_ARGS
1311 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1312 mov T0_16, [A1] ; T0 = register operand
1313 lock xadd [A0], T0_16 ; atomic: destination += T0, T0 = old destination
1314 mov [A1], T0_16 ; register operand receives the old destination value
1315 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1316 EPILOGUE_3_ARGS
1317ENDPROC iemAImpl_xadd_u16_locked
1318
; 32-bit XADD with lock prefix: [A0] += [A1] atomically, [A1] receives the old [A0].
1319BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1320 PROLOGUE_3_ARGS
1321 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1322 mov T0_32, [A1] ; T0 = register operand
1323 lock xadd [A0], T0_32 ; atomic: destination += T0, T0 = old destination
1324 mov [A1], T0_32 ; register operand receives the old destination value
1325 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1326 EPILOGUE_3_ARGS
1327ENDPROC iemAImpl_xadd_u32_locked
1328
; 64-bit XADD with lock prefix; only assembled when RT_ARCH_AMD64 is defined.
1329%ifdef RT_ARCH_AMD64
1330BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1331 PROLOGUE_3_ARGS
1332 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1333 mov T0, [A1] ; T0 = register operand
1334 lock xadd [A0], T0 ; atomic: destination += T0, T0 = old destination
1335 mov [A1], T0 ; register operand receives the old destination value
1336 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1337 EPILOGUE_3_ARGS
1338ENDPROC iemAImpl_xadd_u64_locked
1339%endif ; RT_ARCH_AMD64
1340
1341
1342;
1343; CMPXCHG8B.
1344;
1345; These are tricky register wise, so the code is duplicated for each calling
1346; convention.
1347;
1348; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1349;
1350; C-proto:
1351; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1352; uint32_t *pEFlags));
1353;
1354; Note! Identical to iemAImpl_cmpxchg16b.
1355;
1356BEGINCODE
; Executes "lock cmpxchg8b" on the destination with EDX:EAX taken from
; pu64EaxEdx and ECX:EBX taken from pu64EbxEcx, writes EDX:EAX back to
; pu64EaxEdx afterwards and saves ZF through pEFlags. Three hand written
; variants follow: AMD64 with ASM_CALL64_MSC, AMD64 otherwise, and 32-bit.
1357BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1358%ifdef RT_ARCH_AMD64
1359 %ifdef ASM_CALL64_MSC
1360 push rbx ; rbx is loaded with the low half of the new value below
1361
; rcx and rdx are implicit cmpxchg8b operands, so the pointers they carry
; are moved out of the way first.
1362 mov r11, rdx ; pu64EaxEdx (is also T1)
1363 mov r10, rcx ; pu64Dst
1364
1365 mov ebx, [r8] ; ecx:ebx = replacement value from pu64EbxEcx
1366 mov ecx, [r8 + 4]
1367 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1368 mov eax, [r11] ; edx:eax = expected value; loaded after the flag helper since it clobbers eax
1369 mov edx, [r11 + 4]
1370
1371 lock cmpxchg8b [r10]
1372
; Store edx:eax before IEM_SAVE_FLAGS, which clobbers eax and r11.
1373 mov [r11], eax
1374 mov [r11 + 4], edx
1375 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1376
1377 pop rbx
1378 ret
1379 %else
1380 push rbx ; rbx is loaded with the low half of the new value below
1381
; rcx and rdx are implicit cmpxchg8b operands, so the pointers they carry
; are moved out of the way first; rdi and rsi are used directly.
1382 mov r10, rcx ; pEFlags
1383 mov r11, rdx ; pu64EbxEcx (is also T1)
1384
1385 mov ebx, [r11] ; ecx:ebx = replacement value from pu64EbxEcx
1386 mov ecx, [r11 + 4]
1387 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1388 mov eax, [rsi] ; edx:eax = expected value; loaded after the flag helper since it clobbers eax
1389 mov edx, [rsi + 4]
1390
1391 lock cmpxchg8b [rdi]
1392
1393 mov [rsi], eax ; write edx:eax back to pu64EaxEdx
1394 mov [rsi + 4], edx
1395 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1396
1397 pop rbx
1398 ret
1399
1400 %endif
1401%else
1402 push esi
1403 push edi
1404 push ebx
1405 push ebp
1406
1407 mov edi, ecx ; pu64Dst
1408 mov esi, edx ; pu64EaxEdx
; Stack arguments: 16 bytes for the four pushes above plus 4 bytes that are
; skipped before the first stack argument (presumably the return address).
1409 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1410 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1411
1412 mov ebx, [ecx] ; ecx:ebx = replacement value; ebx is read first because ecx is the pointer
1413 mov ecx, [ecx + 4]
1414 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1415 mov eax, [esi] ; edx:eax = expected value
1416 mov edx, [esi + 4]
1417
1418 lock cmpxchg8b [edi]
1419
1420 mov [esi], eax ; write edx:eax back to pu64EaxEdx
1421 mov [esi + 4], edx
1422 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1423
1424 pop ebp
1425 pop ebx
1426 pop edi
1427 pop esi
1428 ret 8 ; also releases the two stack arguments (8 bytes)
1429%endif
1430ENDPROC iemAImpl_cmpxchg8b
1431
; The locked entry point simply tail-jumps to iemAImpl_cmpxchg8b, whose
; implementation already carries the lock prefix in every variant.
1432BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1433 ; Lazy bird always lock prefixes cmpxchg8b.
1434 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1435ENDPROC iemAImpl_cmpxchg8b_locked
1436
1437%ifdef RT_ARCH_AMD64
1438
1439;
1440; CMPXCHG16B.
1441;
1442; These are tricky register wise, so the code is duplicated for each calling
1443; convention.
1444;
1445; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1446;
1447; C-proto:
1448; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
1449; uint32_t *pEFlags));
1450;
1451; Note! Identical to iemAImpl_cmpxchg8b.
1452;
1453BEGINCODE
; Executes "lock cmpxchg16b" on the destination with RDX:RAX taken from the
; second argument and RCX:RBX from the third, writes RDX:RAX back to the
; second argument afterwards and saves ZF through pEFlags. Two hand written
; variants follow: ASM_CALL64_MSC and the other 64-bit calling convention.
1454BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1455 %ifdef ASM_CALL64_MSC
1456 push rbx ; rbx is loaded with the low half of the new value below
1457
; rcx and rdx are implicit cmpxchg16b operands, so the pointers they carry
; are moved out of the way first.
1458 mov r11, rdx ; pu64RaxRdx (is also T1)
1459 mov r10, rcx ; pu64Dst
1460
1461 mov rbx, [r8] ; rcx:rbx = replacement value
1462 mov rcx, [r8 + 8]
1463 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1464 mov rax, [r11] ; rdx:rax = expected value; loaded after the flag helper since it clobbers eax
1465 mov rdx, [r11 + 8]
1466
1467 lock cmpxchg16b [r10]
1468
; Store rdx:rax before IEM_SAVE_FLAGS, which clobbers eax and r11.
1469 mov [r11], rax
1470 mov [r11 + 8], rdx
1471 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1472
1473 pop rbx
1474 ret
1475 %else
1476 push rbx ; rbx is loaded with the low half of the new value below
1477
; rcx and rdx are implicit cmpxchg16b operands, so the pointers they carry
; are moved out of the way first; rdi and rsi are used directly.
1478 mov r10, rcx ; pEFlags
1479 mov r11, rdx ; pu64RbxRcx (is also T1)
1480
1481 mov rbx, [r11] ; rcx:rbx = replacement value
1482 mov rcx, [r11 + 8]
1483 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1484 mov rax, [rsi] ; rdx:rax = expected value; loaded after the flag helper since it clobbers eax
1485 mov rdx, [rsi + 8]
1486
1487 lock cmpxchg16b [rdi]
1488
1489 mov [rsi], rax ; write rdx:rax back to the second argument
1490 mov [rsi + 8], rdx
1491 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1492
1493 pop rbx
1494 ret
1495
1496 %endif
1497ENDPROC iemAImpl_cmpxchg16b
1498
; The locked entry point simply tail-jumps to iemAImpl_cmpxchg16b, whose
; implementation already carries the lock prefix in both variants.
1499BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1500 ; Lazy bird always lock prefixes cmpxchg16b.
1501 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1502ENDPROC iemAImpl_cmpxchg16b_locked
1503
1504%endif ; RT_ARCH_AMD64
1505
1506
1507;
1508; CMPXCHG.
1509;
1510; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1511;
1512; C-proto:
1513; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1514;
1515BEGINCODE
1516%macro IEMIMPL_CMPXCHG 2
; Macro parameters: 1 = optional instruction prefix (empty or lock),
; 2 = suffix appended to the generated function names.
;
; Every generated function takes the destination pointer in A0, a pointer to
; the accumulator value in A1, the replacement value in A2 (a pointer to it in
; the 32-bit host u64 variant) and the eflags pointer in A3. The accumulator
; is loaded after IEM_MAYBE_LOAD_FLAGS because that helper clobbers eax, and
; it is stored back before IEM_SAVE_FLAGS for the same reason.
1517BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1518 PROLOGUE_4_ARGS
1519 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1520 mov al, [A1] ; al = expected value
1521 %1 cmpxchg [A0], A2_8
1522 mov [A1], al ; al is unchanged on success, the current destination value otherwise
1523 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1524 EPILOGUE_4_ARGS
1525ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1526
1527BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1528 PROLOGUE_4_ARGS
1529 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1530 mov ax, [A1] ; ax = expected value
1531 %1 cmpxchg [A0], A2_16
1532 mov [A1], ax ; ax is unchanged on success, the current destination value otherwise
1533 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1534 EPILOGUE_4_ARGS
1535ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1536
1537BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1538 PROLOGUE_4_ARGS
1539 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1540 mov eax, [A1] ; eax = expected value
1541 %1 cmpxchg [A0], A2_32
1542 mov [A1], eax ; eax is unchanged on success, the current destination value otherwise
1543 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1544 EPILOGUE_4_ARGS
1545ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1546
1547BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1548%ifdef RT_ARCH_AMD64
1549 PROLOGUE_4_ARGS
1550 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1551 mov rax, [A1] ; rax = expected value
1552 %1 cmpxchg [A0], A2
1553 mov [A1], rax ; rax is unchanged on success, the current destination value otherwise
1554 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1555 EPILOGUE_4_ARGS
1556%else
1557 ;
1558 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1559 ;
1560 push esi
1561 push edi
1562 push ebx
1563 push ebp
1564
1565 mov edi, ecx ; pu64Dst
1566 mov esi, edx ; pu64Rax
1567 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1568 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1569
1570 mov ebx, [ecx] ; ecx:ebx = replacement value; ebx is read first because ecx is the pointer
1571 mov ecx, [ecx + 4]
1572 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1573 mov eax, [esi] ; edx:eax = expected value
1574 mov edx, [esi + 4]
1575
1576 lock cmpxchg8b [edi]
1577
1578 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
; ZF=1 after cmpxchg8b means the values matched and the exchange happened;
; ZF=0 means the compare failed and edx:eax now hold the current destination.
; Only the failed case needs the explicit compare against the expected value
; still stored at [esi]. Branching on jz here would send the failed case into
; "cmp eax, eax" and wrongly report ZF=1 to the caller.
1579 jnz .cmpxchg8b_not_equal
1580 cmp eax, eax ; just set the other flags.
1581.store:
1582 mov [esi], eax
1583 mov [esi + 4], edx
1584 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1585
1586 pop ebp
1587 pop ebx
1588 pop edi
1589 pop esi
1590 ret 8 ; also releases the two stack arguments (8 bytes)
1591
1592.cmpxchg8b_not_equal:
; Compare expected (at [esi]) against actual (edx:eax), high dword first; the
; low dwords are only compared when the high dwords are equal.
1593 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1594 jne .store
1595 cmp [esi], eax
1596 jmp .store
1597
1598%endif
1599ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1600%endmacro ; IEMIMPL_CMPXCHG
1601
; Instantiate the plain and the lock prefixed (_locked suffix) function sets.
1602IEMIMPL_CMPXCHG , ,
1603IEMIMPL_CMPXCHG lock, _locked
1604
1605;;
1606; Macro for implementing a unary operator.
1607;
1608; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1609; variants, except on 32-bit system where the 64-bit accesses requires hand
1610; coding.
1611;
1612; All the functions take a pointer to the destination memory operand in A0
1613; and a pointer to eflags in A1.
1614;
1615; @param 1 The instruction mnemonic.
1616; @param 2 The modified flags.
1617; @param 3 The undefined flags.
1618;
1619%macro IEMIMPL_UNARY_OP 3
1620BEGINCODE
; Each generated function takes two arguments: A0 points to the operand that
; is modified in place and A1 is the eflags pointer handed to the flag
; helpers. Every width comes as a plain and a lock prefixed variant.
1621BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1622 PROLOGUE_2_ARGS
1623 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1624 %1 byte [A0] ; operate directly on the byte at A0
1625 IEM_SAVE_FLAGS A1, %2, %3
1626 EPILOGUE_2_ARGS
1627ENDPROC iemAImpl_ %+ %1 %+ _u8
1628
1629BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1630 PROLOGUE_2_ARGS
1631 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1632 lock %1 byte [A0] ; same operation with a lock prefix
1633 IEM_SAVE_FLAGS A1, %2, %3
1634 EPILOGUE_2_ARGS
1635ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1636
1637BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1638 PROLOGUE_2_ARGS
1639 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1640 %1 word [A0] ; operate directly on the word at A0
1641 IEM_SAVE_FLAGS A1, %2, %3
1642 EPILOGUE_2_ARGS
1643ENDPROC iemAImpl_ %+ %1 %+ _u16
1644
1645BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1646 PROLOGUE_2_ARGS
1647 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1648 lock %1 word [A0] ; same operation with a lock prefix
1649 IEM_SAVE_FLAGS A1, %2, %3
1650 EPILOGUE_2_ARGS
1651ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1652
1653BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1654 PROLOGUE_2_ARGS
1655 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1656 %1 dword [A0] ; operate directly on the dword at A0
1657 IEM_SAVE_FLAGS A1, %2, %3
1658 EPILOGUE_2_ARGS
1659ENDPROC iemAImpl_ %+ %1 %+ _u32
1660
1661BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1662 PROLOGUE_2_ARGS
1663 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1664 lock %1 dword [A0] ; same operation with a lock prefix
1665 IEM_SAVE_FLAGS A1, %2, %3
1666 EPILOGUE_2_ARGS
1667ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1668
; The 64-bit pair is only assembled when RT_ARCH_AMD64 is defined.
1669 %ifdef RT_ARCH_AMD64
1670BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1671 PROLOGUE_2_ARGS
1672 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1673 %1 qword [A0] ; operate directly on the qword at A0
1674 IEM_SAVE_FLAGS A1, %2, %3
1675 EPILOGUE_2_ARGS
1676ENDPROC iemAImpl_ %+ %1 %+ _u64
1677
1678BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1679 PROLOGUE_2_ARGS
1680 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1681 lock %1 qword [A0] ; same operation with a lock prefix
1682 IEM_SAVE_FLAGS A1, %2, %3
1683 EPILOGUE_2_ARGS
1684ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1685 %endif ; RT_ARCH_AMD64
1686
1687%endmacro
1688
; inc and dec leave CF out of their modified mask, neg includes it, and not
; is instantiated with empty modified and undefined masks.
1689IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1690IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1691IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1692IEMIMPL_UNARY_OP not, 0, 0
1693
1694
1695;
1696; BSWAP. No flag changes.
1697;
1698; Each function takes one argument, pointer to the value to bswap
1699; (input/output). They all return void.
1700;
; BSWAP with a 16-bit operand size. The full dword at A0 is loaded and stored,
; and the swap itself is executed with a hand emitted 66h prefix byte.
1701BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1702 PROLOGUE_1_ARGS
1703 mov T0_32, [A0] ; just in case any of the upper bits are used.
1704 db 66h ; operand-size prefix byte, applies to the bswap on the next line
1705 bswap T0_32
1706 mov [A0], T0_32 ; write back all 32 bits
1707 EPILOGUE_1_ARGS
1708ENDPROC iemAImpl_bswap_u16
1709
; Byte-swaps the dword at A0 in place.
1710BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1711 PROLOGUE_1_ARGS
1712 mov T0_32, [A0] ; load the value
1713 bswap T0_32 ; reverse its byte order
1714 mov [A0], T0_32 ; store it back
1715 EPILOGUE_1_ARGS
1716ENDPROC iemAImpl_bswap_u32
1717
; Byte-swaps the qword at A0 in place. With RT_ARCH_AMD64 a single 64-bit
; bswap is used; otherwise the two dword halves are swapped individually and
; written back in exchanged positions.
1718BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1719%ifdef RT_ARCH_AMD64
1720 PROLOGUE_1_ARGS
1721 mov T0, [A0]
1722 bswap T0
1723 mov [A0], T0
1724 EPILOGUE_1_ARGS
1725%else
1726 PROLOGUE_1_ARGS
1727 mov T0, [A0] ; T0 = low dword
1728 mov T1, [A0 + 4] ; T1 = high dword
1729 bswap T0
1730 bswap T1
1731 mov [A0 + 4], T0 ; swapped low dword becomes the new high dword
1732 mov [A0], T1 ; swapped high dword becomes the new low dword
1733 EPILOGUE_1_ARGS
1734%endif
1735ENDPROC iemAImpl_bswap_u64
1736
1737
1738;;
1739; Macro for implementing a shift operation.
1740;
1741; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1742; 32-bit system where the 64-bit accesses requires hand coding.
1743;
1744; All the functions takes a pointer to the destination memory operand in A0,
1745; the shift count in A1 and a pointer to eflags in A2.
1746;
1747; @param 1 The instruction mnemonic.
1748; @param 2 The modified flags.
1749; @param 3 The undefined flags.
1750;
1751; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1752;
1753; @note the _intel and _amd variants are implemented in C.
1754;
1755%macro IEMIMPL_SHIFT_OP 3
1756BEGINCODE
; The shift count has to be in cl. With ASM_CALL64_GCC it is copied there
; from A1; otherwise A0 and A1 are exchanged, which leaves the count in cl
; and the destination pointer in A1 (this relies on A0 being xCX in those
; calling conventions - see the ASSUMPTIONS note in the macro header).
1757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1758 PROLOGUE_3_ARGS
1759 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1760 %ifdef ASM_CALL64_GCC
1761 mov cl, A1_8 ; cl = shift count
1762 %1 byte [A0], cl
1763 %else
1764 xchg A1, A0 ; count ends up in cl, destination pointer in A1
1765 %1 byte [A1], cl
1766 %endif
1767 IEM_SAVE_FLAGS A2, %2, %3
1768 EPILOGUE_3_ARGS
1769ENDPROC iemAImpl_ %+ %1 %+ _u8
1770
1771BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1772 PROLOGUE_3_ARGS
1773 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1774 %ifdef ASM_CALL64_GCC
1775 mov cl, A1_8 ; cl = shift count
1776 %1 word [A0], cl
1777 %else
1778 xchg A1, A0 ; count ends up in cl, destination pointer in A1
1779 %1 word [A1], cl
1780 %endif
1781 IEM_SAVE_FLAGS A2, %2, %3
1782 EPILOGUE_3_ARGS
1783ENDPROC iemAImpl_ %+ %1 %+ _u16
1784
1785BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1786 PROLOGUE_3_ARGS
1787 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1788 %ifdef ASM_CALL64_GCC
1789 mov cl, A1_8 ; cl = shift count
1790 %1 dword [A0], cl
1791 %else
1792 xchg A1, A0 ; count ends up in cl, destination pointer in A1
1793 %1 dword [A1], cl
1794 %endif
1795 IEM_SAVE_FLAGS A2, %2, %3
1796 EPILOGUE_3_ARGS
1797ENDPROC iemAImpl_ %+ %1 %+ _u32
1798
; The 64-bit variant is only assembled when RT_ARCH_AMD64 is defined.
1799 %ifdef RT_ARCH_AMD64
1800BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1801 PROLOGUE_3_ARGS
1802 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1803 %ifdef ASM_CALL64_GCC
1804 mov cl, A1_8 ; cl = shift count
1805 %1 qword [A0], cl
1806 %else
1807 xchg A1, A0 ; count ends up in cl, destination pointer in A1
1808 %1 qword [A1], cl
1809 %endif
1810 IEM_SAVE_FLAGS A2, %2, %3
1811 EPILOGUE_3_ARGS
1812ENDPROC iemAImpl_ %+ %1 %+ _u64
1813 %endif ; RT_ARCH_AMD64
1814
1815%endmacro
1816
; Rotates list only OF and CF as modified; the shifts list all status flags
; except AF as modified and AF as undefined.
1817IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1818IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1819IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1820IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1821IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1822IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1823IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1824
1825
1826;;
1827; Macro for implementing a double precision shift operation.
1828;
1829; This will generate code for the 16, 32 and 64 bit accesses, except on
1830; 32-bit system where the 64-bit accesses requires hand coding.
1831;
1832; The functions takes the destination operand (r/m) in A0, the source (reg) in
1833; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1834;
1835; @param 1 The instruction mnemonic.
1836; @param 2 The modified flags.
1837; @param 3 The undefined flags.
1838;
1839; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1840;
1841; @note the _intel and _amd variants are implemented in C.
1842;
1843%macro IEMIMPL_SHIFT_DBL_OP 3
1844BEGINCODE
; The shift count (A2) has to be in cl. With ASM_CALL64_GCC A3 and A2 are
; exchanged around the instruction and exchanged back afterwards so A3 is the
; eflags pointer again for IEM_SAVE_FLAGS. Otherwise A0 and A2 are exchanged
; once, leaving the count in cl and the destination pointer in A2 (this
; relies on the register assignments mentioned in the macro header).
1845BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1846 PROLOGUE_4_ARGS
1847 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1848 %ifdef ASM_CALL64_GCC
1849 xchg A3, A2 ; count into cl, eflags pointer parked in A2
1850 %1 [A0], A1_16, cl
1851 xchg A3, A2 ; restore A3 = eflags pointer
1852 %else
1853 xchg A0, A2 ; count into cl, destination pointer into A2
1854 %1 [A2], A1_16, cl
1855 %endif
1856 IEM_SAVE_FLAGS A3, %2, %3
1857 EPILOGUE_4_ARGS
1858ENDPROC iemAImpl_ %+ %1 %+ _u16
1859
1860BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1861 PROLOGUE_4_ARGS
1862 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1863 %ifdef ASM_CALL64_GCC
1864 xchg A3, A2 ; count into cl, eflags pointer parked in A2
1865 %1 [A0], A1_32, cl
1866 xchg A3, A2 ; restore A3 = eflags pointer
1867 %else
1868 xchg A0, A2 ; count into cl, destination pointer into A2
1869 %1 [A2], A1_32, cl
1870 %endif
1871 IEM_SAVE_FLAGS A3, %2, %3
1872 EPILOGUE_4_ARGS
1873ENDPROC iemAImpl_ %+ %1 %+ _u32
1874
; The 64-bit variant is only assembled when RT_ARCH_AMD64 is defined.
1875 %ifdef RT_ARCH_AMD64
1876BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1877 PROLOGUE_4_ARGS
1878 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1879 %ifdef ASM_CALL64_GCC
1880 xchg A3, A2 ; count into cl, eflags pointer parked in A2
1881 %1 [A0], A1, cl
1882 xchg A3, A2 ; restore A3 = eflags pointer
1883 %else
1884 xchg A0, A2 ; count into cl, destination pointer into A2
1885 %1 [A2], A1, cl
1886 %endif
1887 IEM_SAVE_FLAGS A3, %2, %3
1888 EPILOGUE_4_ARGS_EX 12
1889ENDPROC iemAImpl_ %+ %1 %+ _u64
1890 %endif ; RT_ARCH_AMD64
1891
1892%endmacro
1893
; Both instructions list all status flags except AF as modified and AF as
; undefined.
1894IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1895IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1896
1897
1898;;
1899; Macro for implementing a multiplication operations.
1900;
1901; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1902; 32-bit system where the 64-bit accesses requires hand coding.
1903;
1904; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1905; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1906; pointer to eflags in A3.
1907;
1908; The functions all return 0 so the caller can be used for div/idiv as well as
1909; for the mul/imul implementation.
1910;
1911; @param 1 The instruction mnemonic.
1912; @param 2 The modified flags.
1913; @param 3 The undefined flags.
1914; @param 4 Name suffix.
1915; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1916;
1917; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1918;
1919%macro IEMIMPL_MUL_OP 5
1920BEGINCODE
; 8-bit variant: A0 points to AX, A1 holds the operand, A2 is the eflags
; pointer. The 16-bit product in ax is written back to [A0].
1921BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1922 PROLOGUE_3_ARGS
1923 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1924 mov al, [A0] ; al = first factor
1925 %1 A1_8 ; ax = al * operand
1926 mov [A0], ax
; Only EFLAGS behaviour 1 takes the else-branch; 0 and 2 both use the plain
; IEM_SAVE_FLAGS path.
1927 %if %5 != 1
1928 IEM_SAVE_FLAGS A2, %2, %3
1929 %else
1930 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1931 %endif
1932 xor eax, eax ; return 0
1933 EPILOGUE_3_ARGS
1934ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1935
; 16-bit variant: A0 points to AX, A1 to DX, A2 holds the operand and A3 is
; the eflags pointer. Outside ASM_CALL64_GCC the DX pointer is copied to T1
; before the multiply and used from there afterwards (presumably because A1
; lives in xDX, which the instruction overwrites - see the ASSUMPTIONS note
; in the macro header).
1936BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1937 PROLOGUE_4_ARGS
1938 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1939 mov ax, [A0] ; ax = first factor
1940 %ifdef ASM_CALL64_GCC
1941 %1 A2_16 ; dx:ax = ax * operand
1942 mov [A0], ax
1943 mov [A1], dx
1944 %else
1945 mov T1, A1 ; keep the DX pointer in T1
1946 %1 A2_16 ; dx:ax = ax * operand
1947 mov [A0], ax
1948 mov [T1], dx
1949 %endif
1950 %if %5 != 1
1951 IEM_SAVE_FLAGS A3, %2, %3
1952 %else
1953 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1954 %endif
1955 xor eax, eax ; return 0
1956 EPILOGUE_4_ARGS
1957ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1958
; 32-bit variant of the same sequence.
1959BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1960 PROLOGUE_4_ARGS
1961 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1962 mov eax, [A0] ; eax = first factor
1963 %ifdef ASM_CALL64_GCC
1964 %1 A2_32 ; edx:eax = eax * operand
1965 mov [A0], eax
1966 mov [A1], edx
1967 %else
1968 mov T1, A1 ; keep the EDX pointer in T1
1969 %1 A2_32 ; edx:eax = eax * operand
1970 mov [A0], eax
1971 mov [T1], edx
1972 %endif
1973 %if %5 != 1
1974 IEM_SAVE_FLAGS A3, %2, %3
1975 %else
1976 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
1977 %endif
1978 xor eax, eax ; return 0
1979 EPILOGUE_4_ARGS
1980ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
1981
1982 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
1983BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
1984 PROLOGUE_4_ARGS
1985 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1986 mov rax, [A0] ; rax = first factor
1987 %ifdef ASM_CALL64_GCC
1988 %1 A2 ; rdx:rax = rax * operand
1989 mov [A0], rax
1990 mov [A1], rdx
1991 %else
1992 mov T1, A1 ; keep the RDX pointer in T1
1993 %1 A2 ; rdx:rax = rax * operand
1994 mov [A0], rax
1995 mov [T1], rdx
1996 %endif
1997 %if %5 != 1
1998 IEM_SAVE_FLAGS A3, %2, %3
1999 %else
2000 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2001 %endif
2002 xor eax, eax ; return 0
2003 EPILOGUE_4_ARGS_EX 12
2004ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2005 %endif ; RT_ARCH_AMD64
2006
2007%endmacro
2008
; mul and imul are each instantiated three times: native (no suffix), _intel
; and _amd. The native ones declare SF/ZF/AF/PF as undefined.
2009IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2010IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2011IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2012IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2013IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2014IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2015
2016
2017BEGINCODE
2018;;
2019; Worker function for negating the number held as two 32-bit halves in T1:T0
; (T0 = low half, T1 = high half), i.e. it computes 0 - T1:T0.
2020; @uses None (T0,T1)
2021BEGINPROC iemAImpl_negate_T0_T1_u32
2022 push 0 ; two zeroed stack slots serve as scratch space
2023 push 0
2024 xchg T0_32, [xSP] ; T0 = 0, slot 0 = old low half
2025 xchg T1_32, [xSP + xCB] ; T1 = 0, slot 1 = old high half
2026 sub T0_32, [xSP] ; low half: 0 - old low, borrow goes to CF
2027 sbb T1_32, [xSP + xCB] ; high half: 0 - old high - borrow
2028 add xSP, xCB*2 ; drop the scratch slots
2029 ret
2030ENDPROC iemAImpl_negate_T0_T1_u32
2031
2032%ifdef RT_ARCH_AMD64
2033;;
2034; Worker function for negating the number held as two 64-bit halves in T1:T0
; (T0 = low half, T1 = high half), i.e. it computes 0 - T1:T0.
2035; @uses None (T0,T1)
2036BEGINPROC iemAImpl_negate_T0_T1_u64
2037 push 0 ; two zeroed stack slots serve as scratch space
2038 push 0
2039 xchg T0, [xSP] ; T0 = 0, slot 0 = old low half
2040 xchg T1, [xSP + xCB] ; T1 = 0, slot 1 = old high half
2041 sub T0, [xSP] ; low half: 0 - old low, borrow goes to CF
2042 sbb T1, [xSP + xCB] ; high half: 0 - old high - borrow
2043 add xSP, xCB*2 ; drop the scratch slots
2044 ret
2045ENDPROC iemAImpl_negate_T0_T1_u64
2046%endif
2047
2048
2049;;
2050; Macro for implementing a division operations.
2051;
2052; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2053; 32-bit system where the 64-bit accesses requires hand coding.
2054;
2055; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2056; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2057; pointer to eflags in A3.
2058;
2059; The functions all return 0 on success and -1 if a divide error should be
2060; raised by the caller.
2061;
2062; @param 1 The instruction mnemonic.
2063; @param 2 The modified flags.
2064; @param 3 The undefined flags.
2065; @param 4 1 if signed, 0 if unsigned.
2066; @param 5 Function suffix.
2067; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2068; 2 for AMD (set AF, clear PF, ZF and SF).
2069;
2070; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2071;
2072%macro IEMIMPL_DIV_OP 6
2073BEGINCODE
2074BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2075 PROLOGUE_3_ARGS
2076
2077 ; div by chainsaw check.
2078 test A1_8, A1_8
2079 jz .div_zero
2080
2081 ; Overflow check - unsigned division is simple to verify, haven't
2082 ; found a simple way to check signed division yet unfortunately.
2083 %if %4 == 0
2084 cmp [A0 + 1], A1_8
2085 jae .div_overflow
2086 %else
2087 mov T0_16, [A0] ; T0 = dividend
2088 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2089 test A1_8, A1_8
2090 js .divisor_negative
2091 test T0_16, T0_16
2092 jns .both_positive
2093 neg T0_16
2094.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2095 push T0 ; Start off like unsigned below.
2096 shr T0_16, 7
2097 cmp T0_8, A1_8
2098 pop T0
2099 jb .div_no_overflow
2100 ja .div_overflow
2101 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2102 cmp T0_8, A1_8
2103 jae .div_overflow
2104 jmp .div_no_overflow
2105
2106.divisor_negative:
2107 neg A1_8
2108 test T0_16, T0_16
2109 jns .one_of_each
2110 neg T0_16
2111.both_positive: ; Same as unsigned shifted by sign indicator bit.
2112 shr T0_16, 7
2113 cmp T0_8, A1_8
2114 jae .div_overflow
2115.div_no_overflow:
2116 mov A1, T1 ; restore divisor
2117 %endif
2118
2119 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2120 mov ax, [A0]
2121 %1 A1_8
2122 mov [A0], ax
2123 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2124 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2125 %else
2126 IEM_SAVE_FLAGS A2, %2, %3
2127 %endif
2128 xor eax, eax
2129
2130.return:
2131 EPILOGUE_3_ARGS
2132
2133.div_zero:
2134.div_overflow:
2135 mov eax, -1
2136 jmp .return
2137ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2138
2139BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2140 PROLOGUE_4_ARGS
2141
2142 ; div by chainsaw check.
2143 test A2_16, A2_16
2144 jz .div_zero
2145
2146 ; Overflow check - unsigned division is simple to verify, haven't
2147 ; found a simple way to check signed division yet unfortunately.
2148 %if %4 == 0
2149 cmp [A1], A2_16
2150 jae .div_overflow
2151 %else
2152 mov T0_16, [A1]
2153 shl T0_32, 16
2154 mov T0_16, [A0] ; T0 = dividend
2155 mov T1, A2 ; T1 = divisor
2156 test T1_16, T1_16
2157 js .divisor_negative
2158 test T0_32, T0_32
2159 jns .both_positive
2160 neg T0_32
2161.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2162 push T0 ; Start off like unsigned below.
2163 shr T0_32, 15
2164 cmp T0_16, T1_16
2165 pop T0
2166 jb .div_no_overflow
2167 ja .div_overflow
2168 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2169 cmp T0_16, T1_16
2170 jae .div_overflow
2171 jmp .div_no_overflow
2172
2173.divisor_negative:
2174 neg T1_16
2175 test T0_32, T0_32
2176 jns .one_of_each
2177 neg T0_32
2178.both_positive: ; Same as unsigned shifted by sign indicator bit.
2179 shr T0_32, 15
2180 cmp T0_16, T1_16
2181 jae .div_overflow
2182.div_no_overflow:
2183 %endif
2184
2185 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2186 %ifdef ASM_CALL64_GCC
2187 mov T1, A2
2188 mov ax, [A0]
2189 mov dx, [A1]
2190 %1 T1_16
2191 mov [A0], ax
2192 mov [A1], dx
2193 %else
2194 mov T1, A1
2195 mov ax, [A0]
2196 mov dx, [T1]
2197 %1 A2_16
2198 mov [A0], ax
2199 mov [T1], dx
2200 %endif
2201 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2202 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2203 %else
2204 IEM_SAVE_FLAGS A3, %2, %3
2205 %endif
2206 xor eax, eax
2207
2208.return:
2209 EPILOGUE_4_ARGS
2210
2211.div_zero:
2212.div_overflow:
2213 mov eax, -1
2214 jmp .return
2215ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2216
2217BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2218 PROLOGUE_4_ARGS
2219
2220 ; div by chainsaw check.
2221 test A2_32, A2_32
2222 jz .div_zero
2223
2224 ; Overflow check - unsigned division is simple to verify, haven't
2225 ; found a simple way to check signed division yet unfortunately.
2226 %if %4 == 0
2227 cmp [A1], A2_32
2228 jae .div_overflow
2229 %else
2230 push A2 ; save A2 so we modify it (we out of regs on x86).
2231 mov T0_32, [A0] ; T0 = dividend low
2232 mov T1_32, [A1] ; T1 = dividend high
2233 test A2_32, A2_32
2234 js .divisor_negative
2235 test T1_32, T1_32
2236 jns .both_positive
2237 call NAME(iemAImpl_negate_T0_T1_u32)
2238.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2239 push T0 ; Start off like unsigned below.
2240 shl T1_32, 1
2241 shr T0_32, 31
2242 or T1_32, T0_32
2243 cmp T1_32, A2_32
2244 pop T0
2245 jb .div_no_overflow
2246 ja .div_overflow
2247 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2248 cmp T0_32, A2_32
2249 jae .div_overflow
2250 jmp .div_no_overflow
2251
2252.divisor_negative:
2253 neg A2_32
2254 test T1_32, T1_32
2255 jns .one_of_each
2256 call NAME(iemAImpl_negate_T0_T1_u32)
2257.both_positive: ; Same as unsigned shifted by sign indicator bit.
2258 shl T1_32, 1
2259 shr T0_32, 31
2260 or T1_32, T0_32
2261 cmp T1_32, A2_32
2262 jae .div_overflow
2263.div_no_overflow:
2264 pop A2
2265 %endif
2266
2267 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2268 mov eax, [A0]
2269 %ifdef ASM_CALL64_GCC
2270 mov T1, A2
2271 mov eax, [A0]
2272 mov edx, [A1]
2273 %1 T1_32
2274 mov [A0], eax
2275 mov [A1], edx
2276 %else
2277 mov T1, A1
2278 mov eax, [A0]
2279 mov edx, [T1]
2280 %1 A2_32
2281 mov [A0], eax
2282 mov [T1], edx
2283 %endif
2284 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2285 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2286 %else
2287 IEM_SAVE_FLAGS A3, %2, %3
2288 %endif
2289 xor eax, eax
2290
2291.return:
2292 EPILOGUE_4_ARGS
2293
2294.div_overflow:
2295 %if %4 != 0
2296 pop A2
2297 %endif
2298.div_zero:
2299 mov eax, -1
2300 jmp .return
2301ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2302
2303 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2304BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2305 PROLOGUE_4_ARGS
2306
2307 test A2, A2
2308 jz .div_zero
2309 %if %4 == 0
2310 cmp [A1], A2
2311 jae .div_overflow
2312 %else
2313 push A2 ; save A2 so we modify it (we out of regs on x86).
2314 mov T0, [A0] ; T0 = dividend low
2315 mov T1, [A1] ; T1 = dividend high
2316 test A2, A2
2317 js .divisor_negative
2318 test T1, T1
2319 jns .both_positive
2320 call NAME(iemAImpl_negate_T0_T1_u64)
2321.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2322 push T0 ; Start off like unsigned below.
2323 shl T1, 1
2324 shr T0, 63
2325 or T1, T0
2326 cmp T1, A2
2327 pop T0
2328 jb .div_no_overflow
2329 ja .div_overflow
2330 mov T1, 0x7fffffffffffffff
2331 and T0, T1 ; Special case for covering (divisor - 1).
2332 cmp T0, A2
2333 jae .div_overflow
2334 jmp .div_no_overflow
2335
2336.divisor_negative:
2337 neg A2
2338 test T1, T1
2339 jns .one_of_each
2340 call NAME(iemAImpl_negate_T0_T1_u64)
2341.both_positive: ; Same as unsigned shifted by sign indicator bit.
2342 shl T1, 1
2343 shr T0, 63
2344 or T1, T0
2345 cmp T1, A2
2346 jae .div_overflow
2347.div_no_overflow:
2348 pop A2
2349 %endif
2350
2351 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2352 mov rax, [A0]
2353 %ifdef ASM_CALL64_GCC
2354 mov T1, A2
2355 mov rax, [A0]
2356 mov rdx, [A1]
2357 %1 T1
2358 mov [A0], rax
2359 mov [A1], rdx
2360 %else
2361 mov T1, A1
2362 mov rax, [A0]
2363 mov rdx, [T1]
2364 %1 A2
2365 mov [A0], rax
2366 mov [T1], rdx
2367 %endif
2368 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2369 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2370 %else
2371 IEM_SAVE_FLAGS A3, %2, %3
2372 %endif
2373 xor eax, eax
2374
2375.return:
2376 EPILOGUE_4_ARGS_EX 12
2377
2378.div_overflow:
2379 %if %4 != 0
2380 pop A2
2381 %endif
2382.div_zero:
2383 mov eax, -1
2384 jmp .return
2385ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2386 %endif ; !RT_ARCH_AMD64
2387
2388%endmacro
2389
; Instantiate the DIV and IDIV workers.  As used by the macro body: argument 4
; selects the signed overflow check when non-zero, argument 5 is the suffix
; appended to the function name, and argument 6 equal to 2 selects the AMD
; flag adjustment instead of saving the flags.
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
2396
2397
;;
; Generates a worker function that executes one memory fence instruction.
;
; The generated function takes no operands and returns nothing.
;
; @param        1       The fence instruction mnemonic; also used as the
;                       function name suffix (iemAImpl_<mnemonic>).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                                      ; the fence instruction itself
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2416
;;
; Memory fence alternative for hosts without SSE2.
;
; Pushes xAX, exchanges it with the stack slot just written and drops the
; slot again, leaving xAX and xSP as they were on entry.  An xchg with a
; memory operand is an implicitly locked access on x86.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                             ; create a scratch slot holding xAX
        xchg    xAX, [xSP]                      ; locked exchange with that slot
        add     xSP, xCB                        ; discard the scratch slot
        ret
ENDPROC iemAImpl_alt_mem_fence
2426
2427
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; The host FPU environment is stored at [xSP], patched and loaded back:
;   - FCW is replaced by the guest FCW masked down to the exception mask,
;     precision control and rounding control bits.
;   - FSW keeps the host TOP field and takes C0..C3 from the guest FSW.
;
; @uses     The environment image at [xSP] (28 bytes in the 32-bit protected
;           mode layout; the callers in this file reserve 20h bytes).  T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        ; (32-bit register forms: the values are 16-bit wide and the upper
        ; bits are zeroed by movzx either way.)
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2453
2454
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses     The environment image at [xSP] (28 bytes in the 32-bit protected
;           mode layout; the callers in this file reserve 20h bytes).  T0, T1
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - guest exception masks, precision control and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - guest C0..C3 merged with the TOP field from the stored image.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).  T1 = guest TOP, used as bit index.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK
        bt      [%1 + X86FXSTATE.FTW], T1_16    ; Empty if FTW bit is clear. Fixed register order.
        jc      %%guest_st0_in_use
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%guest_st0_in_use:

        fldenv  [xSP]
%endmacro
2492
2493
;;
; Assembly layout of the FPU result record: an 80-bit value followed by FSW.
; (Need to move this as well somewhere better?)
;
struc IEMFPURESULT
        .r80Result      resw 5                  ; 80-bit result (5 words = 10 bytes)
        .FSW            resw 1                  ; output FPU status word
endstruc
2501
2502
;;
; Assembly layout of the two-value FPU result record: first 80-bit value,
; FSW, then the second 80-bit value.
; (Need to move this as well somewhere better?)
;
struc IEMFPURESULTTWO
        .r80Result1     resw 5                  ; first 80-bit result (10 bytes)
        .FSW            resw 1                  ; output FPU status word
        .r80Result2     resw 5                  ; second 80-bit result (10 bytes)
endstruc
2511
2512
2513;
2514;---------------------- 16-bit signed integer operations ----------------------
2515;
2516
2517
;;
; Converts a 16-bit signed integer to a 80-bit floating point value (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                                  ; leave the FPU in its initialized state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2541
2542
;;
; Stores an 80-bit floating point value (register) to memory as a 16-bit
; signed integer.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   word [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2566
2567
;;
; Stores an 80-bit floating point value (register) to memory as a 16-bit
; signed integer, using the truncating store instruction (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 16-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2592
2593
;;
; Generates a worker for an FPU instruction that combines one 80-bit value
; with one 16-bit signed integer memory operand and yields an 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2630
2631
;;
; Generates a worker for an FPU instruction that takes one 80-bit value and
; one 16-bit signed integer memory operand and only returns FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2662
2663
2664
2665;
2666;---------------------- 32-bit signed integer operations ----------------------
2667;
2668
2669
;;
; Converts a 32-bit signed integer to a 80-bit floating point value (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2693
2694
;;
; Stores an 80-bit floating point value (register) to memory as a 32-bit
; signed integer.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2718
2719
;;
; Stores an 80-bit floating point value (register) to memory as a 32-bit
; signed integer, using the truncating store instruction (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2744
2745
;;
; Generates a worker for an FPU instruction that combines one 80-bit value
; with one 32-bit signed integer memory operand and yields an 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2782
2783
;;
; Generates a worker for an FPU instruction that takes one 80-bit value and
; one 32-bit signed integer memory operand and only returns FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2814
2815
2816
2817;
2818;---------------------- 64-bit signed integer operations ----------------------
2819;
2820
2821
;;
; Converts a 64-bit signed integer to a 80-bit floating point value (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2845
2846
;;
; Stores an 80-bit floating point value (register) to memory as a 64-bit
; signed integer.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2870
2871
;;
; Stores an 80-bit floating point value (register) to memory as a 64-bit
; signed integer, using the truncating store instruction (fisttp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit signed integer value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2896
2897
2898
2899;
2900;---------------------- 32-bit floating point operations ----------------------
2901;
2902
;;
; Loads a 32-bit floating point value from memory and returns it as an
; 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2926
2927
;;
; Stores an 80-bit floating point value (register) to memory as a 32-bit one.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 32-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]

        fnstsw  word [A1]

        fninit                                  ; also discards the value left on the FPU stack
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2951
2952
;;
; Generates a worker for an FPU instruction that combines one 80-bit value
; with one 32-bit floating point memory operand and yields an 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
2989
2990
;;
; Generates a worker for an FPU instruction that takes one 80-bit value and
; one 32-bit floating point memory operand and only returns FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3021
3022
3023
3024;
3025;---------------------- 64-bit floating point operations ----------------------
3026;
3027
;;
; Loads a 64-bit floating point value from memory and returns it as an
; 80-bit one (fpu register).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3051
3052
;;
; Stores an 80-bit floating point value (register) to memory as a 64-bit one.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 64-bit value.
; @param    A3      Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]

        fnstsw  word [A1]

        fninit                                  ; also discards the value left on the FPU stack
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3076
3077
;;
; Generates a worker for an FPU instruction that combines one 80-bit value
; with one 64-bit floating point memory operand and yields an 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3114
;;
; Generates a worker for an FPU instruction that takes one 80-bit value and
; one 64-bit floating point memory operand and only returns FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to store the output FSW.
; @param    A2      Pointer to the 80-bit value.
; @param    A3      Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3145
3146
3147
3148;
3149;---------------------- 80-bit floating point operations ----------------------
3150;
3151
;;
; Loads an 80-bit floating point value from memory into an fpu register
; and returns that register value together with FSW.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3175
3176
;;
; Stores an 80-bit floating point register value to memory.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3200
3201
;;
; Loads an 80-bit BCD value from memory (fbld) and returns it as an 80-bit
; floating point register value.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3225
3226
;;
; Stores an 80-bit floating point register value to memory in BCD format (fbstp).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Where to return the output FSW.
; @param    A2      Where to store the 80-bit BCD value.
; @param    A3      Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; the value goes in before the guest FCW/FSW
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3250
3251
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values and yielding an 80-bit result.
;
; @param    1       The instruction
; @param    2       The operand list to emit after the instruction; may be
;                   empty ({}) for instructions with implicit operands.
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST0)
; @param    A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; loaded first, ends up as st1
        fld     tword [A2]                      ; loaded last, ends up as st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3292
3293
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values, ST1 and ST0, storing the result in ST1 and popping the stack.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the first 80-bit value (ST1).
; @param    A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]                      ; loaded first, ends up as st1
        fld     tword [A3]                      ; loaded last, ends up as st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3329
3330
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values that only returns FSW.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; loaded first, ends up as st1
        fld     tword [A2]                      ; loaded last, ends up as st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3363
3364
;;
; Generates a worker for an FPU instruction working on two 80-bit floating
; point values that returns both FSW and EFLAGS (in eax).
;
; @param    1       The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A3]                      ; loaded first, ends up as st1
        fld     tword [A2]                      ; loaded last, ends up as st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1

        fnstsw  word [A1]
        pushf                                   ; fetch the flags via the stack ...
        pop     xAX                             ; ... into the return register

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3400
3401
;;
; Generates a worker for an FPU instruction working on one 80-bit floating
; point value and yielding an 80-bit result.
;
; @param    1       The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]    ; record the status before clearing it
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3438
3439
;;
; Generates a worker for an FPU instruction working on one 80-bit floating
; point value that only returns FSW.
;
; @param    1       The instruction
; @param    2       Non-zero to also restore FTW (selects the
;                   FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 setup macro).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; scratch for the FPU environment image

        fninit
        fld     tword [A2]
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3475
3476
3477
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param    1       The instruction (used without operands).
;
; Generated worker: iemAImpl_<insn>
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output (FSW + r80Result).
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                        ; reserve 20h bytes of stack, released below

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                                      ; pushes the constant onto the FPU stack

        ; Save FSW first, then clear pending exceptions before storing the constant.
        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                                  ; reset the host FPU again before returning
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1                         ; must match the BEGINPROC_FASTCALL name (no dangling %+)
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3512
3513
;;
; Template for FPU instructions that consume one 80-bit floating point value
; and leave two 80-bit values on the FPU stack.
;
; @param    1       The instruction (used without operands).
;
; Generated worker: iemAImpl_<insn>_r80_r80
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit input value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                        ; reserve 20h bytes of stack, released below

        fninit
        fld     tword [A2]                      ; ST0 = input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        ; FSW first; then pop the two results. The value on top of the stack
        ; goes to r80Result2, the one below it to r80Result1.
        fnstsw  word  [A1 + IEMFPURESULTTWO.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2]
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                                  ; reset the host FPU again before returning
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3548
3549
3550
3551
3552;---------------------- SSE and MMX Operations ----------------------
3553
;
; Per-unit hooks expanded at the start and end of the MMX, SSE and AVX
; workers in this file.  All six currently expand to nothing.
;

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
        ; (intentionally empty)
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
        ; (intentionally empty)
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
        ; (intentionally empty)
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
        ; (intentionally empty)
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
        ; (intentionally empty)
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
        ; (intentionally empty)
%endmacro
3571
3572
;;
; Template for two-operand media instructions operating on full sized
; registers: dst = insn(dst, src).
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; Generated workers: iemAImpl_<insn>_u64 (only if %2 != 0) and iemAImpl_<insn>_u128
; @param    A0      FPU context (fxsave); not referenced by the generated code.
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
; 64-bit (MMX) worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]                       ; mm0 = destination operand
        movq    mm1, [A2]                       ; mm1 = source operand
        %1      mm0, mm1
        movq    [A1], mm0                       ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit (SSE) worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]                      ; xmm0 = destination operand
        movdqu  xmm1, [A2]                      ; xmm1 = source operand
        %1      xmm0, xmm1
        movdqu  [A1], xmm0                      ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb,    1
IEMIMPL_MEDIA_F2 pand,      1
IEMIMPL_MEDIA_F2 pandn,     1
IEMIMPL_MEDIA_F2 por,       1
IEMIMPL_MEDIA_F2 pxor,      1
IEMIMPL_MEDIA_F2 pcmpeqb,   1
IEMIMPL_MEDIA_F2 pcmpeqw,   1
IEMIMPL_MEDIA_F2 pcmpeqd,   1
IEMIMPL_MEDIA_F2 pcmpeqq,   0
IEMIMPL_MEDIA_F2 pcmpgtb,   1
IEMIMPL_MEDIA_F2 pcmpgtw,   1
IEMIMPL_MEDIA_F2 pcmpgtd,   1
IEMIMPL_MEDIA_F2 pcmpgtq,   0
IEMIMPL_MEDIA_F2 paddb,     1
IEMIMPL_MEDIA_F2 paddw,     1
IEMIMPL_MEDIA_F2 paddd,     1
IEMIMPL_MEDIA_F2 paddq,     1
IEMIMPL_MEDIA_F2 paddsb,    1
IEMIMPL_MEDIA_F2 paddsw,    1
IEMIMPL_MEDIA_F2 paddusb,   1
IEMIMPL_MEDIA_F2 paddusw,   1
IEMIMPL_MEDIA_F2 psubb,     1
IEMIMPL_MEDIA_F2 psubw,     1
IEMIMPL_MEDIA_F2 psubd,     1
IEMIMPL_MEDIA_F2 psubq,     1
IEMIMPL_MEDIA_F2 psubsb,    1
IEMIMPL_MEDIA_F2 psubsw,    1
IEMIMPL_MEDIA_F2 psubusb,   1
IEMIMPL_MEDIA_F2 psubusw,   1
IEMIMPL_MEDIA_F2 pmullw,    1
IEMIMPL_MEDIA_F2 pmulld,    0
IEMIMPL_MEDIA_F2 pmulhw,    1
IEMIMPL_MEDIA_F2 pmaddwd,   1
IEMIMPL_MEDIA_F2 pminub,    1
IEMIMPL_MEDIA_F2 pminuw,    0
IEMIMPL_MEDIA_F2 pminud,    0
IEMIMPL_MEDIA_F2 pminsb,    0
IEMIMPL_MEDIA_F2 pminsw,    1
IEMIMPL_MEDIA_F2 pminsd,    0
IEMIMPL_MEDIA_F2 pmaxub,    1
IEMIMPL_MEDIA_F2 pmaxuw,    0
IEMIMPL_MEDIA_F2 pmaxud,    0
IEMIMPL_MEDIA_F2 pmaxsb,    0
IEMIMPL_MEDIA_F2 pmaxsw,    1
IEMIMPL_MEDIA_F2 pmaxsd,    0
IEMIMPL_MEDIA_F2 pabsb,     1
IEMIMPL_MEDIA_F2 pabsw,     1
IEMIMPL_MEDIA_F2 pabsd,     1
IEMIMPL_MEDIA_F2 psignb,    1
IEMIMPL_MEDIA_F2 psignw,    1
IEMIMPL_MEDIA_F2 psignd,    1
IEMIMPL_MEDIA_F2 phaddw,    1
IEMIMPL_MEDIA_F2 phaddd,    1
IEMIMPL_MEDIA_F2 phsubw,    1
IEMIMPL_MEDIA_F2 phsubd,    1
IEMIMPL_MEDIA_F2 phaddsw,   1
IEMIMPL_MEDIA_F2 phsubsw,   1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw,  1
IEMIMPL_MEDIA_F2 pmuludq,   1
3673
3674
;;
; Template for two-operand media instructions operating on full sized
; registers, for workers that do not take an FXSAVE state argument:
; dst = insn(dst, src).
;
; @param    1       The instruction
; @param    2       Whether there is an MMX variant (1) or not (0).
;
; Generated workers: iemAImpl_<insn>_u64 (only if %2 != 0) and iemAImpl_<insn>_u128
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
; 64-bit (MMX) worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]                       ; mm0 = destination operand
        movq    mm1, [A1]                       ; mm1 = source operand
        %1      mm0, mm1
        movq    [A0], mm0                       ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit (SSE) worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]                      ; xmm0 = destination operand
        movdqu  xmm1, [A1]                      ; xmm1 = source operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0                      ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb,   1
IEMIMPL_MEDIA_OPT_F2 packssdw,   1
IEMIMPL_MEDIA_OPT_F2 packuswb,   1
IEMIMPL_MEDIA_OPT_F2 packusdw,   0
IEMIMPL_MEDIA_OPT_F2 psllw,      1
IEMIMPL_MEDIA_OPT_F2 pslld,      1
IEMIMPL_MEDIA_OPT_F2 psllq,      1
IEMIMPL_MEDIA_OPT_F2 psrlw,      1
IEMIMPL_MEDIA_OPT_F2 psrld,      1
IEMIMPL_MEDIA_OPT_F2 psrlq,      1
IEMIMPL_MEDIA_OPT_F2 psraw,      1
IEMIMPL_MEDIA_OPT_F2 psrad,      1
IEMIMPL_MEDIA_OPT_F2 pmulhuw,    1
IEMIMPL_MEDIA_OPT_F2 pavgb,      1
IEMIMPL_MEDIA_OPT_F2 pavgw,      1
IEMIMPL_MEDIA_OPT_F2 psadbw,     1
IEMIMPL_MEDIA_OPT_F2 pmuldq,     0
IEMIMPL_MEDIA_OPT_F2 unpcklps,   0
IEMIMPL_MEDIA_OPT_F2 unpcklpd,   0
IEMIMPL_MEDIA_OPT_F2 unpckhps,   0
IEMIMPL_MEDIA_OPT_F2 unpckhpd,   0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3736
;;
; Template for media instructions working on one full sized and one half
; sized register (lower half).  Both operands are loaded in full.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; Generated workers: iemAImpl_<insn>_u64 (only if %2 != 0) and iemAImpl_<insn>_u128
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit (MMX) worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]                       ; mm0 = destination operand
        movq    mm1, [A1]                       ; mm1 = source operand
        %1      mm0, mm1
        movq    [A0], mm0                       ; write the result back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit (SSE) worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]                      ; xmm0 = destination operand
        movdqu  xmm1, [A1]                      ; xmm1 = source operand
        %1      xmm0, xmm1
        movdqu  [A0], xmm0                      ; write the result back in place

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw,  1
IEMIMPL_MEDIA_F1L1 punpcklwd,  1
IEMIMPL_MEDIA_F1L1 punpckldq,  1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3780
3781
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param    1       The instruction
;
; Generated workers: iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; first source
        vmovdqu xmm1, [A2]                      ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0                      ; destination is output only

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]                      ; first source
        vmovdqu ymm1, [A2]                      ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0                      ; destination is output only

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3826
3827
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; The generated code is the same as for the low-half template since both
; operands are loaded in full, so this simply forwards to IEMIMPL_MEDIA_F1L1.
;
; @param    1       The instruction
; @param    2       1 if MMX is included, 0 if not.
;
; Generated workers: iemAImpl_<insn>_u64 (only if %2 != 0) and iemAImpl_<insn>_u128
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

; Instantiate through the high-half wrapper defined above (same expansion
; as IEMIMPL_MEDIA_F1L1, but states the intent and actually uses the macro).
IEMIMPL_MEDIA_F1H1 punpckhbw,  1
IEMIMPL_MEDIA_F1H1 punpckhwd,  1
IEMIMPL_MEDIA_F1H1 punpckhdq,  1
IEMIMPL_MEDIA_F1H1 punpckhqdq, 0
3846
3847
;;
; Template for the three-operand AVX instructions that combine the high
; halves of two source registers into a full sized destination (vpunpckh*).
;
; Thin wrapper: expands to exactly what IEMIMPL_MEDIA_F1L1L1 generates.
;
; @param    1       The instruction
;
; Generated workers: iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized source register operand; only the
;                   upper half is used as input, but it is loaded in full.
; @param    A2      Pointer to the second full sized source register operand; only the
;                   upper half is used as input, but it is loaded in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
        IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3868
3869
3870;
3871; Shufflers with evil 8-bit immediates.
3872;
3873
;;
; PSHUFW worker: [A0] = pshufw([A1], A2).
;
; The immediate cannot be encoded at runtime, so the function calls into a
; table of 256 'pshufw mm0, mm1, <imm>; ret' stubs, each 5 bytes long.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 64-bit source (input).
; @param    A2      The 8-bit immediate; only the low 8 bits are used.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A2, 0xff                        ; 8-bit argument: keep the stub index within the 256 entry table
        movq    mm1, [A1]
        movq    mm0, mm1                        ; paranoia!
        lea     T0, [A2 + A2*4]                 ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                        ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3899
3900
;;
; Template for the SSE shuffles taking an 8-bit immediate:
; [A0] = insn([A1], A2).
;
; The worker calls into a table of 256 '<insn> xmm0, xmm1, <imm>; ret'
; stubs, each 6 bytes long.
;
; @param    1       The instruction
;
; Generated worker: iemAImpl_<insn>_u128
; @param    A0      Pointer to the 128-bit destination (output).
; @param    A1      Pointer to the 128-bit source (input).
; @param    A2      The 8-bit immediate; only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A2, 0xff                        ; 8-bit argument: keep the stub index within the 256 entry table
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1                      ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]                 ; sizeof(pshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3932
3933
;;
; Template for the 256-bit AVX shuffles taking an 8-bit immediate:
; [A0] = insn([A1], A2).
;
; The worker calls into a table of 256 '<insn> ymm0, ymm1, <imm>; ret'
; stubs, each 6 bytes long.
;
; @param    1       The instruction
;
; Generated worker: iemAImpl_<insn>_u256
; @param    A0      Pointer to the 256-bit destination (output).
; @param    A1      Pointer to the 256-bit source (input).
; @param    A2      The 8-bit immediate; only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; AVX worker, so use the AVX hooks

        and     A2, 0xff                        ; 8-bit argument: keep the stub index within the 256 entry table
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1                      ; paranoia!
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]                 ; sizeof(vpshufXX+ret) == 6: (A2 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3965
3966
3967;
3968; Shifts with evil 8-bit immediates.
3969;
3970
;;
; Template for the MMX shifts by an 8-bit immediate: [A0] = insn([A0], A1).
;
; The worker calls into a table of 256 '<insn> mm0, <imm>; ret' stubs,
; each 5 bytes long.
;
; @param    1       The instruction
;
; Generated worker: iemAImpl_<insn>_imm_u64
; @param    A0      Pointer to the 64-bit operand (input/output).
; @param    A1      The 8-bit shift count; only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        and     A1, 0xff                        ; 8-bit argument: keep the stub index within the 256 entry table
        movq    mm0, [A0]
        lea     T0, [A1 + A1*4]                 ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                        ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4006
4007
;;
; Template for the SSE shifts by an 8-bit immediate: [A0] = insn([A0], A1).
;
; The worker calls into a table of 256 '<insn> xmm0, <imm>; ret' stubs,
; each 6 bytes long.
;
; @param    1       The instruction
;
; Generated worker: iemAImpl_<insn>_imm_u128
; @param    A0      Pointer to the 128-bit operand (input/output).
; @param    A1      The 8-bit shift count; only the low 8 bits are used.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        and     A1, 0xff                        ; 8-bit argument: keep the stub index within the 256 entry table
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]                 ; sizeof(psXX+ret) == 6: (A1 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)                   ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4045
4046
4047;
4048; Move byte mask.
4049;
4050
;;
; PMOVMSKB, MMX source.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 64-bit MMX source value (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq     mm1, [A1]
        pmovmskb T0, mm1                        ; T0 = byte mask
        mov      [A0], T0                       ; native-width store
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0              ; 32-bit hosts: zero the upper dword separately
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4064
;;
; PMOVMSKB, SSE source.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 128-bit source value (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu   xmm1, [A1]
        pmovmskb T0, xmm1                       ; T0 = byte mask
        mov      [A0], T0                       ; native-width store
%ifdef RT_ARCH_X86
        mov      dword [A0 + 4], 0              ; 32-bit hosts: zero the upper dword separately
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4078
;;
; VPMOVMSKB, 256-bit AVX source.
;
; @param    A0      Pointer to the 64-bit destination (output).
; @param    A1      Pointer to the 256-bit source value (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu   ymm1, [A1]
        vpmovmskb T0, ymm1                      ; T0 = byte mask
        mov       [A0], T0                      ; native-width store
%ifdef RT_ARCH_X86
        mov       dword [A0 + 4], 0             ; 32-bit hosts: zero the upper dword separately
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4092
4093
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param    1       The instruction
;
; Generated workers: iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA); not
;                   referenced by the generated code.
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]                      ; first source
        vmovdqu xmm1, [A3]                      ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0                      ; destination is output only

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]                      ; first source
        vmovdqu ymm1, [A3]                      ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0                      ; destination is output only

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4165
4166
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
;
; Generated workers: iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; first source
        vmovdqu xmm1, [A2]                      ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0                      ; destination is output only

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]                      ; first source
        vmovdqu ymm1, [A2]                      ; second source
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0                      ; destination is output only

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4233
;;
; Media instruction working on one full sized source register and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1       The instruction
; @param    2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; Generated workers: iemAImpl_<insn>_u128 and, if %2 == 1, iemAImpl_<insn>_u256
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
; 128-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
; 256-bit worker.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                    ; pair the prologue hook with the epilogue hook
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw,      1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd,      1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4276
4277
;
; The SSE 4.2 crc32
;
; @param    A0      Pointer to the 32-bit destination (input/output).
; @param    A1      The source operand, sized according to the suffix.
;
; crc32 with an 8-bit source: [A0] = crc32([A0], A1_8).
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; current 32-bit value of the destination
        crc32   T0_32, A1_8                     ; accumulate the source byte
        mov     [A0], T0_32                     ; store the updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4293
; crc32 with a 16-bit source: [A0] = crc32([A0], A1_16).
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; current 32-bit value of the destination
        crc32   T0_32, A1_16                    ; accumulate the source word
        mov     [A0], T0_32                     ; store the updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4303
; crc32 with a 32-bit source: [A0] = crc32([A0], A1_32).
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; current 32-bit value of the destination
        crc32   T0_32, A1_32                    ; accumulate the source dword
        mov     [A0], T0_32                     ; store the updated value

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4313
%ifdef RT_ARCH_AMD64
; crc32 with a 64-bit source (AMD64 builds only): [A0] = crc32([A0], A1).
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]                     ; current 32-bit value of the destination
        crc32   T0, A1                          ; accumulate the source qword
        mov     [A0], T0_32                     ; only the low 32 bits are stored back

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4325
4326
;
; PTEST (SSE 4.1)
;
; Runs ptest on the two operands and saves the resulting status flags via
; IEM_SAVE_FLAGS; neither operand is written.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4346
;
; VPTEST, 256-bit variant.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE                    ; AVX worker, so use the AVX hooks

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4359
4360
;;
; Template for the [v]pmov{s,z}x* instructions
;
; @param    1       The instruction (legacy SSE mnemonic; the AVX workers
;                   prefix it with 'v').
;
; Generated workers: iemAImpl_<insn>_u128, iemAImpl_v<insn>_u128 and
; iemAImpl_v<insn>_u256
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      For the two _u128 workers: the source operand value (input).
;                   For the _u256 worker: pointer to the 128-bit source operand (input).
;
%macro IEMIMPL_V_PMOV_SZ_X 1
; Legacy SSE worker: only non-VEX instructions, so it also runs on hosts without AVX.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                        ; source is passed by value
        %1      xmm0, xmm0
        movdqu  [A0], xmm0                      ; legacy store to match the rest of the worker

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX worker, 128-bit destination.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1                        ; source is passed by value
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

; AVX worker, 256-bit destination; here A1 is a pointer to a 128-bit source.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]                      ; VEX load; ymm0 is fully overwritten below
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4420
4421
;;
; Result layout written by the SSE floating point workers in this file:
; a 128-bit value followed by the MXCSR to return.
; Need to move this as well somewhere better?
;
struc IEMSSERESULT
    .uResult    resd 4                          ; 4 dwords = 128-bit result value
    .MXCSR      resd 1                          ; MXCSR value to return
endstruc
4429
4430
;;
; Result layout written by the 128-bit AVX floating point workers in this
; file: a 128-bit value followed by the MXCSR to return.
; Need to move this as well somewhere better?
;
struc IEMAVX128RESULT
    .uResult    resd 4                          ; 4 dwords = 128-bit result value
    .MXCSR      resd 1                          ; MXCSR value to return
endstruc
4438
4439
;;
; Result layout written by the 256-bit AVX floating point workers in this
; file: a 256-bit value followed by the MXCSR to return.
; Need to move this as well somewhere better?
;
struc IEMAVX256RESULT
    .uResult    resd 8                          ; 8 dwords = 256-bit result value
    .MXCSR      resd 1                          ; MXCSR value to return
endstruc
4447
4448
;;
; Saves the current MXCSR on the stack and loads a working MXCSR built from
; the guest value: the guest FZ, RC and DAZ bits with all exceptions masked.
;
; On exit xSP is 4 bytes lower than on entry; that slot holds the saved
; MXCSR and is consumed again by SSE_ST_FXSTATE_MXCSR.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        ; Stash the current MXCSR in a slot that stays allocated.
        sub     xSP, 4
        stmxcsr [xSP]

        ; T0 = guest FZ/RC/DAZ bits + all exception mask bits.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr needs a memory operand, so bounce T0 via a temporary slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4468
4469
;;
; Returns the guest MXCSR with the newly raised exception flags merged in,
; then reloads MXCSR from the value saved at [xSP] and releases that slot.
;
; Expects [xSP] to hold the saved MXCSR, i.e. the stack state that
; SSE_LD_FXSTATE_MXCSR leaves behind.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR 2
        ; T0 = current MXCSR, read via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Result = guest MXCSR | exception flags from the current MXCSR.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ; Reload the saved MXCSR and drop its stack slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4493
4494
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only the guest FZ, RC and DAZ bits are taken over.  All exception mask
; bits are set, as SSE_LD_FXSTATE_MXCSR does, so that a floating point
; exception raised by the emulated instruction is only recorded in the
; MXCSR flags and never delivered as a fault to the code running this worker.
;
; On exit xSP is 4 bytes lower than on entry; that slot holds the saved
; MXCSR and is consumed again by the AVX*_ST_XSAVEAREA_MXCSR macros.
;
; NOTE(review): because the AVX*_ST_XSAVEAREA_MXCSR macros return the raw
; MXCSR, the returned value has all exception mask bits set - confirm the
; callers only consume the exception flags.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK      ; never run with unmasked exceptions
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4513
4514
;;
; Stores the current MXCSR into an IEMAVX128RESULT, then reloads MXCSR from
; the value saved at [xSP] and releases that stack slot.
;
; Expects [xSP] to hold the saved MXCSR, i.e. the stack state that
; AVX_LD_XSAVEAREA_MXCSR leaves behind.
;
; @param    1       Expression giving the address where to return the MXCSR value.
;
; @note     Restores the stack pointer.
;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]    ; return the current MXCSR as-is

        ldmxcsr [xSP]                           ; reload the saved value ...
        add     xSP, 4                          ; ... and drop its stack slot
%endmacro
4528
4529
;;
; Stores the current MXCSR into an IEMAVX256RESULT, then reloads MXCSR from
; the value saved at [xSP] and releases that stack slot.
;
; Expects [xSP] to hold the saved MXCSR, i.e. the stack state that
; AVX_LD_XSAVEAREA_MXCSR leaves behind.
;
; @param    1       Expression giving the address where to return the MXCSR value.
;
; @note     Restores the stack pointer.
;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]    ; return the current MXCSR as-is

        ldmxcsr [xSP]                           ; reload the saved value ...
        add     xSP, 4                          ; ... and drop its stack slot
%endmacro
4543
4544
4545;;
4546; Floating point instruction working on two full sized registers.
4547;
4548; @param 1 The instruction
4549; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4550;
4551; @param A0 FPU context (FXSTATE or XSAVEAREA).
4552; @param A1 Where to return the result including the MXCSR value.
4553; @param A2 Pointer to the first media register size operand (input/output).
4554; @param A3 Pointer to the second media register size operand (input).
4555;
4556%macro IEMIMPL_FP_F2 2
4557BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4558 PROLOGUE_4_ARGS
4559 IEMIMPL_SSE_PROLOGUE
4560 SSE_LD_FXSTATE_MXCSR A0
4561
4562 movdqu xmm0, [A2]
4563 movdqu xmm1, [A3]
4564 %1 xmm0, xmm1
4565 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4566
4567 SSE_ST_FXSTATE_MXCSR A1, A0
4568 IEMIMPL_SSE_PROLOGUE
4569 EPILOGUE_4_ARGS
4570ENDPROC iemAImpl_ %+ %1 %+ _u128
4571
4572 %if %2 == 3
4573BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4574 PROLOGUE_4_ARGS
4575 IEMIMPL_AVX_PROLOGUE
4576 AVX_LD_XSAVEAREA_MXCSR A0
4577
4578 vmovdqu xmm0, [A2]
4579 vmovdqu xmm1, [A3]
4580 v %+ %1 xmm0, xmm0, xmm1
4581 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4582
4583 AVX128_ST_XSAVEAREA_MXCSR A1
4584 IEMIMPL_AVX_PROLOGUE
4585 EPILOGUE_4_ARGS
4586ENDPROC iemAImpl_v %+ %1 %+ _u128
4587
4588BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4589 PROLOGUE_4_ARGS
4590 IEMIMPL_AVX_PROLOGUE
4591 AVX_LD_XSAVEAREA_MXCSR A0
4592
4593 vmovdqu ymm0, [A2]
4594 vmovdqu ymm1, [A3]
4595 v %+ %1 ymm0, ymm0, ymm1
4596 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4597
4598 AVX256_ST_XSAVEAREA_MXCSR A1
4599 IEMIMPL_AVX_PROLOGUE
4600 EPILOGUE_4_ARGS
4601ENDPROC iemAImpl_v %+ %1 %+ _u256
4602 %elif %2 == 2
4603BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4604 PROLOGUE_4_ARGS
4605 IEMIMPL_AVX_PROLOGUE
4606 AVX_LD_XSAVEAREA_MXCSR A0
4607
4608 vmovdqu xmm0, [A2]
4609 vmovdqu xmm1, [A3]
4610 v %+ %1 xmm0, xmm1
4611 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4612
4613 AVX128_ST_XSAVEAREA_MXCSR A1
4614 IEMIMPL_AVX_PROLOGUE
4615 EPILOGUE_4_ARGS
4616ENDPROC iemAImpl_v %+ %1 %+ _u128
4617
4618BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4619 PROLOGUE_4_ARGS
4620 IEMIMPL_AVX_PROLOGUE
4621 AVX_LD_XSAVEAREA_MXCSR A0
4622
4623 vmovdqu ymm0, [A2]
4624 vmovdqu ymm1, [A3]
4625 v %+ %1 ymm0, ymm1
4626 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4627
4628 AVX256_ST_XSAVEAREA_MXCSR A1
4629 IEMIMPL_AVX_PROLOGUE
4630 EPILOGUE_4_ARGS
4631ENDPROC iemAImpl_v %+ %1 %+ _u256
4632 %endif
4633%endmacro
4634
4635IEMIMPL_FP_F2 addps, 3
4636IEMIMPL_FP_F2 addpd, 3
4637IEMIMPL_FP_F2 mulps, 3
4638IEMIMPL_FP_F2 mulpd, 3
4639IEMIMPL_FP_F2 subps, 3
4640IEMIMPL_FP_F2 subpd, 3
4641IEMIMPL_FP_F2 minps, 3
4642IEMIMPL_FP_F2 minpd, 3
4643IEMIMPL_FP_F2 divps, 3
4644IEMIMPL_FP_F2 divpd, 3
4645IEMIMPL_FP_F2 maxps, 3
4646IEMIMPL_FP_F2 maxpd, 3
4647IEMIMPL_FP_F2 haddps, 3
4648IEMIMPL_FP_F2 haddpd, 3
4649IEMIMPL_FP_F2 hsubps, 3
4650IEMIMPL_FP_F2 hsubpd, 3
4651IEMIMPL_FP_F2 addsubps, 3
4652IEMIMPL_FP_F2 addsubpd, 3
4653
4654
;;
; These are really unary operations, but to keep things simple they are
; treated as binary ones for now, so that the returned result always stays
; in sync with the register the result might get written to.
;
4660IEMIMPL_FP_F2 sqrtps, 2
4661IEMIMPL_FP_F2 sqrtpd, 2
4662IEMIMPL_FP_F2 cvtdq2ps, 2
4663IEMIMPL_FP_F2 cvtps2dq, 2
4664IEMIMPL_FP_F2 cvttps2dq, 2
4665IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4666IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4667IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4668
4669
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Emits two workers per instruction: iemAImpl_<insn>_u128_r32 (legacy SSE
; encoding) and iemAImpl_v<insn>_u128_r32 (VEX encoding).
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input; the result goes to A1).
; @param    A3      Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]                              ; full 128-bit first operand
        movd    xmm1, [A3]                              ; only 32 bits are read from the second operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE                            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
4720
4721
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; Emits two workers per instruction: iemAImpl_<insn>_u128_r64 (legacy SSE
; encoding) and iemAImpl_v<insn>_u128_r64 (VEX encoding).
;
; @param    1       The instruction
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input; the result goes to A1).
; @param    A3      Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]                              ; full 128-bit first operand
        movq    xmm1, [A3]                              ; only 64 bits are read from the second operand
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE                            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4772
4773
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; Emits iemAImpl_<insn>_u128, iemAImpl_v<insn>_u128 and iemAImpl_v<insn>_u256.
;
; @param    1       The instruction name.
; @param    2       Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the result including the MXCSR value.
; @param    A2      Pointer to the first media register size operand (input; the result goes to A1).
; @param    A3      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE                            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1                              ; narrowing: 256-bit source, 128-bit result
 %else
        v %+ %1 ymm0, xmm1                              ; widening: 128-bit source, 256-bit result
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE                            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4838
4839
;;
; shufps instruction with an 8-bit immediate.
;
; The immediate is not known at assembly time, so a table with one
; 'shufps xmm0, xmm1, imm8; ret; int3' stub per immediate value follows the
; function and the stub matching A2 is called.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 6 (each stub is 6 bytes: shufps + ret + int3).
        lea     T0, [A2 + A2*2]                         ; T0 = bImm * 3
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4873
4874
;;
; shufpd instruction with an 8-bit immediate.
;
; Dispatches into a table of 'shufpd xmm0, xmm1, imm8; ret' stubs, one per
; immediate value, located after the function body.
;
; @param    A0      Pointer to the destination media register size operand (input/output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 6 (each stub is 6 bytes: shufpd + ret).
        lea     T0, [A2 + A2*2]                         ; T0 = bImm * 3
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4907
4908
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.  Each worker calls
; into a table of '<insn> ...; ret' stubs, one per immediate value.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded moves, like the 256-bit variant below, so that no legacy
        ; SSE instruction is mixed into the AVX code path.
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]                         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]                         ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
4975
4976
;;
; One of the [p]blendv{b,ps,pd} variants
;
; Emits iemAImpl_<insn>_u128.  The SSE4.1 variable blend instructions take
; their mask implicitly in xmm0, hence the register assignment below.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      Pointer to the second media sized value (input).
; @param    A2      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]                              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]                              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5005
5006
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Emits iemAImpl_<insn>_u128 and iemAImpl_<insn>_u256.
;
; @param    1       The instruction
;
; @param    A0      Pointer to the first media register sized operand (output).
; @param    A1      Pointer to the first media register sized operand (input).
; @param    A2      Pointer to the second media register sized operand (input).
; @param    A3      Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]                              ; the mask is an explicit operand in the VEX form
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE                            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE                            ; pairs with IEMIMPL_AVX_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5049
5050
;;
; palignr mm1, mm2/m64 instruction.
;
; Dispatches into a table of 'palignr mm0, mm1, imm8; ret' stubs, one per
; immediate value, located after the function body.
;
; @param    A0      Pointer to the first media register sized operand (input/output).
; @param    A1      The second register sized operand, passed by value (input).
; @param    A2      The 8-bit immediate.
;
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        ; Stub address = .imm0 + bImm * 6 (each stub is 6 bytes: palignr + ret).
        lea     T0, [A2 + A2*2]                         ; T0 = bImm * 3
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movq    mm1, A1
        movq    mm0, [A0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5082
5083
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Emits iemAImpl_<insn>_u128, which calls into a table of 8 byte stubs
; (instruction + ret + int3), one per immediate value.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 8.
        lea     T0, [A2 + A2*3]                         ; T0 = bImm * 4
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A1]
        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5129
5130
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Emits iemAImpl_<insn>_u128 and, when requested, iemAImpl_<insn>_u256.  Each
; worker calls into a table of 8 byte stubs (instruction + ret + int3), one
; per immediate value.
;
; @param    1       The instruction name.
; @param    2       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
; @param    A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded moves, like the 256-bit variant below, so that no legacy
        ; SSE instruction is mixed into the AVX code path.
        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]                         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]                         ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5207
5208
;;
; Source operand pair handed to the pcmpistri worker: two 128-bit values
; back to back.
;
; Need to move this as well somewhere better?
;
struc IEMPCMPISTRISRC
    .uSrc1      resd 4
    .uSrc2      resd 4
endstruc

;;
; The pcmpistri instruction.
;
; Dispatches into a table of 'pcmpistri xmm0, xmm1, imm8; ret; int3' stubs,
; one per immediate value, then stores the status flags and the ECX result.
;
; @param    A0      Pointer to the ECX register to store the result to (output).
; @param    A1      Pointer to the EFLAGS register.
; @param    A2      Pointer to the structure containing the source operands (input).
; @param    A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; A0 can be ecx/rcx in some calling conventions, which the instruction
        ; overwrites, so keep the result pointer in T2 (T2 only available on AMD64).
        mov     T2, A0

        ; Stub address = .imm0 + bImm * 8.
        lea     T0, [A3 + A3*3]                         ; T0 = bImm * 4
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm1, [A2 + IEMPCMPISTRISRC.uSrc2]
        movdqu  xmm0, [A2 + IEMPCMPISTRISRC.uSrc1]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx                               ; the index result is returned in ecx

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5254
5255
;;
; pinsrw instruction, MMX register variant.
;
; Dispatches into a table of 'pinsrw mm0, A1_32, imm8; ret' stubs, one per
; immediate value, located after the function body.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 5 (each stub is 5 bytes: pinsrw + ret).
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]                         ; T0 = bImm * 5
        lea     T1, [T1 + T0]

        movq    mm0, [A0]
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5287
;;
; pinsrw instruction, XMM register variant.
;
; Dispatches into a table of 'pinsrw xmm0, A1_32, imm8; ret' stubs, one per
; immediate value, located after the function body.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      The 16 bit input operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 6 (each stub is 6 bytes: pinsrw + ret).
        lea     T0, [A2 + A2*2]                         ; T0 = bImm * 3
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A0]
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5312
;;
; vpinsrw instruction.
;
; Dispatches into a table of 'vpinsrw xmm0, xmm0, A1_32, imm8; ret' stubs,
; one per immediate value, located after the function body.
;
; @param    A0      Pointer to the first media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 16 bit input operand (input).
; @param    A3      The 8-bit immediate
;
; @note     A1 is overwritten with the value of A2 once the source operand
;           has been loaded.
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]                         ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        ; The third argument register is r8 in the Microsoft x64 convention;
        ; using it as the vpinsrw source needs the 3-byte VEX prefix, which
        ; makes each stub 7 bytes and breaks the 6 byte stride assumed above.
        ; A1 is free after the load, and pinsrw above already relies on A1_32
        ; having the short encoding, so pass the value in A1 instead.
        ; NOTE(review): the A-register mapping is defined outside this excerpt - confirm.
        mov     A1, A2
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5345
5346
;;
; pextrw instruction, MMX register variant.
;
; Dispatches into a table of 'pextrw T0_32, mm0, imm8; ret' stubs, one per
; immediate value, and stores the low 16 bits of the result.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      The media register size operand, passed by value (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 5 (each stub is 5 bytes: pextrw + ret).
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*4]                         ; T0 = bImm * 5
        lea     T1, [T1 + T0]

        movq    mm0, A1
        call    T1                                      ; returns the extracted word in T0
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5378
;;
; pextrw instruction, XMM register variant.
;
; Dispatches into a table of 'pextrw T0_32, xmm0, imm8; ret' stubs, one per
; immediate value, and stores the low 16 bits of the result.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 6 (each stub is 6 bytes: pextrw + ret).
        lea     T0, [A2 + A2*2]                         ; T0 = bImm * 3
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A1]
        call    T1                                      ; returns the extracted word in T0
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5403
;;
; vpextrw instruction.
;
; Dispatches into a table of 'vpextrw T0_32, xmm0, imm8; ret' stubs, one per
; immediate value, and stores the low 16 bits of the result.
;
; @param    A0      Pointer to the 16bit output operand (output).
; @param    A1      Pointer to the source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        ; Stub address = .imm0 + bImm * 6 (each stub is 6 bytes: vpextrw + ret).
        lea     T0, [A2 + A2*2]                         ; T0 = bImm * 3
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0*2]

        movdqu  xmm0, [A1]
        call    T1                                      ; returns the extracted word in T0
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

        ; The stub table, one entry per immediate value.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)                          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                          ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5435
5436
;;
; movmskp{s,d} SSE instruction template
;
; Emits iemAImpl_<sse-insn>_u128, iemAImpl_<avx-insn>_u128 and
; iemAImpl_<avx-insn>_u256.  Only the low byte of the mask is stored.
;
; @param    1       The SSE instruction name.
; @param    2       The AVX instruction name.
;
; @param    A0      Pointer to the output register (output/byte sized).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        ; VEX encoded load, like the 256-bit variant below, so that no legacy
        ; SSE instruction is mixed into the AVX code path.
        vmovdqu xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5486
5487
;;
; Returns the guest MXCSR with the freshly raised exception flags merged in
; and reloads MXCSR from the value found on top of the stack.
;
; @uses     4 bytes of stack as scratch for reading MXCSR, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Expects the MXCSR value to restore at [xSP] on entry (presumably
;           left there by the matching load macro - confirm) and pops it,
;           i.e. restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        ; Fetch the current MXCSR through a scratch stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, [xSP]
        add     xSP, 4

        ; Keep only the exception flags and OR them into the guest MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        or      T0_32, T1_32
        mov     [%1], T0_32

        ; Reload MXCSR from the slot on top of the stack and release the slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5511
5512
;;
; cvttsd2si instruction - 32-bit variant.
;
; Converts the double precision value at A3 to a 32-bit integer using
; truncation and stores it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]
        mov     dword [A2], T0_32                       ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5533
;;
; cvttsd2si instruction - 64-bit variant.
;
; Converts the double precision value at A3 to a 64-bit integer using
; truncation and stores it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     qword [A2], T0                          ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5554
5555
;;
; cvtsd2si instruction - 32-bit variant.
;
; Converts the double precision value at A3 to a 32-bit integer and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32                       ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5576
;;
; cvtsd2si instruction - 64-bit variant.
;
; Converts the double precision value at A3 to a 64-bit integer and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     qword [A2], T0                          ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5597
5598
;;
; cvttss2si instruction - 32-bit variant.
;
; Converts the single precision value at A3 to a 32-bit integer using
; truncation and stores it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0_32, [A3]
        mov     dword [A2], T0_32                       ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
5619
;;
; cvttss2si instruction - 64-bit variant.
;
; Converts the single precision value at A3 to a 64-bit integer using
; truncation and stores it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]
        mov     qword [A2], T0                          ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5640
5641
;;
; cvtss2si instruction - 32-bit variant.
;
; Converts the single precision value at A3 to a 32-bit integer and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0_32, [A3]
        mov     dword [A2], T0_32                       ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
5662
;;
; cvtss2si instruction - 64-bit variant.
;
; Converts the single precision value at A3 to a 64-bit integer and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]
        mov     qword [A2], T0                          ; store before T0 gets reused below

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5683
5684
;;
; cvtsi2ss instruction - 32-bit variant.
;
; Converts the 32-bit integer at A3 to a single precision value and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, dword [A3]
        movd    dword [A2], xmm0                        ; only the low 32 bits hold the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
5705
;;
; cvtsi2ss instruction - 64-bit variant.
;
; Converts the 64-bit integer at A3 to a single precision value and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0                        ; only the low 32 bits hold the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5726
5727
;;
; cvtsi2sd instruction - 32-bit variant.
;
; Converts the 32-bit integer at A3 to a double precision value and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0                              ; only the low 64 bits hold the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
5748
;;
; cvtsi2sd instruction - 64-bit variant.
;
; Converts the 64-bit integer at A3 to a double precision value and stores
; it at A2.
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0                              ; only the low 64 bits hold the result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE                            ; pairs with IEMIMPL_SSE_PROLOGUE above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5769
5770
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; The current MXCSR is saved in a 4 byte stack slot which is still allocated
; when the macro ends; the new value takes FZ, RC and DAZ from the guest
; value and has X86_MXCSR_XCPT_MASK set.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        ; Save the current MXCSR; this slot stays on the stack.
        sub     xSP, 4
        stmxcsr [xSP]

        ; Build the value to run with from the guest MXCSR.
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_DAZ | X86_MXCSR_RC_MASK | X86_MXCSR_FZ
        or      T0_32, X86_MXCSR_XCPT_MASK

        ; ldmxcsr takes a memory operand only, so go through a temporary slot.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5790
5791
;;
; Merges the exception flags raised since the MXCSR was loaded into the
; guest MXCSR value and restores the original MXCSR.
;
; Expects the original MXCSR at [xSP] once its own temporary slot has been
; released, i.e. the stack layout left by SSE_LD_FXSTATE_MXCSR_ONLY.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        ; Read the current MXCSR via a temporary stack slot.
        sub     xSP, 4
        stmxcsr [xSP]
        mov     T0_32, dword [xSP]
        add     xSP, 4

        ; Merge the status bits into the original MXCSR value.
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        mov     T1_32, dword [%1]
        or      T0_32, T1_32
        mov     dword [%1], T0_32

        ; Reload the parked MXCSR and release its slot.
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
5814
5815
;
; UCOMISS (SSE)
;
; Compares the low single precision values of the two source operands with
; the host ucomiss instruction and returns the resulting status flags and
; MXCSR exception flags.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue, like the other
; four-argument helpers in this file (e.g. iemAImpl_cmpps_u128).
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
5838
;
; VUCOMISS - VEX encoded form of the ucomiss comparison.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue.
;
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
5853
5854
;
; UCOMISD (SSE)
;
; Compares the low double precision values of the two source operands with
; the host ucomisd instruction and returns the resulting status flags and
; MXCSR exception flags.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue, like the other
; four-argument helpers in this file (e.g. iemAImpl_cmpps_u128).
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
5877
;
; VUCOMISD - VEX encoded form of the ucomisd comparison.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue.
;
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
5892
;
; COMISS (SSE)
;
; Compares the low single precision values of the two source operands with
; the host comiss instruction and returns the resulting status flags and
; MXCSR exception flags.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue, like the other
; four-argument helpers in this file (e.g. iemAImpl_cmpps_u128).
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
5915
;
; VCOMISS - VEX encoded form of the comiss comparison.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue.
;
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
5930
5931
;
; COMISD (SSE)
;
; Compares the low double precision values of the two source operands with
; the host comisd instruction and returns the resulting status flags and
; MXCSR exception flags.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue, like the other
; four-argument helpers in this file (e.g. iemAImpl_cmpps_u128).
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
5954
;
; VCOMISD - VEX encoded form of the comisd comparison.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
; Four arguments (A0..A3) are used, so this is declared with 16 bytes of
; arguments and the 4-argument prologue/epilogue.
;
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        ; Capture the flags right away: the MXCSR store macro below uses
        ; and/or/add, which overwrite EFLAGS.
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
5969
5970
;;
; Pair of 16-byte source operands handed by pointer to the two-input media
; helpers (see the uSrc1/uSrc2 loads in iemAImpl_cmpps_u128).
;
; @todo Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1          resb 16                     ; first source operand, offset 0
    .uSrc2          resb 16                     ; second source operand, offset 16
endstruc
5978
5979
;
; CMPPS (SSE)
;
; The compare predicate is an instruction immediate, so one 5 byte
; 'cmpps xmm0, xmm1, imm8 + ret' stub is emitted for each of the 256 possible
; values and the stub matching A3 is called.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        ; Only the low 8 bits of A3 carry the immediate.  Mask the rest so the
        ; computed call below can never land outside the 256 stubs.
        and     A3, 0xff
        lea     T0, [A3 + A3*4]                 ; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: entry N lives at .imm0 + N*5.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*5 == 0x500
        ; Assemble-time size checks: each expression equals 0xffff when the
        ; table is exactly 0x500 bytes and no longer fits a word otherwise.
dw 0xfaff  + (.immEnd - .imm0)                  ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6015
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; One 'insn xmm0, xmm1, imm8 + ret' stub (5 + 1 = 6 bytes) is emitted for each
; of the 256 possible immediates and the stub matching A3 is called.
;
; @param    1       The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        ; Only the low 8 bits of A3 carry the immediate.  Mask the rest so the
        ; computed call below can never land outside the 256 stubs.
        and     A3, 0xff
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]                 ; sizeof(insn+ret) == 6: (A3 * 3) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS

        ; The stub table: entry N lives at .imm0 + N*6.
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                        ; 256*6 == 0x600
        ; Assemble-time size checks: each expression equals 0xffff when the
        ; table is exactly 0x600 bytes and no longer fits a word otherwise.
dw 0xf9ff  + (.immEnd - .imm0)                  ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)                  ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

; Instantiations: iemAImpl_cmppd_u128, iemAImpl_cmpss_u128, iemAImpl_cmpsd_u128.
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
Contact | Privacy policy | Terms of Use