VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 99220

Last change on this file since 99220 was 98921, checked in by vboxsync, 15 months ago

VMM/IEM: Started implementing the dpps/dppd instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 672.7 KB
Line 
1/* $Id: IEMAllAImplC.cpp 98921 2023-03-12 16:54:45Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
48/** @def IEM_WITHOUT_ASSEMBLY
49 * Enables all the code in this file.
50 */
51#if !defined(IEM_WITHOUT_ASSEMBLY)
52# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
53# define IEM_WITHOUT_ASSEMBLY
54# endif
55#endif
56/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
57#ifdef IEM_WITH_ASSEMBLY
58# undef IEM_WITHOUT_ASSEMBLY
59#endif
60
61/**
62 * Calculates the signed flag value given a result and it's bit width.
63 *
64 * The signed flag (SF) is a duplication of the most significant bit in the
65 * result.
66 *
67 * @returns X86_EFL_SF or 0.
68 * @param a_uResult Unsigned result value.
69 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
70 */
71#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
72 ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
73
74/**
75 * Calculates the zero flag value given a result.
76 *
77 * The zero flag (ZF) indicates whether the result is zero or not.
78 *
79 * @returns X86_EFL_ZF or 0.
80 * @param a_uResult Unsigned result value.
81 */
82#define X86_EFL_CALC_ZF(a_uResult) \
83 ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
84
85/**
86 * Extracts the OF flag from a OF calculation result.
87 *
88 * These are typically used by concating with a bitcount. The problem is that
89 * 8-bit values needs shifting in the other direction than the others.
90 */
91#define X86_EFL_GET_OF_8(a_uValue) (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
92#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
93#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
94#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
95
96/**
97 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
98 *
99 * @returns Status bits.
100 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
101 * @param a_uResult Unsigned result value.
102 * @param a_uSrc The source value (for AF calc).
103 * @param a_uDst The original destination value (for AF calc).
104 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
105 * @param a_CfExpr Bool expression for the carry flag (CF).
106 * @param a_uSrcOf The a_uSrc value to use for overflow calculation.
107 */
108#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
109 do { \
110 uint32_t fEflTmp = *(a_pfEFlags); \
111 fEflTmp &= ~X86_EFL_STATUS_BITS; \
112 fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
113 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
114 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
115 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
116 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
117 \
118 /* Overflow during ADDition happens when both inputs have the same signed \
119 bit value and the result has a different sign bit value. \
120 \
121 Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
122 follows that for SUBtraction the signed bit value must differ between \
123 the two inputs and the result's signed bit diff from the first input. \
124 Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
125 \
126 See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
127 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth( ( ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
128 & RT_BIT_64(a_cBitsWidth - 1)) \
129 & ((a_uResult) ^ (a_uDst)) ); \
130 *(a_pfEFlags) = fEflTmp; \
131 } while (0)
132
133/**
134 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
135 *
136 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
137 * undefined. We do not set AF, as that seems to make the most sense (which
138 * probably makes it the most wrong in real life).
139 *
140 * @returns Status bits.
141 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
142 * @param a_uResult Unsigned result value.
143 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
144 * @param a_fExtra Additional bits to set.
145 */
146#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
147 do { \
148 uint32_t fEflTmp = *(a_pfEFlags); \
149 fEflTmp &= ~X86_EFL_STATUS_BITS; \
150 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
151 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
152 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
153 fEflTmp |= (a_fExtra); \
154 *(a_pfEFlags) = fEflTmp; \
155 } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0 */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value.
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1 */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
1083/** 64-bit locked binary operand operation. */
1084# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1085 do { \
1086 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1087 uint ## a_cBitsWidth ## _t uTmp; \
1088 uint32_t fEflTmp; \
1089 do \
1090 { \
1091 uTmp = uOld; \
1092 fEflTmp = *pfEFlags; \
1093 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
1094 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
1095 *pfEFlags = fEflTmp; \
1096 } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
1396/*
1397 * Helpers for BSR and BSF.
1398 *
1399 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1400 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1401 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1402 * but we restrict ourselves to emulating these recent marchs.
1403 */
1404#define SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlag, a_iBit) do { \
1405 unsigned iBit = (a_iBit); \
1406 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1407 if (iBit) \
1408 { \
1409 *puDst = --iBit; \
1410 fEfl |= g_afParity[iBit]; \
1411 } \
1412 else \
1413 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1414 *pfEFlags = fEfl; \
1415 } while (0)
1416#define SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlag, a_iBit) do { \
1417 unsigned const iBit = (a_iBit); \
1418 if (iBit) \
1419 { \
1420 *puDst = iBit - 1; \
1421 *pfEFlags &= ~X86_EFL_ZF; \
1422 } \
1423 else \
1424 *pfEFlags |= X86_EFL_ZF; \
1425 } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
1539#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1540 unsigned const uResult = (a_uResult); \
1541 *(a_puDst) = uResult; \
1542 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1543 if (uResult) \
1544 fEfl |= g_afParity[uResult]; \
1545 else \
1546 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1547 if (!a_uSrc) \
1548 fEfl |= X86_EFL_CF; \
1549 *(a_pfEFlags) = fEfl; \
1550 } while (0)
1551#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1552 unsigned const uResult = (a_uResult); \
1553 *(a_puDst) = uResult; \
1554 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
1555 if (!uResult) \
1556 fEfl |= X86_EFL_ZF; \
1557 if (!a_uSrc) \
1558 fEfl |= X86_EFL_CF; \
1559 *(a_pfEFlags) = fEfl; \
1560 } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
1617/*
1618 * TZCNT - count leading zero bits.
1619 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
2079# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2080IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2081 uint32_t *pEFlags))
2082{
2083# ifdef VBOX_STRICT
2084 RTUINT128U const uOld = *pu128RaxRdx;
2085# endif
2086# if defined(RT_ARCH_AMD64)
2087 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2088 &pu128RaxRdx->u))
2089# else
2090 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2091# endif
2092 {
2093 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2094 *pEFlags |= X86_EFL_ZF;
2095 }
2096 else
2097 *pEFlags &= ~X86_EFL_ZF;
2098}
2099# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
2126IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2127{
2128 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2129}
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2141}
2142
2143
2144# if ARCH_BITS == 32
2145IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2146{
2147 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2148}
2149# else
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2151{
2152 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2153}
2154# endif
2155
2156
2157IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2158{
2159 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2164 uint32_t *pEFlags))
2165{
2166 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2167}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
2181 * - The U8 versions work returns output in AL + AH instead of xDX + xAX, with the
2182 * IDIV/DIV taking all the input in AX too. This means we have to abstract some
2183 * input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
2198# define DIV_LOAD(a_Dividend) \
2199 a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
2200# define DIV_LOAD_U8(a_Dividend) \
2201 a_Dividend.u = *puAX
2202
2203# define DIV_STORE(a_Quotient, a_uReminder) *puA = (a_Quotient), *puD = (a_uReminder)
2204# define DIV_STORE_U8(a_Quotient, a_uReminder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uReminder) << 8)
2205
2206# define MUL_LOAD_F1() *puA
2207# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)
2208
2209# define MUL_STORE(a_Result) *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
2210# define MUL_STORE_U8(a_Result) *puAX = a_Result.u
2211
2212# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
2213 (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
2214# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
2215 RTUInt128AssignNeg(&(a_Value))
2216
2217# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2218 (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
2219# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2220 RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);
2221
2222# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2223 a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
2224 a_Remainder.u = (a_Dividend).u % (a_uDivisor)
2225# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2226 RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
2574/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2575 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2576 *
2577 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2578 * borrowing in arithmetic loops on intel 8008).
2579 *
2580 * @returns Status bits.
2581 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2582 * @param a_uResult Unsigned result value.
2583 * @param a_uDst The original destination value (for AF calc).
2584 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2585 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2586 */
2587#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2588 do { \
2589 uint32_t fEflTmp = *(a_pfEFlags); \
2590 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
2591 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2592 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2593 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2594 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2595 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2596 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2597 *(a_pfEFlags) = fEflTmp; \
2598 } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
2734/**
2735 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
2736 *
2737 * @returns Status bits.
2738 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2739 * @param a_uResult Unsigned result value.
2740 * @param a_uDst The original destination value (for AF calc).
2741 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2742 */
2743#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2744 do { \
2745 uint32_t fEflTmp = *(a_pfEFlags); \
2746 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2747 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2748 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2749 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2750 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2751 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2752 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2753 *(a_pfEFlags) = fEflTmp; \
2754 } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
2846#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2848{ \
2849 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2850 if (cShift) \
2851 { \
2852 if (a_cBitsWidth < 32) \
2853 cShift &= a_cBitsWidth - 1; \
2854 a_uType const uDst = *puDst; \
2855 a_uType const uResult = a_fnHlp(uDst, cShift); \
2856 *puDst = uResult; \
2857 \
2858 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2859 it the same way as for 1 bit shifts. */ \
2860 AssertCompile(X86_EFL_CF_BIT == 0); \
2861 uint32_t fEfl = *pfEFlags; \
2862 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2863 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2864 fEfl |= fCarry; \
2865 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2866 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2867 else /* Intel 10980XE: According to the first sub-shift: */ \
2868 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2869 *pfEFlags = fEfl; \
2870 } \
2871}
2872
2873#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2875#endif
2876EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2877EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2878
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2881#endif
2882EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2883EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
2909#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2910IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2911{ \
2912 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2913 if (cShift) \
2914 { \
2915 if (a_cBitsWidth < 32) \
2916 cShift &= a_cBitsWidth - 1; \
2917 a_uType const uDst = *puDst; \
2918 a_uType const uResult = a_fnHlp(uDst, cShift); \
2919 *puDst = uResult; \
2920 \
2921 /* Calc EFLAGS: */ \
2922 AssertCompile(X86_EFL_CF_BIT == 0); \
2923 uint32_t fEfl = *pfEFlags; \
2924 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2925 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2926 fEfl |= fCarry; \
2927 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2928 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2929 else /* Intel 10980XE: According to the first sub-shift: */ \
2930 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2931 *pfEFlags = fEfl; \
2932 } \
2933}
2934
2935#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2937#endif
2938EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2939EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2940
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2943#endif
2944EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2945EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
3034#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3035IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3036{ \
3037 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3038 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3039 cShift %= a_cBitsWidth + 1; \
3040 if (cShift) \
3041 { \
3042 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3043 cShift %= a_cBitsWidth + 1; \
3044 a_uType const uDst = *puDst; \
3045 a_uType uResult = uDst >> cShift; \
3046 if (cShift > 1) \
3047 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3048 \
3049 AssertCompile(X86_EFL_CF_BIT == 0); \
3050 uint32_t fEfl = *pfEFlags; \
3051 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3052 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3053 *puDst = uResult; \
3054 \
3055 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3056 it the same way as for 1 bit shifts. */ \
3057 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3058 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3059 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3060 fEfl |= fOutCarry; \
3061 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3062 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3063 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3064 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3065 *pfEFlags = fEfl; \
3066 } \
3067}
3068
3069#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3071#endif
3072EMIT_RCR(64, uint64_t, _intel, 1)
3073EMIT_RCR(64, uint64_t, _amd, 0)
3074
3075#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3076EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3077#endif
3078EMIT_RCR(32, uint32_t, _intel, 1)
3079EMIT_RCR(32, uint32_t, _amd, 0)
3080
3081#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3082EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3083#endif
3084EMIT_RCR(16, uint16_t, _intel, 1)
3085EMIT_RCR(16, uint16_t, _amd, 0)
3086
3087#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3088EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3089#endif
3090EMIT_RCR(8, uint8_t, _intel, 1)
3091EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
3097#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3098IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3099{ \
3100 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3101 if (cShift) \
3102 { \
3103 a_uType const uDst = *puDst; \
3104 a_uType uResult = uDst << cShift; \
3105 *puDst = uResult; \
3106 \
3107 /* Calc EFLAGS. */ \
3108 AssertCompile(X86_EFL_CF_BIT == 0); \
3109 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3110 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3111 fEfl |= fCarry; \
3112 if (!a_fIntelFlags) \
3113 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3114 else \
3115 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3116 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3117 fEfl |= X86_EFL_CALC_ZF(uResult); \
3118 fEfl |= g_afParity[uResult & 0xff]; \
3119 if (!a_fIntelFlags) \
3120 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3121 *pfEFlags = fEfl; \
3122 } \
3123}
3124
3125#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3126EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3127#endif
3128EMIT_SHL(64, uint64_t, _intel, 1)
3129EMIT_SHL(64, uint64_t, _amd, 0)
3130
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3133#endif
3134EMIT_SHL(32, uint32_t, _intel, 1)
3135EMIT_SHL(32, uint32_t, _amd, 0)
3136
3137#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3138EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3139#endif
3140EMIT_SHL(16, uint16_t, _intel, 1)
3141EMIT_SHL(16, uint16_t, _amd, 0)
3142
3143#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3144EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3145#endif
3146EMIT_SHL(8, uint8_t, _intel, 1)
3147EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
3153#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3154IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3155{ \
3156 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3157 if (cShift) \
3158 { \
3159 a_uType const uDst = *puDst; \
3160 a_uType uResult = uDst >> cShift; \
3161 *puDst = uResult; \
3162 \
3163 /* Calc EFLAGS. */ \
3164 AssertCompile(X86_EFL_CF_BIT == 0); \
3165 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3166 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3167 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3168 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3169 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3170 fEfl |= X86_EFL_CALC_ZF(uResult); \
3171 fEfl |= g_afParity[uResult & 0xff]; \
3172 if (!a_fIntelFlags) \
3173 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3174 *pfEFlags = fEfl; \
3175 } \
3176}
3177
3178#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3179EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3180#endif
3181EMIT_SHR(64, uint64_t, _intel, 1)
3182EMIT_SHR(64, uint64_t, _amd, 0)
3183
3184#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3186#endif
3187EMIT_SHR(32, uint32_t, _intel, 1)
3188EMIT_SHR(32, uint32_t, _amd, 0)
3189
3190#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3192#endif
3193EMIT_SHR(16, uint16_t, _intel, 1)
3194EMIT_SHR(16, uint16_t, _amd, 0)
3195
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3198#endif
3199EMIT_SHR(8, uint8_t, _intel, 1)
3200EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
3206#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (cShift) \
3211 { \
3212 a_iType const iDst = (a_iType)*puDst; \
3213 a_uType uResult = iDst >> cShift; \
3214 *puDst = uResult; \
3215 \
3216 /* Calc EFLAGS. \
3217 Note! The OF flag is always zero because the result never differs from the input. */ \
3218 AssertCompile(X86_EFL_CF_BIT == 0); \
3219 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3220 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3221 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3222 fEfl |= X86_EFL_CALC_ZF(uResult); \
3223 fEfl |= g_afParity[uResult & 0xff]; \
3224 if (!a_fIntelFlags) \
3225 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3226 *pfEFlags = fEfl; \
3227 } \
3228}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
3271#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3272IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3273 uint32_t *pfEFlags)) \
3274{ \
3275 cShift &= a_cBitsWidth - 1; \
3276 if (cShift) \
3277 { \
3278 a_uType const uDst = *puDst; \
3279 a_uType uResult = uDst << cShift; \
3280 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3281 *puDst = uResult; \
3282 \
3283 /* CALC EFLAGS: */ \
3284 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3285 if (a_fIntelFlags) \
3286 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3287 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3288 else \
3289 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3290 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3291 fEfl |= X86_EFL_AF; \
3292 } \
3293 AssertCompile(X86_EFL_CF_BIT == 0); \
3294 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3295 fEfl |= g_afParity[uResult & 0xff]; \
3296 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3297 fEfl |= X86_EFL_CALC_ZF(uResult); \
3298 *pfEFlags = fEfl; \
3299 } \
3300}
3301
3302#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3303EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3304#endif
3305EMIT_SHLD(64, uint64_t, _intel, 1)
3306EMIT_SHLD(64, uint64_t, _amd, 0)
3307
3308#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3309EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3310#endif
3311EMIT_SHLD(32, uint32_t, _intel, 1)
3312EMIT_SHLD(32, uint32_t, _amd, 0)
3313
3314#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3315IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3316{ \
3317 cShift &= 31; \
3318 if (cShift) \
3319 { \
3320 uint16_t const uDst = *puDst; \
3321 uint64_t const uTmp = a_fIntelFlags \
3322 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3323 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3324 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3325 *puDst = uResult; \
3326 \
3327 /* CALC EFLAGS: */ \
3328 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3329 AssertCompile(X86_EFL_CF_BIT == 0); \
3330 if (a_fIntelFlags) \
3331 { \
3332 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3333 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3334 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3335 } \
3336 else \
3337 { \
3338 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3339 if (cShift < 16) \
3340 { \
3341 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3342 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3343 } \
3344 else \
3345 { \
3346 if (cShift == 16) \
3347 fEfl |= uDst & X86_EFL_CF; \
3348 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3349 } \
3350 fEfl |= X86_EFL_AF; \
3351 } \
3352 fEfl |= g_afParity[uResult & 0xff]; \
3353 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3354 fEfl |= X86_EFL_CALC_ZF(uResult); \
3355 *pfEFlags = fEfl; \
3356 } \
3357}
3358
3359#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3360EMIT_SHLD_16(RT_NOTHING, 1)
3361#endif
3362EMIT_SHLD_16(_intel, 1)
3363EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
3383#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3384IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3385{ \
3386 cShift &= a_cBitsWidth - 1; \
3387 if (cShift) \
3388 { \
3389 a_uType const uDst = *puDst; \
3390 a_uType uResult = uDst >> cShift; \
3391 uResult |= uSrc << (a_cBitsWidth - cShift); \
3392 *puDst = uResult; \
3393 \
3394 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3395 AssertCompile(X86_EFL_CF_BIT == 0); \
3396 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3397 if (a_fIntelFlags) \
3398 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3399 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3400 else \
3401 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3402 if (cShift > 1) /* Set according to last shift. */ \
3403 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3404 else \
3405 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3406 fEfl |= X86_EFL_AF; \
3407 } \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= g_afParity[uResult & 0xff]; \
3411 *pfEFlags = fEfl; \
3412 } \
3413}
3414
3415#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3416EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3417#endif
3418EMIT_SHRD(64, uint64_t, _intel, 1)
3419EMIT_SHRD(64, uint64_t, _amd, 0)
3420
3421#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3422EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3423#endif
3424EMIT_SHRD(32, uint32_t, _intel, 1)
3425EMIT_SHRD(32, uint32_t, _amd, 0)
3426
3427#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3428IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3429{ \
3430 cShift &= 31; \
3431 if (cShift) \
3432 { \
3433 uint16_t const uDst = *puDst; \
3434 uint64_t const uTmp = a_fIntelFlags \
3435 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3436 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3437 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3438 *puDst = uResult; \
3439 \
3440 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3441 AssertCompile(X86_EFL_CF_BIT == 0); \
3442 if (a_fIntelFlags) \
3443 { \
3444 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3445 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3446 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3447 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3448 } \
3449 else \
3450 { \
3451 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3452 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3453 /* AMD 3990X: Set according to last shift. AF always set. */ \
3454 if (cShift > 1) /* Set according to last shift. */ \
3455 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3456 else \
3457 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3458 fEfl |= X86_EFL_AF; \
3459 } \
3460 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3461 fEfl |= X86_EFL_CALC_ZF(uResult); \
3462 fEfl |= g_afParity[uResult & 0xff]; \
3463 *pfEFlags = fEfl; \
3464 } \
3465}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
3642IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3643{
3644 ASMReadFence();
3645}
3646
3647
3648IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3649{
3650 ASMWriteFence();
3651}
3652
3653
3654IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3655{
3656 ASMMemoryFence();
3657}
3658
3659
3660# ifndef RT_ARCH_ARM64
3661IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3662{
3663 ASMMemoryFence();
3664}
3665# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-zero or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Zero */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags realted to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-zero or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Zero */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags realted to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
4933AssertCompileSize(RTFLOAT128U, 16);
4934AssertCompileSize(RTFLOAT80U, 10);
4935AssertCompileSize(RTFLOAT64U, 8);
4936AssertCompileSize(RTFLOAT32U, 4);
4937
4938/**
4939 * Normalizes a possible pseudo-normal value.
4940 *
4941 * Psuedo-normal values are some oddities from the 8087 & 287 days. They are
4942 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4943 * i.e. changing uExponent from 0 to 1.
4944 *
4945 * This macro will declare a RTFLOAT80U with the name given by
4946 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4947 * a normalization was performed.
4948 *
4949 * @note This must be applied before calling SoftFloat with a value that couldbe
4950 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4951 * correctly.
4952 */
4953#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4954 RTFLOAT80U a_r80ValNormalized; \
4955 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4956 { \
4957 a_r80ValNormalized = *a_pr80Val; \
4958 a_r80ValNormalized.s.uExponent = 1; \
4959 a_pr80Val = &a_r80ValNormalized; \
4960 } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
5070/** Initializer for the SoftFloat state structure. */
5071# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5072 { \
5073 softfloat_tininess_afterRounding, \
5074 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5075 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5076 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5077 : (uint8_t)softfloat_round_minMag, \
5078 0, \
5079 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5080 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5081 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5082 }
5083
5084/** Returns updated FSW from a SoftFloat state and exception mask (FCW). */
5085# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5086 ( (a_fFsw) \
5087 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5088 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5089 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5090 ? X86_FSW_ES | X86_FSW_B : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
5139# if 0 /* unused */
5140DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5141{
5142 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5143 return r128;
5144}
5145# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
5572/** For implementing iemAImpl_fmul_r80_by_r64 and such. */
5573#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
5574IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
5575{ \
5576 RTFLOAT80U r80Val2; \
5577 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
5578 Assert(!fFsw || fFsw == X86_FSW_DE); \
5579 if (fFsw) \
5580 { \
5581 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5582 fFsw = 0; \
5583 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5584 { \
5585 pFpuRes->r80Result = *pr80Val1; \
5586 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5587 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5588 return; \
5589 } \
5590 } \
5591 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5592 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5593}
5594
5595/** For implementing iemAImpl_fmul_r80_by_r32 and such. */
5596#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
5597IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
5598{ \
5599 RTFLOAT80U r80Val2; \
5600 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
5601 Assert(!fFsw || fFsw == X86_FSW_DE); \
5602 if (fFsw) \
5603 { \
5604 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5605 fFsw = 0; \
5606 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5607 { \
5608 pFpuRes->r80Result = *pr80Val1; \
5609 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5610 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5611 return; \
5612 } \
5613 } \
5614 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5615 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5616}
5617
5618/** For implementing iemAImpl_fimul_r80_by_i32 and such. */
5619#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
5620IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
5621{ \
5622 RTFLOAT80U r80Val2; \
5623 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2)); \
5624 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5625}
5626
5627/** For implementing iemAImpl_fimul_r80_by_i16 and such. */
5628#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
5629IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
5630{ \
5631 RTFLOAT80U r80Val2; \
5632 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2)); \
5633 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5634}
5635
5636
5637
5638/*********************************************************************************************************************************
5639* x86 FPU Division Operations *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
6114EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6115EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6116EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6117EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
6121* x87 FPU Trigometric Operations *
6122*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
6170IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6171 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6172{
6173 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6174}
6175
6176IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6177 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6178{
6179 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6180}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
6184static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6185{
6186 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6187 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6188 extFloat80_t v;
6189 (void)fFcw;
6190
6191 v = extF80_tan(x, &SoftState);
6192
6193 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6194 return fFsw;
6195}
6196
6197IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6198{
6199 uint16_t const fFcw = pFpuState->FCW;
6200 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6201
6202 if (RTFLOAT80U_IS_ZERO(pr80Val))
6203 {
6204 pFpuResTwo->r80Result1 = *pr80Val;
6205 pFpuResTwo->r80Result2 = g_ar80One[0];
6206 }
6207 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6208 {
6209 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6210 {
6211 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6212 pFpuResTwo->r80Result1 = *pr80Val;
6213 }
6214 else
6215 {
6216 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6217 {
6218 pFpuResTwo->r80Result1 = *pr80Val;
6219 }
6220 else
6221 {
6222 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6223 }
6224
6225 pFpuResTwo->r80Result2 = g_ar80One[0];
6226
6227 fFsw |= X86_FSW_PE;
6228 if (!(fFcw & X86_FCW_PM))
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 }
6232 else
6233 {
6234 fFsw |= X86_FSW_IE;
6235 if (!(fFcw & X86_FCW_IM))
6236 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6237 }
6238
6239 pFpuResTwo->FSW = fFsw;
6240}
6241#endif /* IEM_WITHOUT_ASSEMBLY */
6242
6243IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6244{
6245 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6246}
6247
6248IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6249{
6250 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6251}
6252
6253#ifdef IEM_WITHOUT_ASSEMBLY
6254
6255static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6256{
6257 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6258 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6259 extFloat80_t v;
6260 (void)fFcw;
6261
6262 v = extF80_sin(x, &SoftState);
6263
6264 iemFpuSoftF80ToIprt(pr80Result, v);
6265
6266 return fFsw;
6267}
6268
6269IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6270{
6271 uint16_t const fFcw = pFpuState->FCW;
6272 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6273
6274 if (RTFLOAT80U_IS_ZERO(pr80Val))
6275 {
6276 pFpuRes->r80Result = *pr80Val;
6277 }
6278 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6279 {
6280 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6281 {
6282 fFsw |= X86_FSW_C2;
6283 pFpuRes->r80Result = *pr80Val;
6284 }
6285 else
6286 {
6287 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6288 {
6289 pFpuRes->r80Result = *pr80Val;
6290 }
6291 else
6292 {
6293 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6294 }
6295 fFsw |= X86_FSW_PE;
6296 if (!(fFcw & X86_FCW_PM))
6297 fFsw |= X86_FSW_ES | X86_FSW_B;
6298 }
6299 }
6300 else if (RTFLOAT80U_IS_INF(pr80Val))
6301 {
6302 fFsw |= X86_FSW_IE;
6303 if (!(fFcw & X86_FCW_IM))
6304 {
6305 fFsw |= X86_FSW_ES | X86_FSW_B;
6306 pFpuRes->r80Result = *pr80Val;
6307 }
6308 else
6309 {
6310 pFpuRes->r80Result = g_r80Indefinite;
6311 }
6312 }
6313 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6314 {
6315 fFsw |= X86_FSW_DE;
6316
6317 if (fFcw & X86_FCW_DM)
6318 {
6319 if (fFcw & X86_FCW_UM)
6320 {
6321 pFpuRes->r80Result = *pr80Val;
6322 }
6323 else
6324 {
6325 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6326 uint64_t uMantissa = pr80Val->s.uMantissa;
6327 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6328
6329 uExponent = 64 - uExponent;
6330 uMantissa <<= uExponent;
6331 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6332
6333 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6334 pFpuRes->r80Result.s.uMantissa = uMantissa;
6335 pFpuRes->r80Result.s.uExponent = uExponent;
6336 }
6337
6338 fFsw |= X86_FSW_UE | X86_FSW_PE;
6339
6340 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6341 {
6342 /* All the exceptions are masked. */
6343 }
6344 else
6345 {
6346 fFsw |= X86_FSW_ES | X86_FSW_B;
6347 }
6348 }
6349 else
6350 {
6351 pFpuRes->r80Result = *pr80Val;
6352
6353 fFsw |= X86_FSW_ES | X86_FSW_B;
6354 }
6355 }
6356 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6357 {
6358 pFpuRes->r80Result = *pr80Val;
6359 fFsw |= X86_FSW_DE;
6360
6361 if (fFcw & X86_FCW_DM)
6362 {
6363 if (fFcw & X86_FCW_PM)
6364 {
6365 fFsw |= X86_FSW_PE;
6366 }
6367 else
6368 {
6369 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6370 }
6371
6372 pFpuRes->r80Result.sj64.uExponent = 1;
6373 }
6374 else
6375 {
6376 fFsw |= X86_FSW_ES | X86_FSW_B;
6377 }
6378 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6379 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6380 {
6381 pFpuRes->r80Result = *pr80Val;
6382 } else {
6383 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6384 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6385 && (fFcw & X86_FCW_IM))
6386 pFpuRes->r80Result = g_r80Indefinite;
6387 else
6388 {
6389 pFpuRes->r80Result = *pr80Val;
6390 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6391 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6392 }
6393
6394 fFsw |= X86_FSW_IE;
6395 if (!(fFcw & X86_FCW_IM))
6396 fFsw |= X86_FSW_ES | X86_FSW_B;
6397 }
6398
6399 pFpuRes->FSW = fFsw;
6400}
6401#endif /* IEM_WITHOUT_ASSEMBLY */
6402
6403IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6404{
6405 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6406}
6407
6408IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6409{
6410 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6411}
6412
6413#ifdef IEM_WITHOUT_ASSEMBLY
6414
6415static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6416{
6417 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6418 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6419 extFloat80_t v;
6420 (void)fFcw;
6421
6422 v = extF80_cos(x, &SoftState);
6423
6424 iemFpuSoftF80ToIprt(pr80Result, v);
6425
6426 return fFsw;
6427}
6428
6429IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6430{
6431 uint16_t const fFcw = pFpuState->FCW;
6432 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6433
6434 if (RTFLOAT80U_IS_ZERO(pr80Val))
6435 {
6436 pFpuRes->r80Result = g_ar80One[0];
6437 }
6438 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6439 {
6440 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6441 {
6442 fFsw |= X86_FSW_C2;
6443 pFpuRes->r80Result = *pr80Val;
6444 }
6445 else
6446 {
6447 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6448 {
6449 pFpuRes->r80Result = g_ar80One[0];
6450
6451 }
6452 else
6453 {
6454 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6455 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6456 }
6457 fFsw |= X86_FSW_PE;
6458 if (!(fFcw & X86_FCW_PM))
6459 fFsw |= X86_FSW_ES | X86_FSW_B;
6460 }
6461 }
6462 else if (RTFLOAT80U_IS_INF(pr80Val))
6463 {
6464 fFsw |= X86_FSW_IE;
6465 if (!(fFcw & X86_FCW_IM))
6466 {
6467 fFsw |= X86_FSW_ES | X86_FSW_B;
6468 pFpuRes->r80Result = *pr80Val;
6469 }
6470 else
6471 {
6472 pFpuRes->r80Result = g_r80Indefinite;
6473 }
6474 }
6475 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6476 {
6477 fFsw |= X86_FSW_DE;
6478
6479 if (fFcw & X86_FCW_DM)
6480 {
6481 pFpuRes->r80Result = g_ar80One[0];
6482
6483 if (fFcw & X86_FCW_PM)
6484 {
6485 fFsw |= X86_FSW_PE;
6486 }
6487 else
6488 {
6489 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6490 }
6491 }
6492 else
6493 {
6494 pFpuRes->r80Result = *pr80Val;
6495 fFsw |= X86_FSW_ES | X86_FSW_B;
6496 }
6497 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6498 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6499 {
6500 pFpuRes->r80Result = *pr80Val;
6501 } else {
6502 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6503 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6504 && (fFcw & X86_FCW_IM))
6505 pFpuRes->r80Result = g_r80Indefinite;
6506 else
6507 {
6508 pFpuRes->r80Result = *pr80Val;
6509 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6510 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6511 }
6512
6513 fFsw |= X86_FSW_IE;
6514 if (!(fFcw & X86_FCW_IM))
6515 fFsw |= X86_FSW_ES | X86_FSW_B;
6516 }
6517
6518 pFpuRes->FSW = fFsw;
6519}
6520#endif /* IEM_WITHOUT_ASSEMBLY */
6521
6522IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6523{
6524 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6525}
6526
6527IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6528{
6529 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6530}
6531
6532#ifdef IEM_WITHOUT_ASSEMBLY
6533
6534static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6535{
6536 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6537 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6538 extFloat80_t r80Sin, r80Cos;
6539 (void)fFcw;
6540
6541 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6542
6543 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6544 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6545
6546 return fFsw;
6547}
6548
6549IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6550{
6551 uint16_t const fFcw = pFpuState->FCW;
6552 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6553
6554 if (RTFLOAT80U_IS_ZERO(pr80Val))
6555 {
6556 pFpuResTwo->r80Result1 = *pr80Val;
6557 pFpuResTwo->r80Result2 = g_ar80One[0];
6558 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6559 }
6560 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6561 {
6562 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6563 {
6564 fFsw |= X86_FSW_C2;
6565
6566 if (fFcw & X86_FCW_IM)
6567 {
6568 pFpuResTwo->r80Result1 = g_r80Indefinite;
6569 }
6570 else
6571 {
6572 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6573 }
6574
6575 pFpuResTwo->r80Result2 = *pr80Val;
6576 }
6577 else
6578 {
6579 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6580
6581 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6582 {
6583 pFpuResTwo->r80Result1 = *pr80Val;
6584 pFpuResTwo->r80Result2 = g_ar80One[0];
6585 }
6586 else
6587 {
6588 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6589 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6590 }
6591 fFsw |= X86_FSW_PE;
6592 if (!(fFcw & X86_FCW_PM))
6593 fFsw |= X86_FSW_ES | X86_FSW_B;
6594 }
6595 }
6596 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6597 {
6598 fFsw |= X86_FSW_DE;
6599
6600 if (fFcw & X86_FCW_DM)
6601 {
6602 pFpuResTwo->r80Result1 = *pr80Val;
6603 pFpuResTwo->r80Result2 = g_ar80One[0];
6604 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6605
6606 if (fFcw & X86_FCW_PM)
6607 {
6608 fFsw |= X86_FSW_PE;
6609 }
6610 else
6611 {
6612 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6613 }
6614
6615 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6616 }
6617 else
6618 {
6619 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6620 pFpuResTwo->r80Result2 = *pr80Val;
6621 fFsw |= X86_FSW_ES | X86_FSW_B;
6622 }
6623 }
6624 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6625 {
6626 fFsw |= X86_FSW_DE;
6627
6628 if (fFcw & X86_FCW_DM)
6629 {
6630 pFpuResTwo->r80Result2 = g_ar80One[0];
6631
6632 if (fFcw & X86_FCW_UM)
6633 {
6634 pFpuResTwo->r80Result1 = *pr80Val;
6635 }
6636 else
6637 {
6638 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6639 uint64_t uMantissa = pr80Val->s.uMantissa;
6640 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6641
6642 uExponent = 64 - uExponent;
6643 uMantissa <<= uExponent;
6644 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6645
6646 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6647 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6648 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6649 }
6650
6651 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6652 fFsw |= X86_FSW_UE | X86_FSW_PE;
6653
6654 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6655 {
6656 /* All the exceptions are masked. */
6657 }
6658 else
6659 {
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662 }
6663 else
6664 {
6665 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6666 pFpuResTwo->r80Result2 = *pr80Val;
6667 fFsw |= X86_FSW_ES | X86_FSW_B;
6668 }
6669 }
6670 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6671 {
6672 pFpuResTwo->r80Result1 = *pr80Val;
6673 pFpuResTwo->r80Result2 = *pr80Val;
6674 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6675 }
6676 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6677 {
6678 if (fFcw & X86_FCW_IM)
6679 {
6680 pFpuResTwo->r80Result1 = g_r80Indefinite;
6681 pFpuResTwo->r80Result2 = g_r80Indefinite;
6682 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6683 }
6684 else
6685 {
6686 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6687 pFpuResTwo->r80Result2 = *pr80Val;
6688 }
6689
6690 fFsw |= X86_FSW_IE;
6691 if (!(fFcw & X86_FCW_IM))
6692 fFsw |= X86_FSW_ES | X86_FSW_B;
6693 }
6694 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6695 {
6696 pFpuResTwo->r80Result1 = *pr80Val;
6697 pFpuResTwo->r80Result2 = *pr80Val;
6698
6699 if (fFcw & X86_FCW_IM)
6700 {
6701 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6702 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else
6706 {
6707 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6708 pFpuResTwo->r80Result2 = *pr80Val;
6709 }
6710
6711 fFsw |= X86_FSW_IE;
6712 if (!(fFcw & X86_FCW_IM))
6713 fFsw |= X86_FSW_ES | X86_FSW_B;
6714 }
6715 else if (RTFLOAT80U_IS_INF(pr80Val))
6716 {
6717 if (fFcw & X86_FCW_IM)
6718 {
6719 pFpuResTwo->r80Result1 = g_r80Indefinite;
6720 pFpuResTwo->r80Result2 = g_r80Indefinite;
6721 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6722 }
6723 else
6724 {
6725 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6726 pFpuResTwo->r80Result2 = *pr80Val;
6727 }
6728
6729 fFsw |= X86_FSW_IE;
6730 if (!(fFcw & X86_FCW_IM))
6731 fFsw |= X86_FSW_ES | X86_FSW_B;
6732 }
6733
6734 pFpuResTwo->FSW = fFsw;
6735}
6736#endif /* IEM_WITHOUT_ASSEMBLY */
6737
6738IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6739{
6740 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6741}
6742
6743IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6744{
6745 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6746}
6747
6748#ifdef IEM_WITHOUT_ASSEMBLY
6749
6750
6751/*********************************************************************************************************************************
6752* x87 FPU Compare and Testing Operations *
6753*********************************************************************************************************************************/
6754
6755IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6756{
6757 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6758
6759 if (RTFLOAT80U_IS_ZERO(pr80Val))
6760 fFsw |= X86_FSW_C3;
6761 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6762 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6763 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6764 {
6765 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6766 if (!(pFpuState->FCW & X86_FCW_DM))
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 else
6770 {
6771 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6772 if (!(pFpuState->FCW & X86_FCW_IM))
6773 fFsw |= X86_FSW_ES | X86_FSW_B;
6774 }
6775
6776 *pu16Fsw = fFsw;
6777}
6778
6779
6780IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6781{
6782 RT_NOREF(pFpuState);
6783 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6784
6785 /* C1 = sign bit (always, even if empty Intel says). */
6786 if (pr80Val->s.fSign)
6787 fFsw |= X86_FSW_C1;
6788
6789 /* Classify the value in C0, C2, C3. */
6790 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6791 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6792 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6793 fFsw |= X86_FSW_C2;
6794 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6795 fFsw |= X86_FSW_C3;
6796 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6797 fFsw |= X86_FSW_C0;
6798 else if (RTFLOAT80U_IS_INF(pr80Val))
6799 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6800 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6801 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6802 /* whatever else: 0 */
6803
6804 *pu16Fsw = fFsw;
6805}
6806
6807
6808/**
6809 * Worker for fcom, fucom, and friends.
6810 */
6811static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6812 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6813{
6814 /*
6815 * Unpack the values.
6816 */
6817 bool const fSign1 = pr80Val1->s.fSign;
6818 int32_t iExponent1 = pr80Val1->s.uExponent;
6819 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6820
6821 bool const fSign2 = pr80Val2->s.fSign;
6822 int32_t iExponent2 = pr80Val2->s.uExponent;
6823 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6824
6825 /*
6826 * Check for invalid inputs.
6827 */
6828 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6829 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6830 {
6831 if (!(fFcw & X86_FCW_IM))
6832 fFsw |= X86_FSW_ES | X86_FSW_B;
6833 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6834 }
6835
6836 /*
6837 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6838 */
6839 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6840 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6841 {
6842 if ( fIeOnAllNaNs
6843 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6844 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6845 {
6846 fFsw |= X86_FSW_IE;
6847 if (!(fFcw & X86_FCW_IM))
6848 fFsw |= X86_FSW_ES | X86_FSW_B;
6849 }
6850 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6851 }
6852
6853 /*
6854 * Normalize the values.
6855 */
6856 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6857 {
6858 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6859 iExponent1 = 1;
6860 else
6861 {
6862 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6863 uMantissa1 <<= iExponent1;
6864 iExponent1 = 1 - iExponent1;
6865 }
6866 fFsw |= X86_FSW_DE;
6867 if (!(fFcw & X86_FCW_DM))
6868 fFsw |= X86_FSW_ES | X86_FSW_B;
6869 }
6870
6871 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6872 {
6873 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6874 iExponent2 = 1;
6875 else
6876 {
6877 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6878 uMantissa2 <<= iExponent2;
6879 iExponent2 = 1 - iExponent2;
6880 }
6881 fFsw |= X86_FSW_DE;
6882 if (!(fFcw & X86_FCW_DM))
6883 fFsw |= X86_FSW_ES | X86_FSW_B;
6884 }
6885
6886 /*
6887 * Test if equal (val1 == val2):
6888 */
6889 if ( uMantissa1 == uMantissa2
6890 && iExponent1 == iExponent2
6891 && ( fSign1 == fSign2
6892 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6893 fFsw |= X86_FSW_C3;
6894 /*
6895 * Test if less than (val1 < val2):
6896 */
6897 else if (fSign1 && !fSign2)
6898 fFsw |= X86_FSW_C0;
6899 else if (fSign1 == fSign2)
6900 {
6901 /* Zeros are problematic, however at the most one can be zero here. */
6902 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6903 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6904 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6905 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6906
6907 if ( fSign1
6908 ^ ( iExponent1 < iExponent2
6909 || ( iExponent1 == iExponent2
6910 && uMantissa1 < uMantissa2 ) ) )
6911 fFsw |= X86_FSW_C0;
6912 }
6913 /* else: No flags set if greater. */
6914
6915 return fFsw;
6916}
6917
6918
6919IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6920 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6921{
6922 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6923}
6924
6925
6926
6927
6928IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6929 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6930{
6931 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6932}
6933
6934
6935IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6936 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6937{
6938 RTFLOAT80U r80Val2;
6939 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6940 Assert(!fFsw || fFsw == X86_FSW_DE);
6941 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6942 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6943 {
6944 if (!(pFpuState->FCW & X86_FCW_DM))
6945 fFsw |= X86_FSW_ES | X86_FSW_B;
6946 *pfFsw |= fFsw;
6947 }
6948}
6949
6950
6951IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6952 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6953{
6954 RTFLOAT80U r80Val2;
6955 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6956 Assert(!fFsw || fFsw == X86_FSW_DE);
6957 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6958 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6959 {
6960 if (!(pFpuState->FCW & X86_FCW_DM))
6961 fFsw |= X86_FSW_ES | X86_FSW_B;
6962 *pfFsw |= fFsw;
6963 }
6964}
6965
6966
6967IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6968 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6969{
6970 RTFLOAT80U r80Val2;
6971 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6972 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6973}
6974
6975
6976IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6977 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6978{
6979 RTFLOAT80U r80Val2;
6980 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6981 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6982}
6983
6984
6985/**
6986 * Worker for fcomi & fucomi.
6987 */
6988static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6989 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6990{
6991 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6992 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6993 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6994 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6995
6996 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6997 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6998 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6999}
7000
7001
7002IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7003 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7004{
7005 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7006}
7007
7008
7009IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7010 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7011{
7012 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7013}
7014
7015
7016/*********************************************************************************************************************************
7017* x87 FPU Other Operations *
7018*********************************************************************************************************************************/
7019
7020/**
7021 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7022 */
7023static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7024{
7025 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7026 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7027 true /*exact / generate #PE */, &SoftState));
7028 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7029}
7030
7031
7032IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7033{
7034 uint16_t const fFcw = pFpuState->FCW;
7035 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7036
7037 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7038 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7039 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7040 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7041 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7042 || RTFLOAT80U_IS_INF(pr80Val))
7043 pFpuRes->r80Result = *pr80Val;
7044 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7045 {
7046 fFsw |= X86_FSW_DE;
7047 if (fFcw & X86_FCW_DM)
7048 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7049 else
7050 {
7051 pFpuRes->r80Result = *pr80Val;
7052 fFsw |= X86_FSW_ES | X86_FSW_B;
7053 }
7054 }
7055 else
7056 {
7057 if (fFcw & X86_FCW_IM)
7058 {
7059 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7060 pFpuRes->r80Result = g_r80Indefinite;
7061 else
7062 {
7063 pFpuRes->r80Result = *pr80Val;
7064 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7065 }
7066 }
7067 else
7068 {
7069 pFpuRes->r80Result = *pr80Val;
7070 fFsw |= X86_FSW_ES | X86_FSW_B;
7071 }
7072 fFsw |= X86_FSW_IE;
7073 }
7074 pFpuRes->FSW = fFsw;
7075}
7076
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7079 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7080{
7081 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7082 it does everything we need it to do. */
7083 uint16_t const fFcw = pFpuState->FCW;
7084 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7085 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7086 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7087 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7088}
7089
7090
7091/**
7092 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7093 */
7094static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7095{
7096 Assert(!pr80Val->s.fSign);
7097 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7098 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7099 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7100}
7101
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7104{
7105 uint16_t const fFcw = pFpuState->FCW;
7106 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7107
7108 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7109 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7110 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7111 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7112 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7113 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7114 pFpuRes->r80Result = *pr80Val;
7115 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7116 {
7117 fFsw |= X86_FSW_DE;
7118 if (fFcw & X86_FCW_DM)
7119 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7120 else
7121 {
7122 pFpuRes->r80Result = *pr80Val;
7123 fFsw |= X86_FSW_ES | X86_FSW_B;
7124 }
7125 }
7126 else
7127 {
7128 if (fFcw & X86_FCW_IM)
7129 {
7130 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7131 pFpuRes->r80Result = g_r80Indefinite;
7132 else
7133 {
7134 pFpuRes->r80Result = *pr80Val;
7135 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7136 }
7137 }
7138 else
7139 {
7140 pFpuRes->r80Result = *pr80Val;
7141 fFsw |= X86_FSW_ES | X86_FSW_B;
7142 }
7143 fFsw |= X86_FSW_IE;
7144 }
7145 pFpuRes->FSW = fFsw;
7146}
7147
7148
7149/**
7150 * @code{.unparsed}
7151 * x x * ln2
7152 * f(x) = 2 - 1 = e - 1
7153 *
7154 * @endcode
7155 *
7156 * We can approximate e^x by a Taylor/Maclaurin series (see
7157 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7158 * @code{.unparsed}
7159 * n 0 1 2 3 4
7160 * inf x x x x x x
7161 * SUM ----- = --- + --- + --- + --- + --- + ...
7162 * n=0 n! 0! 1! 2! 3! 4!
7163 *
7164 * 2 3 4
7165 * x x x
7166 * = 1 + x + --- + --- + --- + ...
7167 * 2! 3! 4!
7168 * @endcode
7169 *
7170 * Given z = x * ln2, we get:
7171 * @code{.unparsed}
7172 * 2 3 4 n
7173 * z z z z z
7174 * e - 1 = z + --- + --- + --- + ... + ---
7175 * 2! 3! 4! n!
7176 * @endcode
7177 *
7178 * Wanting to use Horner's method, we move one z outside and get:
7179 * @code{.unparsed}
7180 * 2 3 (n-1)
7181 * z z z z
7182 * = z ( 1 + --- + --- + --- + ... + ------- )
7183 * 2! 3! 4! n!
7184 * @endcode
7185 *
7186 * The constants we need for using Horner's methods are 1 and 1 / n!.
7187 *
7188 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7189 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7190 * and can approximate it to be 1.0. For a visual demonstration of this
7191 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7192 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7193 *
7194 *
7195 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7196 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7197 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7198 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7199 * blocks). (The one bit difference is probably an implicit one missing from
7200 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7201 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7202 * exponent.
7203 *
7204 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7205 * successfully reproduced the exact results from an Intel 10980XE, there is
7206 * always a portition of rounding differences. Not going to spend too much time
7207 * on getting this 100% the same, at least not now.
7208 *
7209 * P.S. If someone are really curious about 8087 and its contstants:
7210 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7211 *
7212 *
7213 * @param pr80Val The exponent value (x), less than 1.0, greater than
7214 * -1.0 and not zero. This can be a normal, denormal
7215 * or pseudo-denormal value.
7216 * @param pr80Result Where to return the result.
7217 * @param fFcw FPU control word.
7218 * @param fFsw FPU status word.
7219 */
7220static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7221{
7222 /* As mentioned above, we can skip the expensive polynomial calculation
7223 as it will be close enough to 1.0 that it makes no difference.
7224
7225 The cutoff point for intel 10980XE is exponents >= -69. Intel
7226 also seems to be using a 67-bit or 68-bit constant value, and we get
7227 a smattering of rounding differences if we go for higher precision. */
7228 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7229 {
7230 RTUINT256U u256;
7231 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7232 u256.QWords.qw0 |= 1; /* force #PE */
7233 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7234 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7235 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7236 : 1 - RTFLOAT80U_EXP_BIAS,
7237 fFcw, fFsw);
7238 }
7239 else
7240 {
7241#ifdef IEM_WITH_FLOAT128_FOR_FPU
7242 /* This approach is not good enough for small values, we end up with zero. */
7243 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7244 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7245 _Float128 rd128Result = powf128(2.0L, rd128Val);
7246 rd128Result -= 1.0L;
7247 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7248 iemFpuF128RestoreRounding(fOldRounding);
7249
7250# else
7251 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7252 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7253
7254 /* As mentioned above, enforce 68-bit internal mantissa width to better
7255 match the Intel 10980XE results. */
7256 unsigned const cPrecision = 68;
7257
7258 /* first calculate z = x * ln2 */
7259 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7260 cPrecision);
7261
7262 /* Then do the polynomial evaluation. */
7263 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7264 cPrecision, &SoftState);
7265 r = f128_mul(z, r, &SoftState);
7266
7267 /* Output the result. */
7268 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7269# endif
7270 }
7271 return fFsw;
7272}
7273
7274
7275IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7276{
7277 uint16_t const fFcw = pFpuState->FCW;
7278 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7279
7280 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7281 {
7282 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7283 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7284 else
7285 {
7286 /* Special case:
7287 2^+1.0 - 1.0 = 1.0
7288 2^-1.0 - 1.0 = -0.5 */
7289 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7290 && pr80Val->s.uMantissa == RT_BIT_64(63))
7291 {
7292 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7293 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7294 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7295 }
7296 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7297 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7298 else
7299 pFpuRes->r80Result = *pr80Val;
7300 fFsw |= X86_FSW_PE;
7301 if (!(fFcw & X86_FCW_PM))
7302 fFsw |= X86_FSW_ES | X86_FSW_B;
7303 }
7304 }
7305 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7306 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7307 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7308 pFpuRes->r80Result = *pr80Val;
7309 else if (RTFLOAT80U_IS_INF(pr80Val))
7310 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7311 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7312 {
7313 fFsw |= X86_FSW_DE;
7314 if (fFcw & X86_FCW_DM)
7315 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7316 else
7317 {
7318 pFpuRes->r80Result = *pr80Val;
7319 fFsw |= X86_FSW_ES | X86_FSW_B;
7320 }
7321 }
7322 else
7323 {
7324 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7325 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7326 && (fFcw & X86_FCW_IM))
7327 pFpuRes->r80Result = g_r80Indefinite;
7328 else
7329 {
7330 pFpuRes->r80Result = *pr80Val;
7331 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7332 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7333 }
7334 fFsw |= X86_FSW_IE;
7335 if (!(fFcw & X86_FCW_IM))
7336 fFsw |= X86_FSW_ES | X86_FSW_B;
7337 }
7338 pFpuRes->FSW = fFsw;
7339}
7340
7341#endif /* IEM_WITHOUT_ASSEMBLY */
7342
7343IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7344{
7345 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7346}
7347
7348IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7349{
7350 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7351}
7352
7353#ifdef IEM_WITHOUT_ASSEMBLY
7354
7355IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7356{
7357 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7358 pFpuRes->r80Result = *pr80Val;
7359 pFpuRes->r80Result.s.fSign = 0;
7360}
7361
7362
7363IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7364{
7365 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7366 pFpuRes->r80Result = *pr80Val;
7367 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7368}
7369
7370
7371IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7372{
7373 uint16_t const fFcw = pFpuState->FCW;
7374 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7375
7376 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7377 {
7378 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7379 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7380
7381 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7382 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7383 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7384 }
7385 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7386 {
7387 fFsw |= X86_FSW_ZE;
7388 if (fFcw & X86_FCW_ZM)
7389 {
7390 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7391 pFpuResTwo->r80Result2 = *pr80Val;
7392 }
7393 else
7394 {
7395 pFpuResTwo->r80Result2 = *pr80Val;
7396 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7397 }
7398 }
7399 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7400 {
7401 fFsw |= X86_FSW_DE;
7402 if (fFcw & X86_FCW_DM)
7403 {
7404 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7405 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7406 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7407 int32_t iExponent = -16382;
7408 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7409 {
7410 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7411 iExponent--;
7412 }
7413
7414 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7415 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7416 }
7417 else
7418 {
7419 pFpuResTwo->r80Result2 = *pr80Val;
7420 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7421 }
7422 }
7423 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7424 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7425 {
7426 pFpuResTwo->r80Result1 = *pr80Val;
7427 pFpuResTwo->r80Result2 = *pr80Val;
7428 }
7429 else if (RTFLOAT80U_IS_INF(pr80Val))
7430 {
7431 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7432 pFpuResTwo->r80Result2 = *pr80Val;
7433 }
7434 else
7435 {
7436 if (fFcw & X86_FCW_IM)
7437 {
7438 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7439 pFpuResTwo->r80Result1 = g_r80Indefinite;
7440 else
7441 {
7442 pFpuResTwo->r80Result1 = *pr80Val;
7443 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7444 }
7445 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7446 }
7447 else
7448 {
7449 pFpuResTwo->r80Result2 = *pr80Val;
7450 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7451 }
7452 fFsw |= X86_FSW_IE;
7453 }
7454 pFpuResTwo->FSW = fFsw;
7455}
7456#endif /* IEM_WITHOUT_ASSEMBLY */
7457
7458#if defined(IEM_WITHOUT_ASSEMBLY)
7459
7460static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7461{
7462 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7463 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7464 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7465 extFloat80_t v;
7466 (void)fFcw;
7467
7468 v = extF80_ylog2x(y, x, &SoftState);
7469 iemFpuSoftF80ToIprt(pr80Result, v);
7470
7471 return fFsw;
7472}
7473
7474IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7475 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7476{
7477 uint16_t const fFcw = pFpuState->FCW;
7478 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7479
7480 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7481 {
7482 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7483
7484 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7485 if (!(fFcw & X86_FCW_PM))
7486 fFsw |= X86_FSW_ES | X86_FSW_B;
7487 }
7488 else
7489 {
7490 fFsw |= X86_FSW_IE;
7491
7492 if (!(fFcw & X86_FCW_IM))
7493 {
7494 pFpuRes->r80Result = *pr80Val2;
7495 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7496 }
7497 else
7498 {
7499 pFpuRes->r80Result = g_r80Indefinite;
7500 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7501 }
7502 }
7503
7504 pFpuRes->FSW = fFsw;
7505}
7506#endif /* IEM_WITHOUT_ASSEMBLY */
7507
7508IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7509 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7510{
7511 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7512}
7513
7514IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7515 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7516{
7517 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7518}
7519
7520#if defined(IEM_WITHOUT_ASSEMBLY)
7521
7522static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7523{
7524 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7525 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7526 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7527 extFloat80_t v;
7528 (void)fFcw;
7529
7530 v = extF80_ylog2xp1(y, x, &SoftState);
7531 iemFpuSoftF80ToIprt(pr80Result, v);
7532
7533 return fFsw;
7534}
7535
7536IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7537 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7538{
7539 uint16_t const fFcw = pFpuState->FCW;
7540 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7541
7542 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7543 {
7544 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7545
7546 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7547 if (!(fFcw & X86_FCW_PM))
7548 fFsw |= X86_FSW_ES | X86_FSW_B;
7549 }
7550 else
7551 {
7552 fFsw |= X86_FSW_IE;
7553
7554 if (!(fFcw & X86_FCW_IM))
7555 {
7556 pFpuRes->r80Result = *pr80Val2;
7557 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7558 }
7559 else
7560 {
7561 pFpuRes->r80Result = g_r80Indefinite;
7562 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7563 }
7564 }
7565
7566 pFpuRes->FSW = fFsw;
7567}
7568
7569#endif /* IEM_WITHOUT_ASSEMBLY */
7570
7571IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7572 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7573{
7574 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7575}
7576
7577IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7578 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7579{
7580 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7581}
7582
7583
7584/*********************************************************************************************************************************
7585* MMX, SSE & AVX *
7586*********************************************************************************************************************************/
7587
7588#ifdef IEM_WITH_VEX
7589
7590/*
7591 * VMOVSLDUP
7592 */
7593IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7594{
7595 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7596 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7597 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7598 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7599 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7600 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7601 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7602 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7603}
7604
7605
7606IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7607{
7608 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7609 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7610 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7611 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7612 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7613 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7614 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7615 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7616}
7617
7618#endif /* IEM_WITH_VEX */
7619
7620
7621#ifdef IEM_WITH_VEX
7622
7623/*
7624 * VMOVSHDUP
7625 */
7626IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7627{
7628 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7629 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7630 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7631 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7632 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7633 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7634 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7635 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7636}
7637
7638
7639IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7640{
7641 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7642 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7643 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7644 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7645 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7646 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7647 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7648 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7649}
7650
7651#endif /* IEM_WITH_VEX */
7652
7653
7654#ifdef IEM_WITH_VEX
7655
7656/*
7657 * VMOVDDUP
7658 */
7659IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7660{
7661 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7662 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7663 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7664 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7665}
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7668{
7669 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7670 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7671 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7672 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7673}
7674
7675#endif /* IEM_WITH_VEX */
7676
7677
7678/*
7679 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7680 */
7681#ifdef IEM_WITHOUT_ASSEMBLY
7682
7683IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7684{
7685 RT_NOREF(pFpuState);
7686 *puDst &= *puSrc;
7687}
7688
7689
7690IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7691{
7692 RT_NOREF(pFpuState);
7693 puDst->au64[0] &= puSrc->au64[0];
7694 puDst->au64[1] &= puSrc->au64[1];
7695}
7696
7697#endif
7698
7699IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7700 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7701{
7702 RT_NOREF(pExtState);
7703 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7704 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7705}
7706
7707
7708IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7709 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7710{
7711 RT_NOREF(pExtState);
7712 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7713 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7714 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7715 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7716}
7717
7718
7719/*
7720 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7721 */
7722#ifdef IEM_WITHOUT_ASSEMBLY
7723
7724IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7725{
7726 RT_NOREF(pFpuState);
7727 *puDst = ~*puDst & *puSrc;
7728}
7729
7730
7731IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7732{
7733 RT_NOREF(pFpuState);
7734 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7735 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7736}
7737
7738#endif
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7745 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7746}
7747
7748
7749IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7750 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7751{
7752 RT_NOREF(pExtState);
7753 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7754 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7755 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7756 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7757}
7758
7759
7760/*
7761 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7762 */
7763#ifdef IEM_WITHOUT_ASSEMBLY
7764
7765IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7766{
7767 RT_NOREF(pFpuState);
7768 *puDst |= *puSrc;
7769}
7770
7771
7772IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7773{
7774 RT_NOREF(pFpuState);
7775 puDst->au64[0] |= puSrc->au64[0];
7776 puDst->au64[1] |= puSrc->au64[1];
7777}
7778
7779#endif
7780
7781IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7783{
7784 RT_NOREF(pExtState);
7785 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7786 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7787}
7788
7789
7790IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7791 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7792{
7793 RT_NOREF(pExtState);
7794 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7795 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7796 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7797 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7798}
7799
7800
7801/*
7802 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7803 */
7804#ifdef IEM_WITHOUT_ASSEMBLY
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7807{
7808 RT_NOREF(pFpuState);
7809 *puDst ^= *puSrc;
7810}
7811
7812
7813IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7814{
7815 RT_NOREF(pFpuState);
7816 puDst->au64[0] ^= puSrc->au64[0];
7817 puDst->au64[1] ^= puSrc->au64[1];
7818}
7819
7820#endif
7821
7822IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7823 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7824{
7825 RT_NOREF(pExtState);
7826 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7827 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7828}
7829
7830
7831IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7832 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7833{
7834 RT_NOREF(pExtState);
7835 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7836 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7837 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7838 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7839}
7840
7841
7842/*
7843 * PCMPEQB / VPCMPEQB
7844 */
7845#ifdef IEM_WITHOUT_ASSEMBLY
7846
7847IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7848{
7849 RT_NOREF(pFpuState);
7850 RTUINT64U uSrc1 = { *puDst };
7851 RTUINT64U uSrc2 = { *puSrc };
7852 RTUINT64U uDst;
7853 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7854 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7855 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7856 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7857 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7858 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7859 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7860 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7861 *puDst = uDst.u;
7862}
7863
7864
7865IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7866{
7867 RT_NOREF(pFpuState);
7868 RTUINT128U uSrc1 = *puDst;
7869 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7870 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7871 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7872 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7873 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7874 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7875 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7876 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7877 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7878 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7879 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7880 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7881 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7882 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7883 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7884 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7885}
7886
7887#endif
7888
7889IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7890 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7891{
7892 RT_NOREF(pExtState);
7893 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7894 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7895 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7896 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7897 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7898 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7899 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7900 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7901 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7902 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7903 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7904 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7905 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7906 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7907 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7908 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7909}
7910
7911IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7912 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7913{
7914 RT_NOREF(pExtState);
7915 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7916 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7917 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7918 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7919 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7920 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7921 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7922 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7923 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7924 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7925 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7926 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7927 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7928 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7929 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7930 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7931 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7932 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7933 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7934 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7935 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7936 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7937 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7938 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7939 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7940 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7941 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7942 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7943 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7944 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7945 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7946 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7947}
7948
7949
7950/*
7951 * PCMPEQW / VPCMPEQW
7952 */
7953#ifdef IEM_WITHOUT_ASSEMBLY
7954
7955IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7956{
7957 RT_NOREF(pFpuState);
7958 RTUINT64U uSrc1 = { *puDst };
7959 RTUINT64U uSrc2 = { *puSrc };
7960 RTUINT64U uDst;
7961 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7962 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7963 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7964 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7965 *puDst = uDst.u;
7966}
7967
7968
7969IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7970{
7971 RT_NOREF(pFpuState);
7972 RTUINT128U uSrc1 = *puDst;
7973 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7974 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7975 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7976 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7977 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7978 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7979 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7980 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7981}
7982
7983#endif
7984
7985IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7986 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7987{
7988 RT_NOREF(pExtState);
7989 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7990 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7991 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7992 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7993 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7994 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7995 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7996 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7997}
7998
7999IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8000 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8001{
8002 RT_NOREF(pExtState);
8003 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8004 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8005 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8006 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8007 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8008 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8009 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8010 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8011 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8012 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8013 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8014 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8015 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8016 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8017 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8018 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8019}
8020
8021
8022/*
8023 * PCMPEQD / VPCMPEQD.
8024 */
8025#ifdef IEM_WITHOUT_ASSEMBLY
8026
8027IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8028{
8029 RT_NOREF(pFpuState);
8030 RTUINT64U uSrc1 = { *puDst };
8031 RTUINT64U uSrc2 = { *puSrc };
8032 RTUINT64U uDst;
8033 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8034 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8035 *puDst = uDst.u;
8036}
8037
8038
8039IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8040{
8041 RT_NOREF(pFpuState);
8042 RTUINT128U uSrc1 = *puDst;
8043 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8044 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8045 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8046 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8047}
8048
8049#endif /* IEM_WITHOUT_ASSEMBLY */
8050
8051IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8052 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8053{
8054 RT_NOREF(pExtState);
8055 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8056 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8057 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8058 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8059}
8060
8061IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8062 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8063{
8064 RT_NOREF(pExtState);
8065 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8066 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8067 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8068 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8069 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8070 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8071 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8072 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8073}
8074
8075
8076/*
8077 * PCMPEQQ / VPCMPEQQ.
8078 */
8079IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8080{
8081 RT_NOREF(pFpuState);
8082 RTUINT128U uSrc1 = *puDst;
8083 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8084 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8085}
8086
8087IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8088 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8089{
8090 RT_NOREF(pExtState);
8091 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8092 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8093}
8094
8095IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8096 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8097{
8098 RT_NOREF(pExtState);
8099 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8100 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8101 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8102 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8103}
8104
8105
8106/*
8107 * PCMPGTB / VPCMPGTB
8108 */
8109#ifdef IEM_WITHOUT_ASSEMBLY
8110
8111IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8112{
8113 RT_NOREF(pFpuState);
8114 RTUINT64U uSrc1 = { *puDst };
8115 RTUINT64U uSrc2 = { *puSrc };
8116 RTUINT64U uDst;
8117 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8118 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8119 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8120 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8121 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8122 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8123 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8124 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8125 *puDst = uDst.u;
8126}
8127
8128
8129IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8130{
8131 RT_NOREF(pFpuState);
8132 RTUINT128U uSrc1 = *puDst;
8133 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8134 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8135 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8136 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8137 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8138 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8139 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8140 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8141 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8142 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8143 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8144 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8145 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8146 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8147 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8148 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8149}
8150
8151#endif
8152
8153IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8154 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8155{
8156 RT_NOREF(pExtState);
8157 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8158 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8159 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8160 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8161 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8162 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8163 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8164 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8165 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8166 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8167 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8168 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8169 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8170 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8171 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8172 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8173}
8174
8175IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8176 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8177{
8178 RT_NOREF(pExtState);
8179 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8180 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8181 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8182 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8183 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8184 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8185 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8186 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8187 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8188 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8189 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8190 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8191 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8192 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8193 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8194 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8195 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8196 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8197 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8198 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8199 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8200 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8201 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8202 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8203 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8204 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8205 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8206 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8207 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8208 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8209 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8210 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8211}
8212
8213
8214/*
8215 * PCMPGTW / VPCMPGTW
8216 */
8217#ifdef IEM_WITHOUT_ASSEMBLY
8218
8219IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8220{
8221 RT_NOREF(pFpuState);
8222 RTUINT64U uSrc1 = { *puDst };
8223 RTUINT64U uSrc2 = { *puSrc };
8224 RTUINT64U uDst;
8225 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8226 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8227 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8228 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8229 *puDst = uDst.u;
8230}
8231
8232
8233IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8234{
8235 RT_NOREF(pFpuState);
8236 RTUINT128U uSrc1 = *puDst;
8237 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8238 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8239 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8240 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8241 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8242 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8243 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8244 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8245}
8246
8247#endif
8248
8249IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8250 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8251{
8252 RT_NOREF(pExtState);
8253 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8254 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8255 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8256 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8257 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8258 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8259 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8260 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8261}
8262
8263IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8264 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8265{
8266 RT_NOREF(pExtState);
8267 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8268 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8269 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8270 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8271 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8272 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8273 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8274 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8275 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8276 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8277 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8278 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8279 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8280 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8281 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8282 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8283}
8284
8285
8286/*
8287 * PCMPGTD / VPCMPGTD.
8288 */
8289#ifdef IEM_WITHOUT_ASSEMBLY
8290
8291IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8292{
8293 RT_NOREF(pFpuState);
8294 RTUINT64U uSrc1 = { *puDst };
8295 RTUINT64U uSrc2 = { *puSrc };
8296 RTUINT64U uDst;
8297 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8298 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8299 *puDst = uDst.u;
8300}
8301
8302
8303IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8304{
8305 RT_NOREF(pFpuState);
8306 RTUINT128U uSrc1 = *puDst;
8307 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8308 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8309 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8310 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8311}
8312
8313#endif /* IEM_WITHOUT_ASSEMBLY */
8314
8315IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8316 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8317{
8318 RT_NOREF(pExtState);
8319 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8320 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8321 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8322 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8323}
8324
8325IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8326 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8327{
8328 RT_NOREF(pExtState);
8329 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8330 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8331 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8332 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8333 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8334 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8335 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8336 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8337}
8338
8339
8340/*
8341 * PCMPGTQ / VPCMPGTQ.
8342 */
8343IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8344{
8345 RT_NOREF(pFpuState);
8346 RTUINT128U uSrc1 = *puDst;
8347 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8348 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8349}
8350
8351IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8352 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8353{
8354 RT_NOREF(pExtState);
8355 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8356 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8357}
8358
8359IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8360 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8361{
8362 RT_NOREF(pExtState);
8363 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8364 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8365 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8366 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8367}
8368
8369
8370/*
8371 * PADDB / VPADDB
8372 */
8373#ifdef IEM_WITHOUT_ASSEMBLY
8374
8375IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8376{
8377 RT_NOREF(pFpuState);
8378 RTUINT64U uSrc1 = { *puDst };
8379 RTUINT64U uSrc2 = { *puSrc };
8380 RTUINT64U uDst;
8381 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8382 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8383 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8384 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8385 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8386 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8387 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8388 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8389 *puDst = uDst.u;
8390}
8391
8392
8393IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8394{
8395 RT_NOREF(pFpuState);
8396 RTUINT128U uSrc1 = *puDst;
8397 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8398 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8399 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8400 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8401 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8402 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8403 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8404 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8405 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8406 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8407 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8408 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8409 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8410 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8411 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8412 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8413}
8414
8415#endif
8416
8417
8418IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8419 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8420{
8421 RT_NOREF(pExtState);
8422 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8423 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8424 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8425 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8426 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8427 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8428 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8429 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8430 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8431 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8432 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8433 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8434 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8435 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8436 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8437 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8438}
8439
8440IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8441 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8442{
8443 RT_NOREF(pExtState);
8444 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8445 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8446 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8447 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8448 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8449 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8450 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8451 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8452 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8453 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8454 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8455 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8456 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8457 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8458 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8459 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8460 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8461 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8462 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8463 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8464 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8465 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8466 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8467 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8468 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8469 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8470 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8471 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8472 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8473 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8474 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8475 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8476}
8477
8478
8479/*
8480 * PADDSB / VPADDSB
8481 */
8482#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
8483 ( (uint16_t)((a_iWord) + 0x80) <= (uint16_t)0xff \
8484 ? (uint8_t)(a_iWord) \
8485 : (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) ) /* 0x7f = INT8_MAX; 0x80 = INT8_MIN; source bit 15 = sign */
8486
8487#ifdef IEM_WITHOUT_ASSEMBLY
8488
8489IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8490{
8491 RT_NOREF(pFpuState);
8492 RTUINT64U uSrc1 = { *puDst };
8493 RTUINT64U uSrc2 = { *puSrc };
8494 RTUINT64U uDst;
8495 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8496 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8497 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8498 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8499 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8500 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8501 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8502 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8503 *puDst = uDst.u;
8504}
8505
8506
8507IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8508{
8509 RT_NOREF(pFpuState);
8510 RTUINT128U uSrc1 = *puDst;
8511 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8512 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8513 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8514 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8515 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8516 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8517 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8518 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8519 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8520 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8521 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8522 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8523 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8524 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8525 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8526 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8527}
8528
8529#endif
8530
8531
8532/*
8533 * PADDSB / VPADDSB
8534 */
8535#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
8536 ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
8537 ? (uint8_t)(a_uWord) \
8538 : (uint8_t)0xff ) /* 0xff = UINT8_MAX */
8539
8540#ifdef IEM_WITHOUT_ASSEMBLY
8541
8542IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8543{
8544 RT_NOREF(pFpuState);
8545 RTUINT64U uSrc1 = { *puDst };
8546 RTUINT64U uSrc2 = { *puSrc };
8547 RTUINT64U uDst;
8548 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8549 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8550 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8551 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8552 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8553 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8554 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8555 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8556 *puDst = uDst.u;
8557}
8558
8559
8560IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8561{
8562 RT_NOREF(pFpuState);
8563 RTUINT128U uSrc1 = *puDst;
8564 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8565 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8566 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8567 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8568 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8569 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8570 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8571 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8572 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8573 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8574 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8575 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8576 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8577 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8578 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8579 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8580}
8581
8582#endif
8583
8584
8585/*
8586 * PADDW / VPADDW
8587 */
8588#ifdef IEM_WITHOUT_ASSEMBLY
8589
8590IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8591{
8592 RT_NOREF(pFpuState);
8593 RTUINT64U uSrc1 = { *puDst };
8594 RTUINT64U uSrc2 = { *puSrc };
8595 RTUINT64U uDst;
8596 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8597 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8598 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8599 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8600 *puDst = uDst.u;
8601}
8602
8603
8604IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8605{
8606 RT_NOREF(pFpuState);
8607 RTUINT128U uSrc1 = *puDst;
8608 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8609 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8610 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8611 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8612 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8613 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8614 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8615 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8616}
8617
8618#endif
8619
8620
8621IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8622 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8623{
8624 RT_NOREF(pExtState);
8625 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8626 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8627 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8628 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8629 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8630 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8631 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8632 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8633}
8634
8635IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8636 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8637{
8638 RT_NOREF(pExtState);
8639 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8640 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8641 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8642 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8643 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8644 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8645 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8646 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8647 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8648 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8649 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8650 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8651 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8652 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8653 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8654 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8655}
8656
8657
8658/*
8659 * PADDSW / VPADDSW
8660 */
8661#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
8662 ( (uint32_t)((a_iDword) + 0x8000) <= (uint16_t)0xffff \
8663 ? (uint16_t)(a_iDword) \
8664 : (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) ) /* 0x7fff = INT16_MAX; 0x8000 = INT16_MIN; source bit 31 = sign */
8665
8666#ifdef IEM_WITHOUT_ASSEMBLY
8667
8668IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8669{
8670 RT_NOREF(pFpuState);
8671 RTUINT64U uSrc1 = { *puDst };
8672 RTUINT64U uSrc2 = { *puSrc };
8673 RTUINT64U uDst;
8674 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8675 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8676 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8677 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8678 *puDst = uDst.u;
8679}
8680
8681
8682IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8683{
8684 RT_NOREF(pFpuState);
8685 RTUINT128U uSrc1 = *puDst;
8686 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8687 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8688 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8689 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8690 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8691 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8692 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8693 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8694}
8695
8696#endif
8697
8698
8699/*
8700 * PADDUSW / VPADDUSW
8701 */
8702#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
8703 ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
8704 ? (uint16_t)(a_uDword) \
8705 : (uint16_t)0xffff ) /* 0xffff = UINT16_MAX */
8706
8707#ifdef IEM_WITHOUT_ASSEMBLY
8708
8709IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8710{
8711 RT_NOREF(pFpuState);
8712 RTUINT64U uSrc1 = { *puDst };
8713 RTUINT64U uSrc2 = { *puSrc };
8714 RTUINT64U uDst;
8715 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8716 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8717 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8718 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8719 *puDst = uDst.u;
8720}
8721
8722
8723IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8724{
8725 RT_NOREF(pFpuState);
8726 RTUINT128U uSrc1 = *puDst;
8727 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8728 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8729 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8730 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8731 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8732 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8733 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8734 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8735}
8736
8737#endif
8738
8739
8740/*
8741 * PADDD / VPADDD.
8742 */
8743#ifdef IEM_WITHOUT_ASSEMBLY
8744
8745IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8746{
8747 RT_NOREF(pFpuState);
8748 RTUINT64U uSrc1 = { *puDst };
8749 RTUINT64U uSrc2 = { *puSrc };
8750 RTUINT64U uDst;
8751 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8752 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8753 *puDst = uDst.u;
8754}
8755
8756
8757IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8758{
8759 RT_NOREF(pFpuState);
8760 RTUINT128U uSrc1 = *puDst;
8761 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8762 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8763 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8764 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8765}
8766
8767#endif /* IEM_WITHOUT_ASSEMBLY */
8768
8769IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8770 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8771{
8772 RT_NOREF(pExtState);
8773 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8774 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8775 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8776 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8777}
8778
8779IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8780 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8781{
8782 RT_NOREF(pExtState);
8783 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8784 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8785 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8786 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8787 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8788 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8789 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8790 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8791}
8792
8793
8794/*
8795 * PADDQ / VPADDQ.
8796 */
8797#ifdef IEM_WITHOUT_ASSEMBLY
8798
8799IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8800{
8801 RT_NOREF(pFpuState);
8802 *puDst = *puDst + *puSrc;
8803}
8804
8805IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8806{
8807 RT_NOREF(pFpuState);
8808 RTUINT128U uSrc1 = *puDst;
8809 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8810 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8811}
8812
8813#endif
8814
8815IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8816 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8817{
8818 RT_NOREF(pExtState);
8819 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8820 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8821}
8822
8823IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8824 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8825{
8826 RT_NOREF(pExtState);
8827 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8828 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8829 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8830 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8831}
8832
8833
8834/*
8835 * PSUBB / VPSUBB
8836 */
8837#ifdef IEM_WITHOUT_ASSEMBLY
8838
8839IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8840{
8841 RT_NOREF(pFpuState);
8842 RTUINT64U uSrc1 = { *puDst };
8843 RTUINT64U uSrc2 = { *puSrc };
8844 RTUINT64U uDst;
8845 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8846 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8847 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8848 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8849 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8850 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8851 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8852 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8853 *puDst = uDst.u;
8854}
8855
8856
8857IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8858{
8859 RT_NOREF(pFpuState);
8860 RTUINT128U uSrc1 = *puDst;
8861 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8862 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8863 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8864 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8865 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8866 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8867 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8868 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8869 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8870 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8871 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8872 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8873 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8874 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8875 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8876 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8877}
8878
8879#endif
8880
8881IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8882 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8883{
8884 RT_NOREF(pExtState);
8885 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8886 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8887 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8888 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8889 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8890 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8891 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8892 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8893 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8894 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8895 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8896 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8897 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8898 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8899 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8900 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8901}
8902
8903IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8904 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8905{
8906 RT_NOREF(pExtState);
8907 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8908 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8909 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8910 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8911 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8912 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8913 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8914 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8915 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8916 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8917 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8918 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8919 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8920 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8921 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8922 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8923 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8924 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8925 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8926 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8927 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8928 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8929 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8930 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8931 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8932 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8933 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8934 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8935 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8936 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8937 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8938 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8939}
8940
8941
8942/*
8943 * PSUBSB / VSUBSB
8944 */
8945#ifdef IEM_WITHOUT_ASSEMBLY
8946
8947IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8948{
8949 RT_NOREF(pFpuState);
8950 RTUINT64U uSrc1 = { *puDst };
8951 RTUINT64U uSrc2 = { *puSrc };
8952 RTUINT64U uDst;
8953 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8954 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8955 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8956 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8957 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8958 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8959 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8960 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8961 *puDst = uDst.u;
8962}
8963
8964
8965IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8966{
8967 RT_NOREF(pFpuState);
8968 RTUINT128U uSrc1 = *puDst;
8969 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8970 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8971 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8972 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8973 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8974 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8975 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8976 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8977 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8978 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8979 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8980 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8981 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8982 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8983 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8984 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8985}
8986
8987#endif
8988
8989
8990/*
8991 * PADDSB / VPADDSB
8992 */
8993#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
8994 ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
8995 ? (uint8_t)(a_uWord) \
8996 : (uint8_t)0 )
8997
8998#ifdef IEM_WITHOUT_ASSEMBLY
8999
9000IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9001{
9002 RT_NOREF(pFpuState);
9003 RTUINT64U uSrc1 = { *puDst };
9004 RTUINT64U uSrc2 = { *puSrc };
9005 RTUINT64U uDst;
9006 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9007 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9008 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9009 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9010 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9011 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9012 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9013 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9014 *puDst = uDst.u;
9015}
9016
9017
9018IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9019{
9020 RT_NOREF(pFpuState);
9021 RTUINT128U uSrc1 = *puDst;
9022 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9023 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9024 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9025 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9026 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9027 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9028 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9029 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9030 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9031 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9032 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9033 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9034 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9035 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9036 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9037 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9038}
9039
9040#endif
9041
9042
9043/*
9044 * PSUBW / VPSUBW
9045 */
9046#ifdef IEM_WITHOUT_ASSEMBLY
9047
9048IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9049{
9050 RT_NOREF(pFpuState);
9051 RTUINT64U uSrc1 = { *puDst };
9052 RTUINT64U uSrc2 = { *puSrc };
9053 RTUINT64U uDst;
9054 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9055 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9056 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9057 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9058 *puDst = uDst.u;
9059}
9060
9061
9062IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9063{
9064 RT_NOREF(pFpuState);
9065 RTUINT128U uSrc1 = *puDst;
9066 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9067 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9068 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9069 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9070 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9071 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9072 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9073 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9074}
9075
9076#endif
9077
9078IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9079 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9080{
9081 RT_NOREF(pExtState);
9082 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9083 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9084 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9085 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9086 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9087 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9088 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9089 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9090}
9091
9092IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9093 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9094{
9095 RT_NOREF(pExtState);
9096 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9097 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9098 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9099 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9100 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9101 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9102 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9103 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9104 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9105 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9106 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9107 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9108 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9109 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9110 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9111 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9112}
9113
9114
9115/*
9116 * PSUBSW / VPSUBSW
9117 */
9118#ifdef IEM_WITHOUT_ASSEMBLY
9119
9120IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9121{
9122 RT_NOREF(pFpuState);
9123 RTUINT64U uSrc1 = { *puDst };
9124 RTUINT64U uSrc2 = { *puSrc };
9125 RTUINT64U uDst;
9126 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9127 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9128 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9129 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9130 *puDst = uDst.u;
9131}
9132
9133
9134IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9135{
9136 RT_NOREF(pFpuState);
9137 RTUINT128U uSrc1 = *puDst;
9138 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9139 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9140 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9141 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9142 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9143 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9144 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9145 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9146}
9147
9148#endif
9149
9150
9151/*
9152 * PSUBUSW / VPSUBUSW
9153 */
9154#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
9155 ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
9156 ? (uint16_t)(a_uDword) \
9157 : (uint16_t)0 )
9158
9159#ifdef IEM_WITHOUT_ASSEMBLY
9160
9161IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9162{
9163 RT_NOREF(pFpuState);
9164 RTUINT64U uSrc1 = { *puDst };
9165 RTUINT64U uSrc2 = { *puSrc };
9166 RTUINT64U uDst;
9167 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9168 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9169 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9170 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9171 *puDst = uDst.u;
9172}
9173
9174
9175IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9176{
9177 RT_NOREF(pFpuState);
9178 RTUINT128U uSrc1 = *puDst;
9179 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9180 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9181 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9182 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9183 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9184 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9185 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9186 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9187}
9188
9189#endif
9190
9191
9192/*
9193 * PSUBD / VPSUBD.
9194 */
9195#ifdef IEM_WITHOUT_ASSEMBLY
9196
9197IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9198{
9199 RT_NOREF(pFpuState);
9200 RTUINT64U uSrc1 = { *puDst };
9201 RTUINT64U uSrc2 = { *puSrc };
9202 RTUINT64U uDst;
9203 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9204 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9205 *puDst = uDst.u;
9206}
9207
9208
9209IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9210{
9211 RT_NOREF(pFpuState);
9212 RTUINT128U uSrc1 = *puDst;
9213 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9214 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9215 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9216 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9217}
9218
9219#endif /* IEM_WITHOUT_ASSEMBLY */
9220
9221IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9222 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9223{
9224 RT_NOREF(pExtState);
9225 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9226 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9227 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9228 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9229}
9230
9231IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9232 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9233{
9234 RT_NOREF(pExtState);
9235 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9236 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9237 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9238 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9239 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9240 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9241 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9242 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9243}
9244
9245
9246/*
9247 * PSUBQ / VPSUBQ.
9248 */
9249#ifdef IEM_WITHOUT_ASSEMBLY
9250
9251IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9252{
9253 RT_NOREF(pFpuState);
9254 *puDst = *puDst - *puSrc;
9255}
9256
9257IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9258{
9259 RT_NOREF(pFpuState);
9260 RTUINT128U uSrc1 = *puDst;
9261 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9262 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9263}
9264
9265#endif
9266
9267IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9268 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9269{
9270 RT_NOREF(pExtState);
9271 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9272 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9273}
9274
9275IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9276 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9277{
9278 RT_NOREF(pExtState);
9279 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9280 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9281 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9282 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9283}
9284
9285
9286
9287/*
9288 * PMULLW / VPMULLW / PMULLD / VPMULLD
9289 */
9290#ifdef IEM_WITHOUT_ASSEMBLY
9291
9292IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9293{
9294 RT_NOREF(pFpuState);
9295 RTUINT64U uSrc1 = { *puDst };
9296 RTUINT64U uSrc2 = { *puSrc };
9297 RTUINT64U uDst;
9298 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9299 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9300 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9301 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9302 *puDst = uDst.u;
9303}
9304
9305
9306IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9307{
9308 RT_NOREF(pFpuState);
9309 RTUINT128U uSrc1 = *puDst;
9310 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9311 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9312 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9313 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9314 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9315 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9316 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9317 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9318}
9319
9320#endif
9321
9322IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9323{
9324 RTUINT128U uSrc1 = *puDst;
9325
9326 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9327 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9328 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9329 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9330 RT_NOREF(pFpuState);
9331}
9332
9333
9334IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9335{
9336 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9337 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9338 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9339 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9340 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9341 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9342 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9343 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9344}
9345
9346
9347IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9348{
9349 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9350 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9351 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9352 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9353 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9354 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9355 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9356 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9357 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9358 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9359 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9360 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9361 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9362 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9363 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9364 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9365}
9366
9367
9368IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9369{
9370 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9371 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9372 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9373 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9374}
9375
9376
9377IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9378{
9379 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9380 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9381 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9382 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9383 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9384 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9385 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9386 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9387}
9388
9389
9390/*
9391 * PMULHW / VPMULHW
9392 */
9393#ifdef IEM_WITHOUT_ASSEMBLY
9394
9395IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9396{
9397 RT_NOREF(pFpuState);
9398 RTUINT64U uSrc1 = { *puDst };
9399 RTUINT64U uSrc2 = { *puSrc };
9400 RTUINT64U uDst;
9401 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9402 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9403 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9404 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9405 *puDst = uDst.u;
9406}
9407
9408
9409IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9410{
9411 RT_NOREF(pFpuState);
9412 RTUINT128U uSrc1 = *puDst;
9413 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9414 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9415 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9416 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9417 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9418 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9419 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9420 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9421}
9422
9423#endif
9424
9425IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9426{
9427 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9428 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9429 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9430 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9431 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9432 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9433 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9434 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9435}
9436
9437
9438IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9439{
9440 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9441 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9442 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9443 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9444 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9445 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9446 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9447 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9448 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9449 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9450 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9451 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9452 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9453 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9454 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9455 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9456}
9457
9458
9459/*
9460 * PMULHUW / VPMULHUW
9461 */
9462#ifdef IEM_WITHOUT_ASSEMBLY
9463
9464IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9465{
9466 RTUINT64U uSrc1 = { *puDst };
9467 RTUINT64U uSrc2 = { *puSrc };
9468 RTUINT64U uDst;
9469 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9470 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9471 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9472 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9473 *puDst = uDst.u;
9474}
9475
9476
9477IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9478{
9479 RTUINT128U uSrc1 = *puDst;
9480 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9481 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9482 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9483 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9484 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9485 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9486 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9487 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9488}
9489
9490#endif
9491
9492IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9493{
9494 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9495 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9496 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9497 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9498 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9499 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9500 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9501 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9502}
9503
9504
9505IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9506{
9507 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9508 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9509 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9510 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9511 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9512 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9513 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9514 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9515 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9516 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9517 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9518 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9519 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9520 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9521 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9522 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9523}
9524
9525
9526/*
9527 * PSRLW / VPSRLW
9528 */
9529#ifdef IEM_WITHOUT_ASSEMBLY
9530
9531IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9532{
9533 RTUINT64U uSrc1 = { *puDst };
9534 RTUINT64U uSrc2 = { *puSrc };
9535 RTUINT64U uDst;
9536
9537 if (uSrc2.au64[0] <= 15)
9538 {
9539 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9540 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9541 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9542 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9543 }
9544 else
9545 {
9546 uDst.au64[0] = 0;
9547 }
9548 *puDst = uDst.u;
9549}
9550
9551
9552IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9553{
9554 RTUINT64U uSrc1 = { *puDst };
9555 RTUINT64U uDst;
9556
9557 if (uShift <= 15)
9558 {
9559 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9560 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9561 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9562 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9563 }
9564 else
9565 {
9566 uDst.au64[0] = 0;
9567 }
9568 *puDst = uDst.u;
9569}
9570
9571
9572IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9573{
9574 RTUINT128U uSrc1 = *puDst;
9575
9576 if (puSrc->au64[0] <= 15)
9577 {
9578 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9579 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9580 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9581 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9582 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9583 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9584 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9585 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9586 }
9587 else
9588 {
9589 puDst->au64[0] = 0;
9590 puDst->au64[1] = 0;
9591 }
9592}
9593
9594IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9595{
9596 RTUINT128U uSrc1 = *puDst;
9597
9598 if (uShift <= 15)
9599 {
9600 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9601 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9602 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9603 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9604 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9605 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9606 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9607 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9608 }
9609 else
9610 {
9611 puDst->au64[0] = 0;
9612 puDst->au64[1] = 0;
9613 }
9614}
9615
9616#endif
9617
9618
9619/*
9620 * PSRAW / VPSRAW
9621 */
9622#ifdef IEM_WITHOUT_ASSEMBLY
9623
9624IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9625{
9626 RTUINT64U uSrc1 = { *puDst };
9627 RTUINT64U uSrc2 = { *puSrc };
9628 RTUINT64U uDst;
9629
9630 if (uSrc2.au64[0] <= 15)
9631 {
9632 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9633 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9634 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9635 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9636 }
9637 else
9638 {
9639 uDst.au64[0] = 0;
9640 }
9641 *puDst = uDst.u;
9642}
9643
9644
9645IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9646{
9647 RTUINT64U uSrc1 = { *puDst };
9648 RTUINT64U uDst;
9649
9650 if (uShift <= 15)
9651 {
9652 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9653 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9654 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9655 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9656 }
9657 else
9658 {
9659 uDst.au64[0] = 0;
9660 }
9661 *puDst = uDst.u;
9662}
9663
9664
9665IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9666{
9667 RTUINT128U uSrc1 = *puDst;
9668
9669 if (puSrc->au64[0] <= 15)
9670 {
9671 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9672 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9673 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9674 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9675 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9676 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9677 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9678 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9679 }
9680 else
9681 {
9682 puDst->au64[0] = 0;
9683 puDst->au64[1] = 0;
9684 }
9685}
9686
9687IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9688{
9689 RTUINT128U uSrc1 = *puDst;
9690
9691 if (uShift <= 15)
9692 {
9693 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9694 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9695 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9696 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9697 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9698 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9699 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9700 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9701 }
9702 else
9703 {
9704 puDst->au64[0] = 0;
9705 puDst->au64[1] = 0;
9706 }
9707}
9708
9709#endif
9710
9711
9712/*
9713 * PSLLW / VPSLLW
9714 */
9715#ifdef IEM_WITHOUT_ASSEMBLY
9716
9717IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9718{
9719 RTUINT64U uSrc1 = { *puDst };
9720 RTUINT64U uSrc2 = { *puSrc };
9721 RTUINT64U uDst;
9722
9723 if (uSrc2.au64[0] <= 15)
9724 {
9725 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9726 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9727 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9728 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9729 }
9730 else
9731 {
9732 uDst.au64[0] = 0;
9733 }
9734 *puDst = uDst.u;
9735}
9736
9737
9738IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9739{
9740 RTUINT64U uSrc1 = { *puDst };
9741 RTUINT64U uDst;
9742
9743 if (uShift <= 15)
9744 {
9745 uDst.au16[0] = uSrc1.au16[0] << uShift;
9746 uDst.au16[1] = uSrc1.au16[1] << uShift;
9747 uDst.au16[2] = uSrc1.au16[2] << uShift;
9748 uDst.au16[3] = uSrc1.au16[3] << uShift;
9749 }
9750 else
9751 {
9752 uDst.au64[0] = 0;
9753 }
9754 *puDst = uDst.u;
9755}
9756
9757
9758IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9759{
9760 RTUINT128U uSrc1 = *puDst;
9761
9762 if (puSrc->au64[0] <= 15)
9763 {
9764 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9765 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9766 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9767 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9768 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9769 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9770 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9771 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9772 }
9773 else
9774 {
9775 puDst->au64[0] = 0;
9776 puDst->au64[1] = 0;
9777 }
9778}
9779
9780IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9781{
9782 RTUINT128U uSrc1 = *puDst;
9783
9784 if (uShift <= 15)
9785 {
9786 puDst->au16[0] = uSrc1.au16[0] << uShift;
9787 puDst->au16[1] = uSrc1.au16[1] << uShift;
9788 puDst->au16[2] = uSrc1.au16[2] << uShift;
9789 puDst->au16[3] = uSrc1.au16[3] << uShift;
9790 puDst->au16[4] = uSrc1.au16[4] << uShift;
9791 puDst->au16[5] = uSrc1.au16[5] << uShift;
9792 puDst->au16[6] = uSrc1.au16[6] << uShift;
9793 puDst->au16[7] = uSrc1.au16[7] << uShift;
9794 }
9795 else
9796 {
9797 puDst->au64[0] = 0;
9798 puDst->au64[1] = 0;
9799 }
9800}
9801
9802#endif
9803
9804
9805/*
9806 * PSRLD / VPSRLD
9807 */
9808#ifdef IEM_WITHOUT_ASSEMBLY
9809
9810IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9811{
9812 RTUINT64U uSrc1 = { *puDst };
9813 RTUINT64U uSrc2 = { *puSrc };
9814 RTUINT64U uDst;
9815
9816 if (uSrc2.au64[0] <= 31)
9817 {
9818 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9819 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9820 }
9821 else
9822 {
9823 uDst.au64[0] = 0;
9824 }
9825 *puDst = uDst.u;
9826}
9827
9828
9829IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9830{
9831 RTUINT64U uSrc1 = { *puDst };
9832 RTUINT64U uDst;
9833
9834 if (uShift <= 31)
9835 {
9836 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9837 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9838 }
9839 else
9840 {
9841 uDst.au64[0] = 0;
9842 }
9843 *puDst = uDst.u;
9844}
9845
9846
9847IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9848{
9849 RTUINT128U uSrc1 = *puDst;
9850
9851 if (puSrc->au64[0] <= 31)
9852 {
9853 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9854 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9855 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9856 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9857 }
9858 else
9859 {
9860 puDst->au64[0] = 0;
9861 puDst->au64[1] = 0;
9862 }
9863}
9864
9865IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9866{
9867 RTUINT128U uSrc1 = *puDst;
9868
9869 if (uShift <= 31)
9870 {
9871 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9872 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9873 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9874 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9875 }
9876 else
9877 {
9878 puDst->au64[0] = 0;
9879 puDst->au64[1] = 0;
9880 }
9881}
9882
9883#endif
9884
9885
9886/*
9887 * PSRAD / VPSRAD
9888 */
9889#ifdef IEM_WITHOUT_ASSEMBLY
9890
9891IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9892{
9893 RTUINT64U uSrc1 = { *puDst };
9894 RTUINT64U uSrc2 = { *puSrc };
9895 RTUINT64U uDst;
9896
9897 if (uSrc2.au64[0] <= 31)
9898 {
9899 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9900 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9901 }
9902 else
9903 {
9904 uDst.au64[0] = 0;
9905 }
9906 *puDst = uDst.u;
9907}
9908
9909
9910IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9911{
9912 RTUINT64U uSrc1 = { *puDst };
9913 RTUINT64U uDst;
9914
9915 if (uShift <= 31)
9916 {
9917 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9918 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9919 }
9920 else
9921 {
9922 uDst.au64[0] = 0;
9923 }
9924 *puDst = uDst.u;
9925}
9926
9927
9928IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9929{
9930 RTUINT128U uSrc1 = *puDst;
9931
9932 if (puSrc->au64[0] <= 31)
9933 {
9934 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9935 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9936 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9937 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9938 }
9939 else
9940 {
9941 puDst->au64[0] = 0;
9942 puDst->au64[1] = 0;
9943 }
9944}
9945
9946IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9947{
9948 RTUINT128U uSrc1 = *puDst;
9949
9950 if (uShift <= 31)
9951 {
9952 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9953 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9954 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9955 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9956 }
9957 else
9958 {
9959 puDst->au64[0] = 0;
9960 puDst->au64[1] = 0;
9961 }
9962}
9963
9964#endif
9965
9966
9967/*
9968 * PSLLD / VPSLLD
9969 */
9970#ifdef IEM_WITHOUT_ASSEMBLY
9971
9972IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9973{
9974 RTUINT64U uSrc1 = { *puDst };
9975 RTUINT64U uSrc2 = { *puSrc };
9976 RTUINT64U uDst;
9977
9978 if (uSrc2.au64[0] <= 31)
9979 {
9980 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9981 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9982 }
9983 else
9984 {
9985 uDst.au64[0] = 0;
9986 }
9987 *puDst = uDst.u;
9988}
9989
9990
9991IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9992{
9993 RTUINT64U uSrc1 = { *puDst };
9994 RTUINT64U uDst;
9995
9996 if (uShift <= 31)
9997 {
9998 uDst.au32[0] = uSrc1.au32[0] << uShift;
9999 uDst.au32[1] = uSrc1.au32[1] << uShift;
10000 }
10001 else
10002 {
10003 uDst.au64[0] = 0;
10004 }
10005 *puDst = uDst.u;
10006}
10007
10008
10009IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10010{
10011 RTUINT128U uSrc1 = *puDst;
10012
10013 if (puSrc->au64[0] <= 31)
10014 {
10015 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10016 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10017 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10018 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10019 }
10020 else
10021 {
10022 puDst->au64[0] = 0;
10023 puDst->au64[1] = 0;
10024 }
10025}
10026
10027IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10028{
10029 RTUINT128U uSrc1 = *puDst;
10030
10031 if (uShift <= 31)
10032 {
10033 puDst->au32[0] = uSrc1.au32[0] << uShift;
10034 puDst->au32[1] = uSrc1.au32[1] << uShift;
10035 puDst->au32[2] = uSrc1.au32[2] << uShift;
10036 puDst->au32[3] = uSrc1.au32[3] << uShift;
10037 }
10038 else
10039 {
10040 puDst->au64[0] = 0;
10041 puDst->au64[1] = 0;
10042 }
10043}
10044
10045#endif
10046
10047
10048/*
10049 * PSRLQ / VPSRLQ
10050 */
10051#ifdef IEM_WITHOUT_ASSEMBLY
10052
10053IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10054{
10055 RTUINT64U uSrc1 = { *puDst };
10056 RTUINT64U uSrc2 = { *puSrc };
10057 RTUINT64U uDst;
10058
10059 if (uSrc2.au64[0] <= 63)
10060 {
10061 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10062 }
10063 else
10064 {
10065 uDst.au64[0] = 0;
10066 }
10067 *puDst = uDst.u;
10068}
10069
10070
10071IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10072{
10073 RTUINT64U uSrc1 = { *puDst };
10074 RTUINT64U uDst;
10075
10076 if (uShift <= 63)
10077 {
10078 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10079 }
10080 else
10081 {
10082 uDst.au64[0] = 0;
10083 }
10084 *puDst = uDst.u;
10085}
10086
10087
10088IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10089{
10090 RTUINT128U uSrc1 = *puDst;
10091
10092 if (puSrc->au64[0] <= 63)
10093 {
10094 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10095 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10096 }
10097 else
10098 {
10099 puDst->au64[0] = 0;
10100 puDst->au64[1] = 0;
10101 }
10102}
10103
10104IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10105{
10106 RTUINT128U uSrc1 = *puDst;
10107
10108 if (uShift <= 63)
10109 {
10110 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10111 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10112 }
10113 else
10114 {
10115 puDst->au64[0] = 0;
10116 puDst->au64[1] = 0;
10117 }
10118}
10119
10120#endif
10121
10122
10123/*
10124 * PSLLQ / VPSLLQ
10125 */
10126#ifdef IEM_WITHOUT_ASSEMBLY
10127
10128IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10129{
10130 RTUINT64U uSrc1 = { *puDst };
10131 RTUINT64U uSrc2 = { *puSrc };
10132 RTUINT64U uDst;
10133
10134 if (uSrc2.au64[0] <= 63)
10135 {
10136 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10137 }
10138 else
10139 {
10140 uDst.au64[0] = 0;
10141 }
10142 *puDst = uDst.u;
10143}
10144
10145
10146IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10147{
10148 RTUINT64U uSrc1 = { *puDst };
10149 RTUINT64U uDst;
10150
10151 if (uShift <= 63)
10152 {
10153 uDst.au64[0] = uSrc1.au64[0] << uShift;
10154 }
10155 else
10156 {
10157 uDst.au64[0] = 0;
10158 }
10159 *puDst = uDst.u;
10160}
10161
10162
10163IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10164{
10165 RTUINT128U uSrc1 = *puDst;
10166
10167 if (puSrc->au64[0] <= 63)
10168 {
10169 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10170 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10171 }
10172 else
10173 {
10174 puDst->au64[0] = 0;
10175 puDst->au64[1] = 0;
10176 }
10177}
10178
10179IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10180{
10181 RTUINT128U uSrc1 = *puDst;
10182
10183 if (uShift <= 63)
10184 {
10185 puDst->au64[0] = uSrc1.au64[0] << uShift;
10186 puDst->au64[1] = uSrc1.au64[1] << uShift;
10187 }
10188 else
10189 {
10190 puDst->au64[0] = 0;
10191 puDst->au64[1] = 0;
10192 }
10193}
10194
10195#endif
10196
10197
10198/*
10199 * PSRLDQ / VPSRLDQ
10200 */
10201#ifdef IEM_WITHOUT_ASSEMBLY
10202
10203IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10204{
10205 RTUINT128U uSrc1 = *puDst;
10206
10207 if (uShift < 16)
10208 {
10209 int i;
10210
10211 for (i = 0; i < 16 - uShift; ++i)
10212 puDst->au8[i] = uSrc1.au8[i + uShift];
10213 for (i = 16 - uShift; i < 16; ++i)
10214 puDst->au8[i] = 0;
10215 }
10216 else
10217 {
10218 puDst->au64[0] = 0;
10219 puDst->au64[1] = 0;
10220 }
10221}
10222
10223#endif
10224
10225
10226/*
10227 * PSLLDQ / VPSLLDQ
10228 */
10229#ifdef IEM_WITHOUT_ASSEMBLY
10230
10231IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10232{
10233 RTUINT128U uSrc1 = *puDst;
10234
10235 if (uShift < 16)
10236 {
10237 int i;
10238
10239 for (i = 0; i < uShift; ++i)
10240 puDst->au8[i] = 0;
10241 for (i = uShift; i < 16; ++i)
10242 puDst->au8[i] = uSrc1.au8[i - uShift];
10243 }
10244 else
10245 {
10246 puDst->au64[0] = 0;
10247 puDst->au64[1] = 0;
10248 }
10249}
10250
10251#endif
10252
10253
10254/*
10255 * PMADDWD / VPMADDWD
10256 */
10257#ifdef IEM_WITHOUT_ASSEMBLY
10258
10259IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10260{
10261 RTUINT64U uSrc1 = { *puDst };
10262 RTUINT64U uSrc2 = { *puSrc };
10263 RTUINT64U uDst;
10264
10265 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10266 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10267 *puDst = uDst.u;
10268 RT_NOREF(pFpuState);
10269}
10270
10271
10272IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10273{
10274 RTUINT128U uSrc1 = *puDst;
10275
10276 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10277 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10278 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10279 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10280 RT_NOREF(pFpuState);
10281}
10282
10283#endif
10284
10285
10286/*
10287 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10288 */
10289#ifdef IEM_WITHOUT_ASSEMBLY
10290
10291IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10292{
10293 RTUINT64U uSrc1 = { *puDst };
10294 RTUINT64U uSrc2 = { *puSrc };
10295 RTUINT64U uDst;
10296
10297 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10298 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10299 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10300 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10301 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10302 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10303 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10304 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10305 *puDst = uDst.u;
10306 RT_NOREF(pFpuState);
10307}
10308
10309
10310IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10311{
10312 RTUINT128U uSrc1 = *puDst;
10313
10314 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10315 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10316 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10317 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10318 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10319 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10320 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10321 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10322 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10323 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10324 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10325 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10326 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10327 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10328 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10329 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10330 RT_NOREF(pFpuState);
10331}
10332
10333#endif
10334
10335
10336IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10337{
10338 RTUINT128U uSrc1 = *puDst;
10339
10340 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10341 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10342 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10343 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10344 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10345 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10346 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10347 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10348 RT_NOREF(pFpuState);
10349}
10350
10351
10352IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10353{
10354 RTUINT128U uSrc1 = *puDst;
10355
10356 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10357 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10358 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10359 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10360 RT_NOREF(pFpuState);
10361}
10362
10363
10364IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10365 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10366{
10367 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10368 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10369 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10370 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10371 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10372 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10373 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10374 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10375 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10376 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10377 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10378 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10379 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10380 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10381 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10382 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10383 RT_NOREF(pExtState);
10384}
10385
10386
10387IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10388 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10389{
10390 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10391 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10392 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10393 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10394 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10395 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10396 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10397 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10398 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10399 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10400 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10401 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10402 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10403 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10404 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10405 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10406 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10407 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10408 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10409 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10410 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10411 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10412 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10413 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10414 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10415 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10416 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10417 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10418 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10419 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10420 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10421 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10422 RT_NOREF(pExtState);
10423}
10424
10425
10426IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10427 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10428{
10429 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10430 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10431 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10432 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10433 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10434 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10435 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10436 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10437 RT_NOREF(pExtState);
10438}
10439
10440
10441IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10442 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10443{
10444 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10445 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10446 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10447 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10448 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10449 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10450 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10451 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10452 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10453 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10454 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10455 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10456 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10457 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10458 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10459 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10460 RT_NOREF(pExtState);
10461}
10462
10463
10464IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10465 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10466{
10467 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10468 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10469 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10470 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10471 RT_NOREF(pExtState);
10472}
10473
10474
10475IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10476 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10477{
10478 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10479 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10480 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10481 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10482 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10483 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10484 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10485 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10486 RT_NOREF(pExtState);
10487}
10488
10489
10490/*
10491 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10492 */
10493#ifdef IEM_WITHOUT_ASSEMBLY
10494
10495IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10496{
10497 RTUINT64U uSrc1 = { *puDst };
10498 RTUINT64U uSrc2 = { *puSrc };
10499 RTUINT64U uDst;
10500
10501 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10502 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10503 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10504 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10505 *puDst = uDst.u;
10506 RT_NOREF(pFpuState);
10507}
10508
10509
10510IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10511{
10512 RTUINT128U uSrc1 = *puDst;
10513
10514 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10515 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10516 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10517 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10518 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10519 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10520 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10521 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10522 RT_NOREF(pFpuState);
10523}
10524
10525#endif
10526
10527IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10528{
10529 RTUINT128U uSrc1 = *puDst;
10530
10531 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10532 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10533 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10534 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10535 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10536 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10537 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10538 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10539 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10540 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10541 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10542 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10543 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10544 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10545 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10546 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10547 RT_NOREF(pFpuState);
10548}
10549
10550
10551IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10552{
10553 RTUINT128U uSrc1 = *puDst;
10554
10555 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10556 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10557 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10558 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10559 RT_NOREF(pFpuState);
10560}
10561
10562
10563IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10564 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10565{
10566 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10567 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10568 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10569 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10570 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10571 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10572 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10573 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10574 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10575 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10576 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10577 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10578 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10579 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10580 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10581 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10582 RT_NOREF(pExtState);
10583}
10584
10585
10586IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10587 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10588{
10589 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10590 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10591 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10592 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10593 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10594 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10595 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10596 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10597 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10598 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10599 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10600 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10601 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10602 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10603 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10604 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10605 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10606 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10607 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10608 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10609 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10610 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10611 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10612 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10613 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10614 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10615 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10616 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10617 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10618 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10619 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10620 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10621 RT_NOREF(pExtState);
10622}
10623
10624
10625IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10626 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10627{
10628 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10629 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10630 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10631 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10632 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10633 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10634 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10635 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10636 RT_NOREF(pExtState);
10637}
10638
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10641 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10642{
10643 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10644 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10645 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10646 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10647 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10648 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10649 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10650 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10651 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10652 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10653 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10654 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10655 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10656 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10657 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10658 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10659 RT_NOREF(pExtState);
10660}
10661
10662
10663IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10664 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10665{
10666 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10667 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10668 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10669 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10670 RT_NOREF(pExtState);
10671}
10672
10673
10674IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10675 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10676{
10677 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10678 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10679 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10680 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10681 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10682 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10683 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10684 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10685 RT_NOREF(pExtState);
10686}
10687
10688
10689/*
10690 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10691 */
10692#ifdef IEM_WITHOUT_ASSEMBLY
10693
10694IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10695{
10696 RTUINT64U uSrc1 = { *puDst };
10697 RTUINT64U uSrc2 = { *puSrc };
10698 RTUINT64U uDst;
10699
10700 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10701 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10702 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10703 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10704 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10705 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10706 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10707 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10708 *puDst = uDst.u;
10709 RT_NOREF(pFpuState);
10710}
10711
10712
10713IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10714{
10715 RTUINT128U uSrc1 = *puDst;
10716
10717 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10718 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10719 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10720 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10721 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10722 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10723 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10724 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10725 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10726 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10727 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10728 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10729 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10730 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10731 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10732 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10733 RT_NOREF(pFpuState);
10734}
10735
10736#endif
10737
10738IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10739{
10740 RTUINT128U uSrc1 = *puDst;
10741
10742 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10743 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10744 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10745 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10746 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10747 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10748 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10749 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10750 RT_NOREF(pFpuState);
10751}
10752
10753
10754IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10755{
10756 RTUINT128U uSrc1 = *puDst;
10757
10758 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10759 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10760 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10761 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10762 RT_NOREF(pFpuState);
10763}
10764
10765
10766IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10767 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10768{
10769 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10770 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10771 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10772 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10773 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10774 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10775 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10776 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10777 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10778 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10779 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10780 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10781 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10782 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10783 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10784 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10785 RT_NOREF(pExtState);
10786}
10787
10788
10789IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10790 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10791{
10792 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10793 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10794 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10795 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10796 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10797 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10798 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10799 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10800 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10801 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10802 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10803 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10804 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10805 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10806 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10807 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10808 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10809 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10810 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10811 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10812 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10813 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10814 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10815 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10816 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10817 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10818 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10819 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10820 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10821 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10822 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10823 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10824 RT_NOREF(pExtState);
10825}
10826
10827
10828IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10829 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10830{
10831 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10832 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10833 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10834 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10835 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10836 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10837 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10838 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10839 RT_NOREF(pExtState);
10840}
10841
10842
10843IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10844 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10845{
10846 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10847 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10848 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10849 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10850 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10851 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10852 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10853 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10854 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10855 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10856 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10857 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10858 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10859 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10860 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10861 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10862 RT_NOREF(pExtState);
10863}
10864
10865
10866IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10867 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10868{
10869 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10870 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10871 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10872 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10873 RT_NOREF(pExtState);
10874}
10875
10876
10877IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10878 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10879{
10880 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10881 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10882 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10883 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10884 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10885 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10886 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10887 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10888 RT_NOREF(pExtState);
10889}
10890
10891
10892/*
10893 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10894 */
10895#ifdef IEM_WITHOUT_ASSEMBLY
10896
10897IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10898{
10899 RTUINT64U uSrc1 = { *puDst };
10900 RTUINT64U uSrc2 = { *puSrc };
10901 RTUINT64U uDst;
10902
10903 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10904 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10905 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10906 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10907 *puDst = uDst.u;
10908 RT_NOREF(pFpuState);
10909}
10910
10911
10912IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10913{
10914 RTUINT128U uSrc1 = *puDst;
10915
10916 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10917 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10918 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10919 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10920 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10921 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10922 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10923 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10924 RT_NOREF(pFpuState);
10925}
10926
10927#endif
10928
10929IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10930{
10931 RTUINT128U uSrc1 = *puDst;
10932
10933 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10934 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10935 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10936 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10937 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10938 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10939 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10940 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10941 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10942 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10943 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10944 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10945 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10946 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10947 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10948 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10949 RT_NOREF(pFpuState);
10950}
10951
10952
10953IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10954{
10955 RTUINT128U uSrc1 = *puDst;
10956
10957 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10958 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10959 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10960 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10961 RT_NOREF(pFpuState);
10962}
10963
10964
10965IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10966 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10967{
10968 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10969 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10970 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10971 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10972 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10973 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10974 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10975 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10976 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10977 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10978 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10979 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10980 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10981 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10982 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10983 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10984 RT_NOREF(pExtState);
10985}
10986
10987
10988IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10989 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10990{
10991 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10992 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10993 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10994 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10995 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10996 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10997 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10998 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10999 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11000 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11001 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11002 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11003 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11004 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11005 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11006 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11007 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11008 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11009 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11010 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11011 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11012 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11013 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11014 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11015 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11016 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11017 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11018 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11019 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11020 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11021 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11022 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11023 RT_NOREF(pExtState);
11024}
11025
11026
11027IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11028 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11029{
11030 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11031 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11032 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11033 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11034 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11035 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11036 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11037 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11038 RT_NOREF(pExtState);
11039}
11040
11041
11042IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11043 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11044{
11045 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11046 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11047 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11048 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11049 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11050 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11051 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11052 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11053 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11054 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11055 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11056 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11057 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11058 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11059 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11060 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11061 RT_NOREF(pExtState);
11062}
11063
11064
11065IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11066 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11067{
11068 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11069 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11070 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11071 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11072 RT_NOREF(pExtState);
11073}
11074
11075
11076IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11077 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11078{
11079 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11080 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11081 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11082 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11083 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11084 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11085 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11086 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11087 RT_NOREF(pExtState);
11088}
11089
11090
11091/*
11092 * PAVGB / VPAVGB / PAVGW / VPAVGW
11093 */
11094#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
11095#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
11096
11097#ifdef IEM_WITHOUT_ASSEMBLY
11098
11099IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11100{
11101 RTUINT64U uSrc1 = { *puDst };
11102 RTUINT64U uSrc2 = { *puSrc };
11103 RTUINT64U uDst;
11104
11105 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11106 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11107 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11108 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11109 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11110 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11111 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11112 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11113 *puDst = uDst.u;
11114}
11115
11116
11117IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11118{
11119 RTUINT128U uSrc1 = *puDst;
11120
11121 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11122 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11123 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11124 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11125 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11126 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11127 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11128 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11129 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11130 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11131 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11132 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11133 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11134 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11135 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11136 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11137}
11138
11139
11140IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11141{
11142 RTUINT64U uSrc1 = { *puDst };
11143 RTUINT64U uSrc2 = { *puSrc };
11144 RTUINT64U uDst;
11145
11146 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11147 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11148 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11149 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11150 *puDst = uDst.u;
11151}
11152
11153
11154IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11155{
11156 RTUINT128U uSrc1 = *puDst;
11157
11158 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11159 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11160 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11161 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11162 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11163 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11164 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11165 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11166}
11167
11168#endif
11169
11170IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11171{
11172 RTUINT128U uSrc1 = *puDst;
11173
11174 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11175 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11176 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11177 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11178 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11179 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11180 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11181 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11182 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11183 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11184 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11185 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11186 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11187 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11188 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11189 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11190}
11191
11192
11193IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11194{
11195 RTUINT128U uSrc1 = *puDst;
11196
11197 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11198 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11199 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11200 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11201 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11202 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11203 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11204 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11205 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11206 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11207 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11208 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11209 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11210 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11211 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11212 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11213}
11214
11215
11216IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11217{
11218 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11219 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11220 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11221 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11222 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11223 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11224 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11225 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11226 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11227 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11228 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11229 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11230 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11231 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11232 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11233 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11234}
11235
11236
11237IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11238{
11239 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11240 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11241 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11242 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11243 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11244 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11245 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11246 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11247 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11248 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11249 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11250 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11251 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11252 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11253 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11254 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11255 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11256 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11257 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11258 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11259 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11260 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11261 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11262 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11263 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11264 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11265 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11266 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11267 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11268 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11269 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11270 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11271}
11272
11273
11274IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11275{
11276 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11277 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11278 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11279 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11280 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11281 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11282 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11283 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11284}
11285
11286
11287IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11288{
11289 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11290 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11291 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11292 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11293 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11294 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11295 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11296 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11297 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11298 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11299 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11300 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11301 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11302 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11303 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11304 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11305}
11306
11307#undef PAVGB_EXEC
11308#undef PAVGW_EXEC
11309
11310
11311/*
11312 * PMOVMSKB / VPMOVMSKB
11313 */
11314#ifdef IEM_WITHOUT_ASSEMBLY
11315
11316IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11317{
11318 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11319 uint64_t const uSrc = *pu64Src;
11320 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11321 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11322 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11323 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11324 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11325 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11326 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11327 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11328}
11329
11330
11331IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11332{
11333 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11334 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11335 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11336 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11337 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11338 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11339 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11340 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11341 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11342 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11343 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11344 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11345 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11346 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11347 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11348 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11349 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11350 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11351 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11352}
11353
11354#endif
11355
11356IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11357{
11358 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11359 uint64_t const uSrc0 = puSrc->QWords.qw0;
11360 uint64_t const uSrc1 = puSrc->QWords.qw1;
11361 uint64_t const uSrc2 = puSrc->QWords.qw2;
11362 uint64_t const uSrc3 = puSrc->QWords.qw3;
11363 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11364 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11365 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11366 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11367 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11368 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11369 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11370 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11371 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11372 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11373 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11374 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11375 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11376 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11377 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11378 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11379 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11380 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11381 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11382 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11383 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11384 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11385 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11386 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11387 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11388 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11389 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11390 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11391 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11392 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11393 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11394 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11395}
11396
11397
11398/*
11399 * [V]PSHUFB
11400 */
11401
11402IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11403{
11404 RTUINT64U const uSrc = { *puSrc };
11405 RTUINT64U const uDstIn = { *puDst };
11406 ASMCompilerBarrier();
11407 RTUINT64U uDstOut = { 0 };
11408 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11409 {
11410 uint8_t idxSrc = uSrc.au8[iByte];
11411 if (!(idxSrc & 0x80))
11412 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11413 }
11414 *puDst = uDstOut.u;
11415 RT_NOREF(pFpuState);
11416}
11417
11418
11419IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11420{
11421 RTUINT128U const uSrc = *puSrc;
11422 RTUINT128U const uDstIn = *puDst;
11423 ASMCompilerBarrier();
11424 puDst->au64[0] = 0;
11425 puDst->au64[1] = 0;
11426 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11427 {
11428 uint8_t idxSrc = uSrc.au8[iByte];
11429 if (!(idxSrc & 0x80))
11430 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11431 }
11432 RT_NOREF(pFpuState);
11433}
11434
11435
11436IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11437 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11438{
11439 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11440 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11441 ASMCompilerBarrier();
11442 puDst->au64[0] = 0;
11443 puDst->au64[1] = 0;
11444 for (unsigned iByte = 0; iByte < 16; iByte++)
11445 {
11446 uint8_t idxSrc = uSrc2.au8[iByte];
11447 if (!(idxSrc & 0x80))
11448 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11449 }
11450 RT_NOREF(pExtState);
11451}
11452
11453
11454IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11455 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11456{
11457 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11458 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11459 ASMCompilerBarrier();
11460 puDst->au64[0] = 0;
11461 puDst->au64[1] = 0;
11462 puDst->au64[2] = 0;
11463 puDst->au64[3] = 0;
11464 for (unsigned iByte = 0; iByte < 16; iByte++)
11465 {
11466 uint8_t idxSrc = uSrc2.au8[iByte];
11467 if (!(idxSrc & 0x80))
11468 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11469 }
11470 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11471 {
11472 uint8_t idxSrc = uSrc2.au8[iByte];
11473 if (!(idxSrc & 0x80))
11474 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11475 }
11476 RT_NOREF(pExtState);
11477}
11478
11479
11480/*
11481 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11482 */
11483#ifdef IEM_WITHOUT_ASSEMBLY
11484
11485IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11486{
11487 uint64_t const uSrc = *puSrc;
11488 ASMCompilerBarrier();
11489 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11490 uSrc >> (((bEvil >> 2) & 3) * 16),
11491 uSrc >> (((bEvil >> 4) & 3) * 16),
11492 uSrc >> (((bEvil >> 6) & 3) * 16));
11493}
11494
11495
11496IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11497{
11498 puDst->QWords.qw0 = puSrc->QWords.qw0;
11499 uint64_t const uSrc = puSrc->QWords.qw1;
11500 ASMCompilerBarrier();
11501 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11502 uSrc >> (((bEvil >> 2) & 3) * 16),
11503 uSrc >> (((bEvil >> 4) & 3) * 16),
11504 uSrc >> (((bEvil >> 6) & 3) * 16));
11505}
11506
11507#endif
11508
11509IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11510{
11511 puDst->QWords.qw0 = puSrc->QWords.qw0;
11512 uint64_t const uSrc1 = puSrc->QWords.qw1;
11513 puDst->QWords.qw2 = puSrc->QWords.qw2;
11514 uint64_t const uSrc3 = puSrc->QWords.qw3;
11515 ASMCompilerBarrier();
11516 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11517 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11518 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11519 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11520 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11521 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11522 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11523 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11524}
11525
11526#ifdef IEM_WITHOUT_ASSEMBLY
11527IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11528{
11529 puDst->QWords.qw1 = puSrc->QWords.qw1;
11530 uint64_t const uSrc = puSrc->QWords.qw0;
11531 ASMCompilerBarrier();
11532 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11533 uSrc >> (((bEvil >> 2) & 3) * 16),
11534 uSrc >> (((bEvil >> 4) & 3) * 16),
11535 uSrc >> (((bEvil >> 6) & 3) * 16));
11536
11537}
11538#endif
11539
11540
11541IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11542{
11543 puDst->QWords.qw3 = puSrc->QWords.qw3;
11544 uint64_t const uSrc2 = puSrc->QWords.qw2;
11545 puDst->QWords.qw1 = puSrc->QWords.qw1;
11546 uint64_t const uSrc0 = puSrc->QWords.qw0;
11547 ASMCompilerBarrier();
11548 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11549 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11550 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11551 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11552 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11553 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11554 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11555 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11556
11557}
11558
11559
11560#ifdef IEM_WITHOUT_ASSEMBLY
11561IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11562{
11563 RTUINT128U const uSrc = *puSrc;
11564 ASMCompilerBarrier();
11565 puDst->au32[0] = uSrc.au32[bEvil & 3];
11566 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11567 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11568 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11569}
11570#endif
11571
11572
11573IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11574{
11575 RTUINT256U const uSrc = *puSrc;
11576 ASMCompilerBarrier();
11577 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11578 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11579 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11580 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11581 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11582 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11583 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11584 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11585}
11586
11587
11588/*
11589 * PUNPCKHBW - high bytes -> words
11590 */
11591#ifdef IEM_WITHOUT_ASSEMBLY
11592
11593IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11594{
11595 RTUINT64U const uSrc2 = { *puSrc };
11596 RTUINT64U const uSrc1 = { *puDst };
11597 ASMCompilerBarrier();
11598 RTUINT64U uDstOut;
11599 uDstOut.au8[0] = uSrc1.au8[4];
11600 uDstOut.au8[1] = uSrc2.au8[4];
11601 uDstOut.au8[2] = uSrc1.au8[5];
11602 uDstOut.au8[3] = uSrc2.au8[5];
11603 uDstOut.au8[4] = uSrc1.au8[6];
11604 uDstOut.au8[5] = uSrc2.au8[6];
11605 uDstOut.au8[6] = uSrc1.au8[7];
11606 uDstOut.au8[7] = uSrc2.au8[7];
11607 *puDst = uDstOut.u;
11608}
11609
11610
11611IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11612{
11613 RTUINT128U const uSrc2 = *puSrc;
11614 RTUINT128U const uSrc1 = *puDst;
11615 ASMCompilerBarrier();
11616 RTUINT128U uDstOut;
11617 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11618 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11619 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11620 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11621 uDstOut.au8[ 4] = uSrc1.au8[10];
11622 uDstOut.au8[ 5] = uSrc2.au8[10];
11623 uDstOut.au8[ 6] = uSrc1.au8[11];
11624 uDstOut.au8[ 7] = uSrc2.au8[11];
11625 uDstOut.au8[ 8] = uSrc1.au8[12];
11626 uDstOut.au8[ 9] = uSrc2.au8[12];
11627 uDstOut.au8[10] = uSrc1.au8[13];
11628 uDstOut.au8[11] = uSrc2.au8[13];
11629 uDstOut.au8[12] = uSrc1.au8[14];
11630 uDstOut.au8[13] = uSrc2.au8[14];
11631 uDstOut.au8[14] = uSrc1.au8[15];
11632 uDstOut.au8[15] = uSrc2.au8[15];
11633 *puDst = uDstOut;
11634}
11635
11636#endif
11637
11638IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11639{
11640 RTUINT128U const uSrc2 = *puSrc2;
11641 RTUINT128U const uSrc1 = *puSrc1;
11642 ASMCompilerBarrier();
11643 RTUINT128U uDstOut;
11644 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11645 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11646 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11647 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11648 uDstOut.au8[ 4] = uSrc1.au8[10];
11649 uDstOut.au8[ 5] = uSrc2.au8[10];
11650 uDstOut.au8[ 6] = uSrc1.au8[11];
11651 uDstOut.au8[ 7] = uSrc2.au8[11];
11652 uDstOut.au8[ 8] = uSrc1.au8[12];
11653 uDstOut.au8[ 9] = uSrc2.au8[12];
11654 uDstOut.au8[10] = uSrc1.au8[13];
11655 uDstOut.au8[11] = uSrc2.au8[13];
11656 uDstOut.au8[12] = uSrc1.au8[14];
11657 uDstOut.au8[13] = uSrc2.au8[14];
11658 uDstOut.au8[14] = uSrc1.au8[15];
11659 uDstOut.au8[15] = uSrc2.au8[15];
11660 *puDst = uDstOut;
11661}
11662
11663
11664IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11665{
11666 RTUINT256U const uSrc2 = *puSrc2;
11667 RTUINT256U const uSrc1 = *puSrc1;
11668 ASMCompilerBarrier();
11669 RTUINT256U uDstOut;
11670 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11671 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11672 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11673 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11674 uDstOut.au8[ 4] = uSrc1.au8[10];
11675 uDstOut.au8[ 5] = uSrc2.au8[10];
11676 uDstOut.au8[ 6] = uSrc1.au8[11];
11677 uDstOut.au8[ 7] = uSrc2.au8[11];
11678 uDstOut.au8[ 8] = uSrc1.au8[12];
11679 uDstOut.au8[ 9] = uSrc2.au8[12];
11680 uDstOut.au8[10] = uSrc1.au8[13];
11681 uDstOut.au8[11] = uSrc2.au8[13];
11682 uDstOut.au8[12] = uSrc1.au8[14];
11683 uDstOut.au8[13] = uSrc2.au8[14];
11684 uDstOut.au8[14] = uSrc1.au8[15];
11685 uDstOut.au8[15] = uSrc2.au8[15];
11686 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11687 uDstOut.au8[16] = uSrc1.au8[24];
11688 uDstOut.au8[17] = uSrc2.au8[24];
11689 uDstOut.au8[18] = uSrc1.au8[25];
11690 uDstOut.au8[19] = uSrc2.au8[25];
11691 uDstOut.au8[20] = uSrc1.au8[26];
11692 uDstOut.au8[21] = uSrc2.au8[26];
11693 uDstOut.au8[22] = uSrc1.au8[27];
11694 uDstOut.au8[23] = uSrc2.au8[27];
11695 uDstOut.au8[24] = uSrc1.au8[28];
11696 uDstOut.au8[25] = uSrc2.au8[28];
11697 uDstOut.au8[26] = uSrc1.au8[29];
11698 uDstOut.au8[27] = uSrc2.au8[29];
11699 uDstOut.au8[28] = uSrc1.au8[30];
11700 uDstOut.au8[29] = uSrc2.au8[30];
11701 uDstOut.au8[30] = uSrc1.au8[31];
11702 uDstOut.au8[31] = uSrc2.au8[31];
11703 *puDst = uDstOut;
11704}
11705
11706
11707/*
11708 * PUNPCKHBW - high words -> dwords
11709 */
11710#ifdef IEM_WITHOUT_ASSEMBLY
11711
11712IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11713{
11714 RTUINT64U const uSrc2 = { *puSrc };
11715 RTUINT64U const uSrc1 = { *puDst };
11716 ASMCompilerBarrier();
11717 RTUINT64U uDstOut;
11718 uDstOut.au16[0] = uSrc1.au16[2];
11719 uDstOut.au16[1] = uSrc2.au16[2];
11720 uDstOut.au16[2] = uSrc1.au16[3];
11721 uDstOut.au16[3] = uSrc2.au16[3];
11722 *puDst = uDstOut.u;
11723}
11724
11725
11726IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11727{
11728 RTUINT128U const uSrc2 = *puSrc;
11729 RTUINT128U const uSrc1 = *puDst;
11730 ASMCompilerBarrier();
11731 RTUINT128U uDstOut;
11732 uDstOut.au16[0] = uSrc1.au16[4];
11733 uDstOut.au16[1] = uSrc2.au16[4];
11734 uDstOut.au16[2] = uSrc1.au16[5];
11735 uDstOut.au16[3] = uSrc2.au16[5];
11736 uDstOut.au16[4] = uSrc1.au16[6];
11737 uDstOut.au16[5] = uSrc2.au16[6];
11738 uDstOut.au16[6] = uSrc1.au16[7];
11739 uDstOut.au16[7] = uSrc2.au16[7];
11740 *puDst = uDstOut;
11741}
11742
11743#endif
11744
11745IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11746{
11747 RTUINT128U const uSrc2 = *puSrc2;
11748 RTUINT128U const uSrc1 = *puSrc1;
11749 ASMCompilerBarrier();
11750 RTUINT128U uDstOut;
11751 uDstOut.au16[0] = uSrc1.au16[4];
11752 uDstOut.au16[1] = uSrc2.au16[4];
11753 uDstOut.au16[2] = uSrc1.au16[5];
11754 uDstOut.au16[3] = uSrc2.au16[5];
11755 uDstOut.au16[4] = uSrc1.au16[6];
11756 uDstOut.au16[5] = uSrc2.au16[6];
11757 uDstOut.au16[6] = uSrc1.au16[7];
11758 uDstOut.au16[7] = uSrc2.au16[7];
11759 *puDst = uDstOut;
11760}
11761
11762
11763IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11764{
11765 RTUINT256U const uSrc2 = *puSrc2;
11766 RTUINT256U const uSrc1 = *puSrc1;
11767 ASMCompilerBarrier();
11768 RTUINT256U uDstOut;
11769 uDstOut.au16[0] = uSrc1.au16[4];
11770 uDstOut.au16[1] = uSrc2.au16[4];
11771 uDstOut.au16[2] = uSrc1.au16[5];
11772 uDstOut.au16[3] = uSrc2.au16[5];
11773 uDstOut.au16[4] = uSrc1.au16[6];
11774 uDstOut.au16[5] = uSrc2.au16[6];
11775 uDstOut.au16[6] = uSrc1.au16[7];
11776 uDstOut.au16[7] = uSrc2.au16[7];
11777
11778 uDstOut.au16[8] = uSrc1.au16[12];
11779 uDstOut.au16[9] = uSrc2.au16[12];
11780 uDstOut.au16[10] = uSrc1.au16[13];
11781 uDstOut.au16[11] = uSrc2.au16[13];
11782 uDstOut.au16[12] = uSrc1.au16[14];
11783 uDstOut.au16[13] = uSrc2.au16[14];
11784 uDstOut.au16[14] = uSrc1.au16[15];
11785 uDstOut.au16[15] = uSrc2.au16[15];
11786 *puDst = uDstOut;
11787}
11788
11789
11790/*
11791 * PUNPCKHBW - high dwords -> qword(s)
11792 */
11793#ifdef IEM_WITHOUT_ASSEMBLY
11794
11795IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11796{
11797 RTUINT64U const uSrc2 = { *puSrc };
11798 RTUINT64U const uSrc1 = { *puDst };
11799 ASMCompilerBarrier();
11800 RTUINT64U uDstOut;
11801 uDstOut.au32[0] = uSrc1.au32[1];
11802 uDstOut.au32[1] = uSrc2.au32[1];
11803 *puDst = uDstOut.u;
11804}
11805
11806
11807IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11808{
11809 RTUINT128U const uSrc2 = *puSrc;
11810 RTUINT128U const uSrc1 = *puDst;
11811 ASMCompilerBarrier();
11812 RTUINT128U uDstOut;
11813 uDstOut.au32[0] = uSrc1.au32[2];
11814 uDstOut.au32[1] = uSrc2.au32[2];
11815 uDstOut.au32[2] = uSrc1.au32[3];
11816 uDstOut.au32[3] = uSrc2.au32[3];
11817 *puDst = uDstOut;
11818}
11819
11820#endif
11821
11822IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11823{
11824 RTUINT128U const uSrc2 = *puSrc2;
11825 RTUINT128U const uSrc1 = *puSrc1;
11826 ASMCompilerBarrier();
11827 RTUINT128U uDstOut;
11828 uDstOut.au32[0] = uSrc1.au32[2];
11829 uDstOut.au32[1] = uSrc2.au32[2];
11830 uDstOut.au32[2] = uSrc1.au32[3];
11831 uDstOut.au32[3] = uSrc2.au32[3];
11832 *puDst = uDstOut;
11833}
11834
11835
11836IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11837{
11838 RTUINT256U const uSrc2 = *puSrc2;
11839 RTUINT256U const uSrc1 = *puSrc1;
11840 ASMCompilerBarrier();
11841 RTUINT256U uDstOut;
11842 uDstOut.au32[0] = uSrc1.au32[2];
11843 uDstOut.au32[1] = uSrc2.au32[2];
11844 uDstOut.au32[2] = uSrc1.au32[3];
11845 uDstOut.au32[3] = uSrc2.au32[3];
11846
11847 uDstOut.au32[4] = uSrc1.au32[6];
11848 uDstOut.au32[5] = uSrc2.au32[6];
11849 uDstOut.au32[6] = uSrc1.au32[7];
11850 uDstOut.au32[7] = uSrc2.au32[7];
11851 *puDst = uDstOut;
11852}
11853
11854
11855/*
11856 * PUNPCKHQDQ -> High qwords -> double qword(s).
11857 */
11858#ifdef IEM_WITHOUT_ASSEMBLY
11859IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11860{
11861 RTUINT128U const uSrc2 = *puSrc;
11862 RTUINT128U const uSrc1 = *puDst;
11863 ASMCompilerBarrier();
11864 RTUINT128U uDstOut;
11865 uDstOut.au64[0] = uSrc1.au64[1];
11866 uDstOut.au64[1] = uSrc2.au64[1];
11867 *puDst = uDstOut;
11868}
11869#endif
11870
11871
11872IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11873{
11874 RTUINT128U const uSrc2 = *puSrc2;
11875 RTUINT128U const uSrc1 = *puSrc1;
11876 ASMCompilerBarrier();
11877 RTUINT128U uDstOut;
11878 uDstOut.au64[0] = uSrc1.au64[1];
11879 uDstOut.au64[1] = uSrc2.au64[1];
11880 *puDst = uDstOut;
11881}
11882
11883
11884IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11885{
11886 RTUINT256U const uSrc2 = *puSrc2;
11887 RTUINT256U const uSrc1 = *puSrc1;
11888 ASMCompilerBarrier();
11889 RTUINT256U uDstOut;
11890 uDstOut.au64[0] = uSrc1.au64[1];
11891 uDstOut.au64[1] = uSrc2.au64[1];
11892
11893 uDstOut.au64[2] = uSrc1.au64[3];
11894 uDstOut.au64[3] = uSrc2.au64[3];
11895 *puDst = uDstOut;
11896}
11897
11898
11899/*
11900 * PUNPCKLBW - low bytes -> words
11901 */
11902#ifdef IEM_WITHOUT_ASSEMBLY
11903
11904IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11905{
11906 RTUINT64U const uSrc2 = { *puSrc };
11907 RTUINT64U const uSrc1 = { *puDst };
11908 ASMCompilerBarrier();
11909 RTUINT64U uDstOut;
11910 uDstOut.au8[0] = uSrc1.au8[0];
11911 uDstOut.au8[1] = uSrc2.au8[0];
11912 uDstOut.au8[2] = uSrc1.au8[1];
11913 uDstOut.au8[3] = uSrc2.au8[1];
11914 uDstOut.au8[4] = uSrc1.au8[2];
11915 uDstOut.au8[5] = uSrc2.au8[2];
11916 uDstOut.au8[6] = uSrc1.au8[3];
11917 uDstOut.au8[7] = uSrc2.au8[3];
11918 *puDst = uDstOut.u;
11919}
11920
11921
11922IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11923{
11924 RTUINT128U const uSrc2 = *puSrc;
11925 RTUINT128U const uSrc1 = *puDst;
11926 ASMCompilerBarrier();
11927 RTUINT128U uDstOut;
11928 uDstOut.au8[ 0] = uSrc1.au8[0];
11929 uDstOut.au8[ 1] = uSrc2.au8[0];
11930 uDstOut.au8[ 2] = uSrc1.au8[1];
11931 uDstOut.au8[ 3] = uSrc2.au8[1];
11932 uDstOut.au8[ 4] = uSrc1.au8[2];
11933 uDstOut.au8[ 5] = uSrc2.au8[2];
11934 uDstOut.au8[ 6] = uSrc1.au8[3];
11935 uDstOut.au8[ 7] = uSrc2.au8[3];
11936 uDstOut.au8[ 8] = uSrc1.au8[4];
11937 uDstOut.au8[ 9] = uSrc2.au8[4];
11938 uDstOut.au8[10] = uSrc1.au8[5];
11939 uDstOut.au8[11] = uSrc2.au8[5];
11940 uDstOut.au8[12] = uSrc1.au8[6];
11941 uDstOut.au8[13] = uSrc2.au8[6];
11942 uDstOut.au8[14] = uSrc1.au8[7];
11943 uDstOut.au8[15] = uSrc2.au8[7];
11944 *puDst = uDstOut;
11945}
11946
11947#endif
11948
11949IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11950{
11951 RTUINT128U const uSrc2 = *puSrc2;
11952 RTUINT128U const uSrc1 = *puSrc1;
11953 ASMCompilerBarrier();
11954 RTUINT128U uDstOut;
11955 uDstOut.au8[ 0] = uSrc1.au8[0];
11956 uDstOut.au8[ 1] = uSrc2.au8[0];
11957 uDstOut.au8[ 2] = uSrc1.au8[1];
11958 uDstOut.au8[ 3] = uSrc2.au8[1];
11959 uDstOut.au8[ 4] = uSrc1.au8[2];
11960 uDstOut.au8[ 5] = uSrc2.au8[2];
11961 uDstOut.au8[ 6] = uSrc1.au8[3];
11962 uDstOut.au8[ 7] = uSrc2.au8[3];
11963 uDstOut.au8[ 8] = uSrc1.au8[4];
11964 uDstOut.au8[ 9] = uSrc2.au8[4];
11965 uDstOut.au8[10] = uSrc1.au8[5];
11966 uDstOut.au8[11] = uSrc2.au8[5];
11967 uDstOut.au8[12] = uSrc1.au8[6];
11968 uDstOut.au8[13] = uSrc2.au8[6];
11969 uDstOut.au8[14] = uSrc1.au8[7];
11970 uDstOut.au8[15] = uSrc2.au8[7];
11971 *puDst = uDstOut;
11972}
11973
11974
11975IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11976{
11977 RTUINT256U const uSrc2 = *puSrc2;
11978 RTUINT256U const uSrc1 = *puSrc1;
11979 ASMCompilerBarrier();
11980 RTUINT256U uDstOut;
11981 uDstOut.au8[ 0] = uSrc1.au8[0];
11982 uDstOut.au8[ 1] = uSrc2.au8[0];
11983 uDstOut.au8[ 2] = uSrc1.au8[1];
11984 uDstOut.au8[ 3] = uSrc2.au8[1];
11985 uDstOut.au8[ 4] = uSrc1.au8[2];
11986 uDstOut.au8[ 5] = uSrc2.au8[2];
11987 uDstOut.au8[ 6] = uSrc1.au8[3];
11988 uDstOut.au8[ 7] = uSrc2.au8[3];
11989 uDstOut.au8[ 8] = uSrc1.au8[4];
11990 uDstOut.au8[ 9] = uSrc2.au8[4];
11991 uDstOut.au8[10] = uSrc1.au8[5];
11992 uDstOut.au8[11] = uSrc2.au8[5];
11993 uDstOut.au8[12] = uSrc1.au8[6];
11994 uDstOut.au8[13] = uSrc2.au8[6];
11995 uDstOut.au8[14] = uSrc1.au8[7];
11996 uDstOut.au8[15] = uSrc2.au8[7];
11997 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11998 uDstOut.au8[16] = uSrc1.au8[16];
11999 uDstOut.au8[17] = uSrc2.au8[16];
12000 uDstOut.au8[18] = uSrc1.au8[17];
12001 uDstOut.au8[19] = uSrc2.au8[17];
12002 uDstOut.au8[20] = uSrc1.au8[18];
12003 uDstOut.au8[21] = uSrc2.au8[18];
12004 uDstOut.au8[22] = uSrc1.au8[19];
12005 uDstOut.au8[23] = uSrc2.au8[19];
12006 uDstOut.au8[24] = uSrc1.au8[20];
12007 uDstOut.au8[25] = uSrc2.au8[20];
12008 uDstOut.au8[26] = uSrc1.au8[21];
12009 uDstOut.au8[27] = uSrc2.au8[21];
12010 uDstOut.au8[28] = uSrc1.au8[22];
12011 uDstOut.au8[29] = uSrc2.au8[22];
12012 uDstOut.au8[30] = uSrc1.au8[23];
12013 uDstOut.au8[31] = uSrc2.au8[23];
12014 *puDst = uDstOut;
12015}
12016
12017
12018/*
12019 * PUNPCKLBW - low words -> dwords
12020 */
12021#ifdef IEM_WITHOUT_ASSEMBLY
12022
12023IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12024{
12025 RTUINT64U const uSrc2 = { *puSrc };
12026 RTUINT64U const uSrc1 = { *puDst };
12027 ASMCompilerBarrier();
12028 RTUINT64U uDstOut;
12029 uDstOut.au16[0] = uSrc1.au16[0];
12030 uDstOut.au16[1] = uSrc2.au16[0];
12031 uDstOut.au16[2] = uSrc1.au16[1];
12032 uDstOut.au16[3] = uSrc2.au16[1];
12033 *puDst = uDstOut.u;
12034}
12035
12036
12037IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12038{
12039 RTUINT128U const uSrc2 = *puSrc;
12040 RTUINT128U const uSrc1 = *puDst;
12041 ASMCompilerBarrier();
12042 RTUINT128U uDstOut;
12043 uDstOut.au16[0] = uSrc1.au16[0];
12044 uDstOut.au16[1] = uSrc2.au16[0];
12045 uDstOut.au16[2] = uSrc1.au16[1];
12046 uDstOut.au16[3] = uSrc2.au16[1];
12047 uDstOut.au16[4] = uSrc1.au16[2];
12048 uDstOut.au16[5] = uSrc2.au16[2];
12049 uDstOut.au16[6] = uSrc1.au16[3];
12050 uDstOut.au16[7] = uSrc2.au16[3];
12051 *puDst = uDstOut;
12052}
12053
12054#endif
12055
12056IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12057{
12058 RTUINT128U const uSrc2 = *puSrc2;
12059 RTUINT128U const uSrc1 = *puSrc1;
12060 ASMCompilerBarrier();
12061 RTUINT128U uDstOut;
12062 uDstOut.au16[0] = uSrc1.au16[0];
12063 uDstOut.au16[1] = uSrc2.au16[0];
12064 uDstOut.au16[2] = uSrc1.au16[1];
12065 uDstOut.au16[3] = uSrc2.au16[1];
12066 uDstOut.au16[4] = uSrc1.au16[2];
12067 uDstOut.au16[5] = uSrc2.au16[2];
12068 uDstOut.au16[6] = uSrc1.au16[3];
12069 uDstOut.au16[7] = uSrc2.au16[3];
12070 *puDst = uDstOut;
12071}
12072
12073
12074IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12075{
12076 RTUINT256U const uSrc2 = *puSrc2;
12077 RTUINT256U const uSrc1 = *puSrc1;
12078 ASMCompilerBarrier();
12079 RTUINT256U uDstOut;
12080 uDstOut.au16[0] = uSrc1.au16[0];
12081 uDstOut.au16[1] = uSrc2.au16[0];
12082 uDstOut.au16[2] = uSrc1.au16[1];
12083 uDstOut.au16[3] = uSrc2.au16[1];
12084 uDstOut.au16[4] = uSrc1.au16[2];
12085 uDstOut.au16[5] = uSrc2.au16[2];
12086 uDstOut.au16[6] = uSrc1.au16[3];
12087 uDstOut.au16[7] = uSrc2.au16[3];
12088
12089 uDstOut.au16[8] = uSrc1.au16[8];
12090 uDstOut.au16[9] = uSrc2.au16[8];
12091 uDstOut.au16[10] = uSrc1.au16[9];
12092 uDstOut.au16[11] = uSrc2.au16[9];
12093 uDstOut.au16[12] = uSrc1.au16[10];
12094 uDstOut.au16[13] = uSrc2.au16[10];
12095 uDstOut.au16[14] = uSrc1.au16[11];
12096 uDstOut.au16[15] = uSrc2.au16[11];
12097 *puDst = uDstOut;
12098}
12099
12100
12101/*
12102 * PUNPCKLBW - low dwords -> qword(s)
12103 */
12104#ifdef IEM_WITHOUT_ASSEMBLY
12105
12106IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12107{
12108 RTUINT64U const uSrc2 = { *puSrc };
12109 RTUINT64U const uSrc1 = { *puDst };
12110 ASMCompilerBarrier();
12111 RTUINT64U uDstOut;
12112 uDstOut.au32[0] = uSrc1.au32[0];
12113 uDstOut.au32[1] = uSrc2.au32[0];
12114 *puDst = uDstOut.u;
12115}
12116
12117
12118IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12119{
12120 RTUINT128U const uSrc2 = *puSrc;
12121 RTUINT128U const uSrc1 = *puDst;
12122 ASMCompilerBarrier();
12123 RTUINT128U uDstOut;
12124 uDstOut.au32[0] = uSrc1.au32[0];
12125 uDstOut.au32[1] = uSrc2.au32[0];
12126 uDstOut.au32[2] = uSrc1.au32[1];
12127 uDstOut.au32[3] = uSrc2.au32[1];
12128 *puDst = uDstOut;
12129}
12130
12131#endif
12132
12133IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12134{
12135 RTUINT128U const uSrc2 = *puSrc2;
12136 RTUINT128U const uSrc1 = *puSrc1;
12137 ASMCompilerBarrier();
12138 RTUINT128U uDstOut;
12139 uDstOut.au32[0] = uSrc1.au32[0];
12140 uDstOut.au32[1] = uSrc2.au32[0];
12141 uDstOut.au32[2] = uSrc1.au32[1];
12142 uDstOut.au32[3] = uSrc2.au32[1];
12143 *puDst = uDstOut;
12144}
12145
12146
12147IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12148{
12149 RTUINT256U const uSrc2 = *puSrc2;
12150 RTUINT256U const uSrc1 = *puSrc1;
12151 ASMCompilerBarrier();
12152 RTUINT256U uDstOut;
12153 uDstOut.au32[0] = uSrc1.au32[0];
12154 uDstOut.au32[1] = uSrc2.au32[0];
12155 uDstOut.au32[2] = uSrc1.au32[1];
12156 uDstOut.au32[3] = uSrc2.au32[1];
12157
12158 uDstOut.au32[4] = uSrc1.au32[4];
12159 uDstOut.au32[5] = uSrc2.au32[4];
12160 uDstOut.au32[6] = uSrc1.au32[5];
12161 uDstOut.au32[7] = uSrc2.au32[5];
12162 *puDst = uDstOut;
12163}
12164
12165
12166/*
12167 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12168 */
12169#ifdef IEM_WITHOUT_ASSEMBLY
12170IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12171{
12172 RTUINT128U const uSrc2 = *puSrc;
12173 RTUINT128U const uSrc1 = *puDst;
12174 ASMCompilerBarrier();
12175 RTUINT128U uDstOut;
12176 uDstOut.au64[0] = uSrc1.au64[0];
12177 uDstOut.au64[1] = uSrc2.au64[0];
12178 *puDst = uDstOut;
12179}
12180#endif
12181
12182
12183IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12184{
12185 RTUINT128U const uSrc2 = *puSrc2;
12186 RTUINT128U const uSrc1 = *puSrc1;
12187 ASMCompilerBarrier();
12188 RTUINT128U uDstOut;
12189 uDstOut.au64[0] = uSrc1.au64[0];
12190 uDstOut.au64[1] = uSrc2.au64[0];
12191 *puDst = uDstOut;
12192}
12193
12194
12195IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12196{
12197 RTUINT256U const uSrc2 = *puSrc2;
12198 RTUINT256U const uSrc1 = *puSrc1;
12199 ASMCompilerBarrier();
12200 RTUINT256U uDstOut;
12201 uDstOut.au64[0] = uSrc1.au64[0];
12202 uDstOut.au64[1] = uSrc2.au64[0];
12203
12204 uDstOut.au64[2] = uSrc1.au64[2];
12205 uDstOut.au64[3] = uSrc2.au64[2];
12206 *puDst = uDstOut;
12207}
12208
12209
12210/*
12211 * PACKSSWB - signed words -> signed bytes
12212 */
12213
12214#ifdef IEM_WITHOUT_ASSEMBLY
12215
12216IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12217{
12218 RTUINT64U const uSrc2 = { *puSrc };
12219 RTUINT64U const uSrc1 = { *puDst };
12220 ASMCompilerBarrier();
12221 RTUINT64U uDstOut;
12222 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12223 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12224 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12225 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12226 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12227 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12228 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12229 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12230 *puDst = uDstOut.u;
12231}
12232
12233
12234IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12235{
12236 RTUINT128U const uSrc2 = *puSrc;
12237 RTUINT128U const uSrc1 = *puDst;
12238 ASMCompilerBarrier();
12239 RTUINT128U uDstOut;
12240 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12241 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12242 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12243 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12244 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12245 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12246 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12247 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12248 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12249 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12250 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12251 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12252 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12253 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12254 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12255 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12256 *puDst = uDstOut;
12257}
12258
12259#endif
12260
12261IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12262{
12263 RTUINT128U const uSrc2 = *puSrc2;
12264 RTUINT128U const uSrc1 = *puSrc1;
12265 ASMCompilerBarrier();
12266 RTUINT128U uDstOut;
12267 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12268 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12269 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12270 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12271 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12272 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12273 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12274 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12275 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12276 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12277 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12278 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12279 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12280 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12281 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12282 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12283 *puDst = uDstOut;
12284}
12285
12286
12287IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12288{
12289 RTUINT256U const uSrc2 = *puSrc2;
12290 RTUINT256U const uSrc1 = *puSrc1;
12291 ASMCompilerBarrier();
12292 RTUINT256U uDstOut;
12293 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12294 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12295 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12296 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12297 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12298 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12299 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12300 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12301 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12302 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12303 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12304 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12305 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12306 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12307 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12308 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12309
12310 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12311 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12312 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12313 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12314 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12315 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12316 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12317 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12318 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12319 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12320 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12321 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12322 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12323 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12324 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12325 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12326 *puDst = uDstOut;
12327}
12328
12329
12330/*
12331 * PACKUSWB - signed words -> unsigned bytes
12332 */
12333#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
12334 ( (uint16_t)(a_iWord) <= (uint16_t)0xff \
12335 ? (uint8_t)(a_iWord) \
12336 : (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) ) /* 0xff = UINT8_MAX; 0x00 == UINT8_MIN; source bit 15 = sign */
12337
12338#ifdef IEM_WITHOUT_ASSEMBLY
12339
12340IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12341{
12342 RTUINT64U const uSrc2 = { *puSrc };
12343 RTUINT64U const uSrc1 = { *puDst };
12344 ASMCompilerBarrier();
12345 RTUINT64U uDstOut;
12346 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12347 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12348 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12349 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12350 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12351 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12352 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12353 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12354 *puDst = uDstOut.u;
12355}
12356
12357
12358IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12359{
12360 RTUINT128U const uSrc2 = *puSrc;
12361 RTUINT128U const uSrc1 = *puDst;
12362 ASMCompilerBarrier();
12363 RTUINT128U uDstOut;
12364 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12365 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12366 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12367 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12368 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12369 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12370 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12371 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12372 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12373 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12374 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12375 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12376 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12377 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12378 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12379 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12380 *puDst = uDstOut;
12381}
12382
12383#endif
12384
12385IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12386{
12387 RTUINT128U const uSrc2 = *puSrc2;
12388 RTUINT128U const uSrc1 = *puSrc1;
12389 ASMCompilerBarrier();
12390 RTUINT128U uDstOut;
12391 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12392 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12393 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12394 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12395 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12396 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12397 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12398 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12399 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12400 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12401 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12402 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12403 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12404 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12405 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12406 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12407 *puDst = uDstOut;
12408}
12409
12410
12411IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12412{
12413 RTUINT256U const uSrc2 = *puSrc2;
12414 RTUINT256U const uSrc1 = *puSrc1;
12415 ASMCompilerBarrier();
12416 RTUINT256U uDstOut;
12417 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12418 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12419 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12420 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12421 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12422 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12423 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12424 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12425 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12426 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12427 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12428 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12429 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12430 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12431 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12432 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12433
12434 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12435 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12436 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12437 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12438 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12439 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12440 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12441 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12442 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12443 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12444 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12445 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12446 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12447 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12448 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12449 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12450 *puDst = uDstOut;
12451}
12452
12453
12454/*
12455 * PACKSSDW - signed dwords -> signed words
12456 */
12457
12458#ifdef IEM_WITHOUT_ASSEMBLY
12459
12460IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12461{
12462 RTUINT64U const uSrc2 = { *puSrc };
12463 RTUINT64U const uSrc1 = { *puDst };
12464 ASMCompilerBarrier();
12465 RTUINT64U uDstOut;
12466 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12467 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12468 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12469 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12470 *puDst = uDstOut.u;
12471}
12472
12473
12474IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12475{
12476 RTUINT128U const uSrc2 = *puSrc;
12477 RTUINT128U const uSrc1 = *puDst;
12478 ASMCompilerBarrier();
12479 RTUINT128U uDstOut;
12480 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12481 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12482 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12483 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12484 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12485 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12486 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12487 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12488 *puDst = uDstOut;
12489}
12490
12491#endif
12492
12493IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12494{
12495 RTUINT128U const uSrc2 = *puSrc2;
12496 RTUINT128U const uSrc1 = *puSrc1;
12497 ASMCompilerBarrier();
12498 RTUINT128U uDstOut;
12499 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12500 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12501 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12502 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12503 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12504 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12505 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12506 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12507 *puDst = uDstOut;
12508}
12509
12510
12511IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12512{
12513 RTUINT256U const uSrc2 = *puSrc2;
12514 RTUINT256U const uSrc1 = *puSrc1;
12515 ASMCompilerBarrier();
12516 RTUINT256U uDstOut;
12517 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12518 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12519 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12520 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12521 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12522 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12523 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12524 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12525
12526 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12527 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12528 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12529 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12530 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12531 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12532 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12533 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12534 *puDst = uDstOut;
12535}
12536
12537
12538/*
12539 * PACKUSDW - signed dwords -> unsigned words
12540 */
12541#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
12542 ( (uint32_t)(a_iDword) <= (uint16_t)0xffff \
12543 ? (uint16_t)(a_iDword) \
12544 : (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) ) /* 0xffff = UINT16_MAX; source bit 31 = sign */
12545
12546#ifdef IEM_WITHOUT_ASSEMBLY
12547IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12548{
12549 RTUINT128U const uSrc2 = *puSrc;
12550 RTUINT128U const uSrc1 = *puDst;
12551 ASMCompilerBarrier();
12552 RTUINT128U uDstOut;
12553 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12554 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12555 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12556 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12557 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12558 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12559 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12560 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12561 *puDst = uDstOut;
12562}
12563#endif
12564
12565IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12566{
12567 RTUINT128U const uSrc2 = *puSrc2;
12568 RTUINT128U const uSrc1 = *puSrc1;
12569 ASMCompilerBarrier();
12570 RTUINT128U uDstOut;
12571 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12572 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12573 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12574 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12575 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12576 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12577 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12578 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12579 *puDst = uDstOut;
12580}
12581
12582
12583IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12584{
12585 RTUINT256U const uSrc2 = *puSrc2;
12586 RTUINT256U const uSrc1 = *puSrc1;
12587 ASMCompilerBarrier();
12588 RTUINT256U uDstOut;
12589 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12590 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12591 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12592 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12593 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12594 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12595 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12596 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12597
12598 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12599 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12600 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12601 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12602 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12603 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12604 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12605 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12606 *puDst = uDstOut;
12607}
12608
12609
12610/*
12611 * [V]PABSB / [V]PABSW / [V]PABSD
12612 */
12613
12614IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12615{
12616 RTUINT64U const uSrc = { *puSrc };
12617 RTUINT64U uDstOut = { 0 };
12618
12619 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12620 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12621 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12622 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12623 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12624 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12625 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12626 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12627 *puDst = uDstOut.u;
12628 RT_NOREF(pFpuState);
12629}
12630
12631
12632IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12633{
12634 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12635 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12636 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12637 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12638 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12639 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12640 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12641 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12642 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12643 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12644 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12645 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12646 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12647 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12648 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12649 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12650 RT_NOREF(pFpuState);
12651}
12652
12653
12654IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12655{
12656 RTUINT64U const uSrc = { *puSrc };
12657 RTUINT64U uDstOut = { 0 };
12658
12659 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12660 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12661 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12662 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12663 *puDst = uDstOut.u;
12664 RT_NOREF(pFpuState);
12665}
12666
12667
12668IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12669{
12670 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12671 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12672 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12673 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12674 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12675 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12676 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12677 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12678 RT_NOREF(pFpuState);
12679}
12680
12681
12682IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12683{
12684 RTUINT64U const uSrc = { *puSrc };
12685 RTUINT64U uDstOut = { 0 };
12686
12687 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12688 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12689 *puDst = uDstOut.u;
12690 RT_NOREF(pFpuState);
12691}
12692
12693
12694IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12695{
12696 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12697 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12698 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12699 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12700 RT_NOREF(pFpuState);
12701}
12702
12703
12704IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12705{
12706 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12707 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12708 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12709 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12710 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12711 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12712 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12713 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12714 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12715 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12716 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12717 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12718 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12719 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12720 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12721 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12722}
12723
12724
12725IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12726{
12727 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12728 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12729 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12730 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12731 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12732 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12733 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12734 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12735 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12736 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12737 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12738 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12739 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12740 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12741 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12742 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12743 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12744 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12745 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12746 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12747 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12748 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12749 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12750 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12751 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12752 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12753 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12754 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12755 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12756 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12757 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12758 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12759}
12760
12761
12762IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12763{
12764 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12765 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12766 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12767 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12768 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12769 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12770 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12771 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12772}
12773
12774
12775IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12776{
12777 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12778 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12779 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12780 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12781 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12782 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12783 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12784 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12785 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12786 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12787 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12788 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12789 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12790 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12791 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12792 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12793}
12794
12795
12796IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12797{
12798 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12799 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12800 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12801 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12802}
12803
12804
12805IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12806{
12807 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12808 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12809 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12810 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12811 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12812 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12813 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12814 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12815}
12816
12817
12818/*
12819 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12820 */
12821IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12822{
12823 RTUINT64U uSrc1 = { *puDst };
12824 RTUINT64U uSrc2 = { *puSrc };
12825 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12826
12827 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12828 {
12829 if (uSrc2.ai8[i] < 0)
12830 uDst.ai8[i] = -uSrc1.ai8[i];
12831 else if (uSrc2.ai8[i] == 0)
12832 uDst.ai8[i] = 0;
12833 else /* uSrc2.ai8[i] > 0 */
12834 uDst.ai8[i] = uSrc1.ai8[i];
12835 }
12836
12837 *puDst = uDst.u;
12838 RT_NOREF(pFpuState);
12839}
12840
12841
12842IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12843{
12844 RTUINT128U uSrc1 = *puDst;
12845
12846 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12847 {
12848 if (puSrc->ai8[i] < 0)
12849 puDst->ai8[i] = -uSrc1.ai8[i];
12850 else if (puSrc->ai8[i] == 0)
12851 puDst->ai8[i] = 0;
12852 else /* puSrc->ai8[i] > 0 */
12853 puDst->ai8[i] = uSrc1.ai8[i];
12854 }
12855
12856 RT_NOREF(pFpuState);
12857}
12858
12859
12860IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12861{
12862 RTUINT64U uSrc1 = { *puDst };
12863 RTUINT64U uSrc2 = { *puSrc };
12864 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12865
12866 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12867 {
12868 if (uSrc2.ai16[i] < 0)
12869 uDst.ai16[i] = -uSrc1.ai16[i];
12870 else if (uSrc2.ai16[i] == 0)
12871 uDst.ai16[i] = 0;
12872 else /* uSrc2.ai16[i] > 0 */
12873 uDst.ai16[i] = uSrc1.ai16[i];
12874 }
12875
12876 *puDst = uDst.u;
12877 RT_NOREF(pFpuState);
12878}
12879
12880
12881IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12882{
12883 RTUINT128U uSrc1 = *puDst;
12884
12885 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12886 {
12887 if (puSrc->ai16[i] < 0)
12888 puDst->ai16[i] = -uSrc1.ai16[i];
12889 else if (puSrc->ai16[i] == 0)
12890 puDst->ai16[i] = 0;
12891 else /* puSrc->ai16[i] > 0 */
12892 puDst->ai16[i] = uSrc1.ai16[i];
12893 }
12894
12895 RT_NOREF(pFpuState);
12896}
12897
12898
12899IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12900{
12901 RTUINT64U uSrc1 = { *puDst };
12902 RTUINT64U uSrc2 = { *puSrc };
12903 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12904
12905 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12906 {
12907 if (uSrc2.ai32[i] < 0)
12908 uDst.ai32[i] = -uSrc1.ai32[i];
12909 else if (uSrc2.ai32[i] == 0)
12910 uDst.ai32[i] = 0;
12911 else /* uSrc2.ai32[i] > 0 */
12912 uDst.ai32[i] = uSrc1.ai32[i];
12913 }
12914
12915 *puDst = uDst.u;
12916 RT_NOREF(pFpuState);
12917}
12918
12919
12920IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12921{
12922 RTUINT128U uSrc1 = *puDst;
12923
12924 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12925 {
12926 if (puSrc->ai32[i] < 0)
12927 puDst->ai32[i] = -uSrc1.ai32[i];
12928 else if (puSrc->ai32[i] == 0)
12929 puDst->ai32[i] = 0;
12930 else /* puSrc->ai32[i] > 0 */
12931 puDst->ai32[i] = uSrc1.ai32[i];
12932 }
12933
12934 RT_NOREF(pFpuState);
12935}
12936
12937
12938IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12939{
12940 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12941 {
12942 if (puSrc2->ai8[i] < 0)
12943 puDst->ai8[i] = -puSrc1->ai8[i];
12944 else if (puSrc2->ai8[i] == 0)
12945 puDst->ai8[i] = 0;
12946 else /* puSrc2->ai8[i] > 0 */
12947 puDst->ai8[i] = puSrc1->ai8[i];
12948 }
12949}
12950
12951
12952IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12953{
12954 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12955 {
12956 if (puSrc2->ai8[i] < 0)
12957 puDst->ai8[i] = -puSrc1->ai8[i];
12958 else if (puSrc2->ai8[i] == 0)
12959 puDst->ai8[i] = 0;
12960 else /* puSrc2->ai8[i] > 0 */
12961 puDst->ai8[i] = puSrc1->ai8[i];
12962 }
12963}
12964
12965
12966IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12967{
12968 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12969 {
12970 if (puSrc2->ai16[i] < 0)
12971 puDst->ai16[i] = -puSrc1->ai16[i];
12972 else if (puSrc2->ai16[i] == 0)
12973 puDst->ai16[i] = 0;
12974 else /* puSrc2->ai16[i] > 0 */
12975 puDst->ai16[i] = puSrc1->ai16[i];
12976 }
12977}
12978
12979
12980IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12981{
12982 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12983 {
12984 if (puSrc2->ai16[i] < 0)
12985 puDst->ai16[i] = -puSrc1->ai16[i];
12986 else if (puSrc2->ai16[i] == 0)
12987 puDst->ai16[i] = 0;
12988 else /* puSrc2->ai16[i] > 0 */
12989 puDst->ai16[i] = puSrc1->ai16[i];
12990 }
12991}
12992
12993
12994IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12995{
12996 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12997 {
12998 if (puSrc2->ai32[i] < 0)
12999 puDst->ai32[i] = -puSrc1->ai32[i];
13000 else if (puSrc2->ai32[i] == 0)
13001 puDst->ai32[i] = 0;
13002 else /* puSrc2->ai32[i] > 0 */
13003 puDst->ai32[i] = puSrc1->ai32[i];
13004 }
13005}
13006
13007
13008IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13009{
13010 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13011 {
13012 if (puSrc2->ai32[i] < 0)
13013 puDst->ai32[i] = -puSrc1->ai32[i];
13014 else if (puSrc2->ai32[i] == 0)
13015 puDst->ai32[i] = 0;
13016 else /* puSrc2->ai32[i] > 0 */
13017 puDst->ai32[i] = puSrc1->ai32[i];
13018 }
13019}
13020
13021
13022/*
13023 * PHADDW / VPHADDW / PHADDD / VPHADDD
13024 */
13025IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13026{
13027 RTUINT64U uSrc1 = { *puDst };
13028 RTUINT64U uSrc2 = { *puSrc };
13029 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13030
13031 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13032 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13033 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13034 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13035 *puDst = uDst.u;
13036 RT_NOREF(pFpuState);
13037}
13038
13039
13040IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13041{
13042 RTUINT128U uSrc1 = *puDst;
13043
13044 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13045 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13046 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13047 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13048
13049 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13050 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13051 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13052 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13053 RT_NOREF(pFpuState);
13054}
13055
13056
13057IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13058{
13059 RTUINT64U uSrc1 = { *puDst };
13060 RTUINT64U uSrc2 = { *puSrc };
13061 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13062
13063 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13064 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13065 *puDst = uDst.u;
13066 RT_NOREF(pFpuState);
13067}
13068
13069
13070IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13071{
13072 RTUINT128U uSrc1 = *puDst;
13073
13074 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13075 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13076
13077 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13078 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13079 RT_NOREF(pFpuState);
13080}
13081
13082
13083IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13084{
13085 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13086
13087 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13088 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13089 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13090 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13091
13092 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13093 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13094 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13095 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13096
13097 puDst->au64[0] = uDst.au64[0];
13098 puDst->au64[1] = uDst.au64[1];
13099}
13100
13101
13102IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13103{
13104 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13105
13106 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13107 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13108 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13109 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13110 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13111 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13112 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13113 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13114
13115 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13116 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13117 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13118 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13119 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13120 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13121 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13122 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13123
13124 puDst->au64[0] = uDst.au64[0];
13125 puDst->au64[1] = uDst.au64[1];
13126 puDst->au64[2] = uDst.au64[2];
13127 puDst->au64[3] = uDst.au64[3];
13128}
13129
13130
13131IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13132{
13133 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13134
13135 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13136 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13137
13138 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13139 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13140
13141 puDst->au64[0] = uDst.au64[0];
13142 puDst->au64[1] = uDst.au64[1];
13143}
13144
13145
13146IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13147{
13148 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13149
13150 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13151 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13152 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13153 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13154
13155 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13156 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13157 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13158 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13159
13160 puDst->au64[0] = uDst.au64[0];
13161 puDst->au64[1] = uDst.au64[1];
13162 puDst->au64[2] = uDst.au64[2];
13163 puDst->au64[3] = uDst.au64[3];
13164}
13165
13166
13167/*
13168 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13169 */
13170IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13171{
13172 RTUINT64U uSrc1 = { *puDst };
13173 RTUINT64U uSrc2 = { *puSrc };
13174 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13175
13176 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13177 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13178 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13179 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13180 *puDst = uDst.u;
13181 RT_NOREF(pFpuState);
13182}
13183
13184
13185IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13186{
13187 RTUINT128U uSrc1 = *puDst;
13188
13189 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13190 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13191 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13192 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13193
13194 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13195 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13196 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13197 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13198 RT_NOREF(pFpuState);
13199}
13200
13201
13202IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13203{
13204 RTUINT64U uSrc1 = { *puDst };
13205 RTUINT64U uSrc2 = { *puSrc };
13206 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13207
13208 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13209 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13210 *puDst = uDst.u;
13211 RT_NOREF(pFpuState);
13212}
13213
13214
13215IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13216{
13217 RTUINT128U uSrc1 = *puDst;
13218
13219 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13220 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13221
13222 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13223 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13224 RT_NOREF(pFpuState);
13225}
13226
13227
13228IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13229{
13230 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13231
13232 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13233 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13234 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13235 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13236
13237 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13238 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13239 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13240 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13241
13242 puDst->au64[0] = uDst.au64[0];
13243 puDst->au64[1] = uDst.au64[1];
13244}
13245
13246
13247IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13248{
13249 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13250
13251 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13252 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13253 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13254 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13255 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13256 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13257 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13258 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13259
13260 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13261 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13262 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13263 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13264 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13265 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13266 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13267 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13268
13269 puDst->au64[0] = uDst.au64[0];
13270 puDst->au64[1] = uDst.au64[1];
13271 puDst->au64[2] = uDst.au64[2];
13272 puDst->au64[3] = uDst.au64[3];
13273}
13274
13275
13276IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13277{
13278 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13279
13280 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13281 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13282
13283 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13284 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13285
13286 puDst->au64[0] = uDst.au64[0];
13287 puDst->au64[1] = uDst.au64[1];
13288}
13289
13290
13291IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13292{
13293 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13294
13295 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13296 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13297 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13298 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13299
13300 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13301 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13302 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13303 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13304
13305 puDst->au64[0] = uDst.au64[0];
13306 puDst->au64[1] = uDst.au64[1];
13307 puDst->au64[2] = uDst.au64[2];
13308 puDst->au64[3] = uDst.au64[3];
13309}
13310
13311
13312/*
13313 * PHADDSW / VPHADDSW
13314 */
13315IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13316{
13317 RTUINT64U uSrc1 = { *puDst };
13318 RTUINT64U uSrc2 = { *puSrc };
13319 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13320
13321 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13322 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13323 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13324 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13325 *puDst = uDst.u;
13326 RT_NOREF(pFpuState);
13327}
13328
13329
13330IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13331{
13332 RTUINT128U uSrc1 = *puDst;
13333
13334 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13335 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13336 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13337 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13338
13339 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13340 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13341 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13342 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13343 RT_NOREF(pFpuState);
13344}
13345
13346
13347IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13348{
13349 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13350
13351 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13352 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13353 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13354 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13355
13356 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13357 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13358 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13359 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13360
13361 puDst->au64[0] = uDst.au64[0];
13362 puDst->au64[1] = uDst.au64[1];
13363}
13364
13365
13366IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13367{
13368 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13369
13370 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13371 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13372 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13373 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13374 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13375 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13376 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13377 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13378
13379 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13380 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13381 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13382 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13383 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13384 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13385 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13386 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13387
13388 puDst->au64[0] = uDst.au64[0];
13389 puDst->au64[1] = uDst.au64[1];
13390 puDst->au64[2] = uDst.au64[2];
13391 puDst->au64[3] = uDst.au64[3];
13392}
13393
13394
13395/*
13396 * PHSUBSW / VPHSUBSW
13397 */
13398IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13399{
13400 RTUINT64U uSrc1 = { *puDst };
13401 RTUINT64U uSrc2 = { *puSrc };
13402 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13403
13404 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13405 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13406 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13407 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13408 *puDst = uDst.u;
13409 RT_NOREF(pFpuState);
13410}
13411
13412
13413IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13414{
13415 RTUINT128U uSrc1 = *puDst;
13416
13417 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13418 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13419 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13420 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13421
13422 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13423 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13424 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13425 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13426 RT_NOREF(pFpuState);
13427}
13428
13429
13430IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13431{
13432 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13433
13434 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13435 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13436 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13437 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13438
13439 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13440 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13441 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13442 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13443
13444 puDst->au64[0] = uDst.au64[0];
13445 puDst->au64[1] = uDst.au64[1];
13446}
13447
13448
13449IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13450{
13451 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13452
13453 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13454 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13455 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13456 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13457 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13458 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13459 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13460 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13461
13462 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13463 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13464 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13465 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13466 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13467 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13468 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13469 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13470
13471 puDst->au64[0] = uDst.au64[0];
13472 puDst->au64[1] = uDst.au64[1];
13473 puDst->au64[2] = uDst.au64[2];
13474 puDst->au64[3] = uDst.au64[3];
13475}
13476
13477
13478/*
13479 * PMADDUBSW / VPMADDUBSW
13480 */
13481IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13482{
13483 RTUINT64U uSrc1 = { *puDst };
13484 RTUINT64U uSrc2 = { *puSrc };
13485 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13486
13487 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13488 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13489 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13490 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13491 *puDst = uDst.u;
13492 RT_NOREF(pFpuState);
13493}
13494
13495
13496IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13497{
13498 RTUINT128U uSrc1 = *puDst;
13499
13500 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13501 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13502 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13503 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13504 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13505 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13506 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13507 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13508 RT_NOREF(pFpuState);
13509}
13510
13511
13512IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13513{
13514 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13515
13516 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13517 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13518 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13519 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13520 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13521 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13522 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13523 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13524
13525 puDst->au64[0] = uDst.au64[0];
13526 puDst->au64[1] = uDst.au64[1];
13527}
13528
13529
13530IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13531{
13532 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13533
13534 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13535 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13536 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13537 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13538 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13539 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13540 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13541 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13542 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13543 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13544 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13545 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13546 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13547 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13548 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13549 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13550
13551 puDst->au64[0] = uDst.au64[0];
13552 puDst->au64[1] = uDst.au64[1];
13553 puDst->au64[2] = uDst.au64[2];
13554 puDst->au64[3] = uDst.au64[3];
13555}
13556
13557
13558/*
13559 * PMULHRSW / VPMULHRSW
13560 */
13561#define DO_PMULHRSW(a_Src1, a_Src2) \
13562 (uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1) >> 1)
13563
13564IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13565{
13566 RTUINT64U uSrc1 = { *puDst };
13567 RTUINT64U uSrc2 = { *puSrc };
13568 RTUINT64U uDst;
13569
13570 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13571 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13572 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13573 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13574 *puDst = uDst.u;
13575 RT_NOREF(pFpuState);
13576}
13577
13578
13579IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13580{
13581 RTUINT128U uSrc1 = *puDst;
13582
13583 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13584 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13585 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13586 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13587 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13588 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13589 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13590 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13591 RT_NOREF(pFpuState);
13592}
13593
13594
13595IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13596{
13597 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13598
13599 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13600 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13601 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13602 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13603 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13604 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13605 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13606 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13607
13608 puDst->au64[0] = uDst.au64[0];
13609 puDst->au64[1] = uDst.au64[1];
13610}
13611
13612
13613IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13614{
13615 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13616
13617 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13618 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13619 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13620 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13621 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13622 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13623 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13624 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13625 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13626 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13627 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13628 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13629 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13630 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13631 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13632 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13633
13634 puDst->au64[0] = uDst.au64[0];
13635 puDst->au64[1] = uDst.au64[1];
13636 puDst->au64[2] = uDst.au64[2];
13637 puDst->au64[3] = uDst.au64[3];
13638}
13639
13640
13641/*
13642 * PSADBW / VPSADBW
13643 */
13644#ifdef IEM_WITHOUT_ASSEMBLY
13645
13646IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13647{
13648 RTUINT64U uSrc1 = { *puDst };
13649 RTUINT64U uSrc2 = { *puSrc };
13650 RTUINT64U uDst;
13651 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13652 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13653 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13654 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13655 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13656 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13657 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13658 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13659
13660 uDst.au64[0] = 0;
13661 uDst.au16[0] = uSum;
13662 *puDst = uDst.u;
13663}
13664
13665
13666IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13667{
13668 RTUINT128U uSrc1 = *puDst;
13669
13670 puDst->au64[0] = 0;
13671 puDst->au64[1] = 0;
13672
13673 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13674 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13675 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13676 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13677 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13678 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13679 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13680 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13681 puDst->au16[0] = uSum;
13682
13683 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13684 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13685 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13686 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13687 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13688 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13689 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13690 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13691 puDst->au16[4] = uSum;
13692}
13693
13694#endif
13695
13696IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13697{
13698 RTUINT128U uSrc1 = *puSrc1;
13699 RTUINT128U uSrc2 = *puSrc2;
13700
13701 puDst->au64[0] = 0;
13702 puDst->au64[1] = 0;
13703
13704 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13705 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13706 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13707 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13708 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13709 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13710 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13711 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13712 puDst->au16[0] = uSum;
13713
13714 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13715 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13716 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13717 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13718 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13719 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13720 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13721 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13722 puDst->au16[4] = uSum;
13723}
13724
13725IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13726{
13727 RTUINT256U uSrc1 = *puSrc1;
13728 RTUINT256U uSrc2 = *puSrc2;
13729
13730 puDst->au64[0] = 0;
13731 puDst->au64[1] = 0;
13732 puDst->au64[2] = 0;
13733 puDst->au64[3] = 0;
13734
13735 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13736 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13737 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13738 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13739 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13740 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13741 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13742 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13743 puDst->au16[0] = uSum;
13744
13745 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13746 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13747 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13748 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13749 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13750 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13751 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13752 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13753 puDst->au16[4] = uSum;
13754
13755 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13756 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13757 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13758 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13759 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13760 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13761 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13762 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13763 puDst->au16[8] = uSum;
13764
13765 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13766 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13767 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13768 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13769 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13770 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13771 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13772 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13773 puDst->au16[12] = uSum;
13774}
13775
13776
13777/*
13778 * PMULDQ / VPMULDQ
13779 */
13780IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13781{
13782 RTUINT128U uSrc1 = *puDst;
13783
13784 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13785 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13786}
13787
13788IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13789{
13790 RTUINT128U uSrc1 = *puSrc1;
13791 RTUINT128U uSrc2 = *puSrc2;
13792
13793 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13794 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13795}
13796
13797IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13798{
13799 RTUINT256U uSrc1 = *puSrc1;
13800 RTUINT256U uSrc2 = *puSrc2;
13801
13802 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13803 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13804 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13805 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13806}
13807
13808
13809/*
13810 * PMULUDQ / VPMULUDQ
13811 */
13812#ifdef IEM_WITHOUT_ASSEMBLY
13813
13814IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13815{
13816 RTUINT64U uSrc1 = { *puDst };
13817 RTUINT64U uSrc2 = { *puSrc };
13818 ASMCompilerBarrier();
13819 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13820 RT_NOREF(pFpuState);
13821}
13822
13823
13824IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13825{
13826 RTUINT128U uSrc1 = *puDst;
13827 RTUINT128U uSrc2 = *puSrc;
13828 ASMCompilerBarrier();
13829 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13830 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13831 RT_NOREF(pFpuState);
13832}
13833
13834#endif
13835
13836IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13837{
13838 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13839 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13840 ASMCompilerBarrier();
13841 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13842 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13843}
13844
13845
13846IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13847{
13848 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13849 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13850 ASMCompilerBarrier();
13851 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13852 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13853 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13854 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13855}
13856
13857
13858/*
13859 * UNPCKLPS / VUNPCKLPS
13860 */
13861#ifdef IEM_WITHOUT_ASSEMBLY
13862IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13863{
13864 RTUINT128U uSrc1 = *puDst;
13865 RTUINT128U uSrc2 = *puSrc;
13866 ASMCompilerBarrier();
13867 puDst->au32[0] = uSrc1.au32[0];
13868 puDst->au32[1] = uSrc2.au32[0];
13869 puDst->au32[2] = uSrc1.au32[1];
13870 puDst->au32[3] = uSrc2.au32[1];
13871}
13872
13873#endif
13874
13875IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13876{
13877 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13878 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13879 ASMCompilerBarrier();
13880 puDst->au32[0] = uSrc1.au32[0];
13881 puDst->au32[1] = uSrc2.au32[0];
13882 puDst->au32[2] = uSrc1.au32[1];
13883 puDst->au32[3] = uSrc2.au32[1];
13884}
13885
13886
13887IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13888{
13889 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13890 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13891 ASMCompilerBarrier();
13892 puDst->au32[0] = uSrc1.au32[0];
13893 puDst->au32[1] = uSrc2.au32[0];
13894 puDst->au32[2] = uSrc1.au32[1];
13895 puDst->au32[3] = uSrc2.au32[1];
13896
13897 puDst->au32[4] = uSrc1.au32[4];
13898 puDst->au32[5] = uSrc2.au32[4];
13899 puDst->au32[6] = uSrc1.au32[5];
13900 puDst->au32[7] = uSrc2.au32[5];
13901}
13902
13903
13904/*
13905 * UNPCKLPD / VUNPCKLPD
13906 */
13907#ifdef IEM_WITHOUT_ASSEMBLY
13908IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13909{
13910 RTUINT128U uSrc1 = *puDst;
13911 RTUINT128U uSrc2 = *puSrc;
13912 ASMCompilerBarrier();
13913 puDst->au64[0] = uSrc1.au64[0];
13914 puDst->au64[1] = uSrc2.au64[0];
13915}
13916
13917#endif
13918
13919IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13920{
13921 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13922 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13923 ASMCompilerBarrier();
13924 puDst->au64[0] = uSrc1.au64[0];
13925 puDst->au64[1] = uSrc2.au64[0];
13926}
13927
13928
13929IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13930{
13931 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13932 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13933 ASMCompilerBarrier();
13934 puDst->au64[0] = uSrc1.au64[0];
13935 puDst->au64[1] = uSrc2.au64[0];
13936 puDst->au64[2] = uSrc1.au64[2];
13937 puDst->au64[3] = uSrc2.au64[2];
13938}
13939
13940
13941/*
13942 * UNPCKHPS / VUNPCKHPS
13943 */
13944#ifdef IEM_WITHOUT_ASSEMBLY
13945IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13946{
13947 RTUINT128U uSrc1 = *puDst;
13948 RTUINT128U uSrc2 = *puSrc;
13949 ASMCompilerBarrier();
13950 puDst->au32[0] = uSrc1.au32[2];
13951 puDst->au32[1] = uSrc2.au32[2];
13952 puDst->au32[2] = uSrc1.au32[3];
13953 puDst->au32[3] = uSrc2.au32[3];
13954}
13955
13956#endif
13957
13958IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13959{
13960 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13961 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13962 ASMCompilerBarrier();
13963 puDst->au32[0] = uSrc1.au32[2];
13964 puDst->au32[1] = uSrc2.au32[2];
13965 puDst->au32[2] = uSrc1.au32[3];
13966 puDst->au32[3] = uSrc2.au32[3];
13967}
13968
13969
13970IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13971{
13972 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13973 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13974 ASMCompilerBarrier();
13975 puDst->au32[0] = uSrc1.au32[2];
13976 puDst->au32[1] = uSrc2.au32[2];
13977 puDst->au32[2] = uSrc1.au32[3];
13978 puDst->au32[3] = uSrc2.au32[3];
13979
13980 puDst->au32[4] = uSrc1.au32[6];
13981 puDst->au32[5] = uSrc2.au32[6];
13982 puDst->au32[6] = uSrc1.au32[7];
13983 puDst->au32[7] = uSrc2.au32[7];
13984}
13985
13986
13987/*
13988 * UNPCKHPD / VUNPCKHPD
13989 */
13990#ifdef IEM_WITHOUT_ASSEMBLY
13991IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13992{
13993 RTUINT128U uSrc1 = *puDst;
13994 RTUINT128U uSrc2 = *puSrc;
13995 ASMCompilerBarrier();
13996 puDst->au64[0] = uSrc1.au64[1];
13997 puDst->au64[1] = uSrc2.au64[1];
13998}
13999
14000#endif
14001
14002IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14003{
14004 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14005 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14006 ASMCompilerBarrier();
14007 puDst->au64[0] = uSrc1.au64[1];
14008 puDst->au64[1] = uSrc2.au64[1];
14009}
14010
14011
14012IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14013{
14014 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14015 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14016 ASMCompilerBarrier();
14017 puDst->au64[0] = uSrc1.au64[1];
14018 puDst->au64[1] = uSrc2.au64[1];
14019 puDst->au64[2] = uSrc1.au64[3];
14020 puDst->au64[3] = uSrc2.au64[3];
14021}
14022
14023
14024/*
14025 * CRC32 (SEE 4.2).
14026 */
14027
14028IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14029{
14030 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14031}
14032
14033
14034IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14035{
14036 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14037}
14038
14039IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14040{
14041 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14042}
14043
14044IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14045{
14046 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14047}
14048
14049
14050/*
14051 * PTEST (SSE 4.1) - special as it output only EFLAGS.
14052 */
14053#ifdef IEM_WITHOUT_ASSEMBLY
14054IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14055{
14056 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14057 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14058 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14059 fEfl |= X86_EFL_ZF;
14060 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14061 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14062 fEfl |= X86_EFL_CF;
14063 *pfEFlags = fEfl;
14064}
14065#endif
14066
14067IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14068{
14069 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14070 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14071 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14072 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14073 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14074 fEfl |= X86_EFL_ZF;
14075 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14076 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14077 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14078 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14079 fEfl |= X86_EFL_CF;
14080 *pfEFlags = fEfl;
14081}
14082
14083
14084/*
14085 * PMOVSXBW / VPMOVSXBW
14086 */
14087IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14088{
14089 RTUINT64U uSrc1 = { uSrc };
14090 puDst->ai16[0] = uSrc1.ai8[0];
14091 puDst->ai16[1] = uSrc1.ai8[1];
14092 puDst->ai16[2] = uSrc1.ai8[2];
14093 puDst->ai16[3] = uSrc1.ai8[3];
14094 puDst->ai16[4] = uSrc1.ai8[4];
14095 puDst->ai16[5] = uSrc1.ai8[5];
14096 puDst->ai16[6] = uSrc1.ai8[6];
14097 puDst->ai16[7] = uSrc1.ai8[7];
14098}
14099
14100
14101IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14102{
14103 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14104 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14105 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14106 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14107 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14108 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14109 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14110 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14111 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14112 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14113 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14114 puDst->ai16[10] = uSrc1.ai8[10];
14115 puDst->ai16[11] = uSrc1.ai8[11];
14116 puDst->ai16[12] = uSrc1.ai8[12];
14117 puDst->ai16[13] = uSrc1.ai8[13];
14118 puDst->ai16[14] = uSrc1.ai8[14];
14119 puDst->ai16[15] = uSrc1.ai8[15];
14120}
14121
14122
14123/*
14124 * PMOVSXBD / VPMOVSXBD
14125 */
14126IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14127{
14128 RTUINT32U uSrc1 = { uSrc };
14129 puDst->ai32[0] = uSrc1.ai8[0];
14130 puDst->ai32[1] = uSrc1.ai8[1];
14131 puDst->ai32[2] = uSrc1.ai8[2];
14132 puDst->ai32[3] = uSrc1.ai8[3];
14133}
14134
14135
14136IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14137{
14138 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14139 puDst->ai32[0] = uSrc1.ai8[0];
14140 puDst->ai32[1] = uSrc1.ai8[1];
14141 puDst->ai32[2] = uSrc1.ai8[2];
14142 puDst->ai32[3] = uSrc1.ai8[3];
14143 puDst->ai32[4] = uSrc1.ai8[4];
14144 puDst->ai32[5] = uSrc1.ai8[5];
14145 puDst->ai32[6] = uSrc1.ai8[6];
14146 puDst->ai32[7] = uSrc1.ai8[7];
14147}
14148
14149
14150/*
14151 * PMOVSXBQ / VPMOVSXBQ
14152 */
14153IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14154{
14155 RTUINT16U uSrc1 = { uSrc };
14156 puDst->ai64[0] = uSrc1.ai8[0];
14157 puDst->ai64[1] = uSrc1.ai8[1];
14158}
14159
14160
14161IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14162{
14163 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14164 puDst->ai64[0] = uSrc1.ai8[0];
14165 puDst->ai64[1] = uSrc1.ai8[1];
14166 puDst->ai64[2] = uSrc1.ai8[2];
14167 puDst->ai64[3] = uSrc1.ai8[3];
14168}
14169
14170
14171/*
14172 * PMOVSXWD / VPMOVSXWD
14173 */
14174IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14175{
14176 RTUINT64U uSrc1 = { uSrc };
14177 puDst->ai32[0] = uSrc1.ai16[0];
14178 puDst->ai32[1] = uSrc1.ai16[1];
14179 puDst->ai32[2] = uSrc1.ai16[2];
14180 puDst->ai32[3] = uSrc1.ai16[3];
14181}
14182
14183
14184IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14185{
14186 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14187 puDst->ai32[0] = uSrc1.ai16[0];
14188 puDst->ai32[1] = uSrc1.ai16[1];
14189 puDst->ai32[2] = uSrc1.ai16[2];
14190 puDst->ai32[3] = uSrc1.ai16[3];
14191 puDst->ai32[4] = uSrc1.ai16[4];
14192 puDst->ai32[5] = uSrc1.ai16[5];
14193 puDst->ai32[6] = uSrc1.ai16[6];
14194 puDst->ai32[7] = uSrc1.ai16[7];
14195}
14196
14197
14198/*
14199 * PMOVSXWQ / VPMOVSXWQ
14200 */
14201IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14202{
14203 RTUINT32U uSrc1 = { uSrc };
14204 puDst->ai64[0] = uSrc1.ai16[0];
14205 puDst->ai64[1] = uSrc1.ai16[1];
14206}
14207
14208
14209IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14210{
14211 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14212 puDst->ai64[0] = uSrc1.ai16[0];
14213 puDst->ai64[1] = uSrc1.ai16[1];
14214 puDst->ai64[2] = uSrc1.ai16[2];
14215 puDst->ai64[3] = uSrc1.ai16[3];
14216}
14217
14218
14219/*
14220 * PMOVSXDQ / VPMOVSXDQ
14221 */
14222IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14223{
14224 RTUINT64U uSrc1 = { uSrc };
14225 puDst->ai64[0] = uSrc1.ai32[0];
14226 puDst->ai64[1] = uSrc1.ai32[1];
14227}
14228
14229
14230IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14231{
14232 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14233 puDst->ai64[0] = uSrc1.ai32[0];
14234 puDst->ai64[1] = uSrc1.ai32[1];
14235 puDst->ai64[2] = uSrc1.ai32[2];
14236 puDst->ai64[3] = uSrc1.ai32[3];
14237}
14238
14239
14240/*
14241 * PMOVZXBW / VPMOVZXBW
14242 */
14243IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14244{
14245 RTUINT64U uSrc1 = { uSrc };
14246 puDst->au16[0] = uSrc1.au8[0];
14247 puDst->au16[1] = uSrc1.au8[1];
14248 puDst->au16[2] = uSrc1.au8[2];
14249 puDst->au16[3] = uSrc1.au8[3];
14250 puDst->au16[4] = uSrc1.au8[4];
14251 puDst->au16[5] = uSrc1.au8[5];
14252 puDst->au16[6] = uSrc1.au8[6];
14253 puDst->au16[7] = uSrc1.au8[7];
14254}
14255
14256
14257IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14258{
14259 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14260 puDst->au16[ 0] = uSrc1.au8[ 0];
14261 puDst->au16[ 1] = uSrc1.au8[ 1];
14262 puDst->au16[ 2] = uSrc1.au8[ 2];
14263 puDst->au16[ 3] = uSrc1.au8[ 3];
14264 puDst->au16[ 4] = uSrc1.au8[ 4];
14265 puDst->au16[ 5] = uSrc1.au8[ 5];
14266 puDst->au16[ 6] = uSrc1.au8[ 6];
14267 puDst->au16[ 7] = uSrc1.au8[ 7];
14268 puDst->au16[ 8] = uSrc1.au8[ 8];
14269 puDst->au16[ 9] = uSrc1.au8[ 9];
14270 puDst->au16[10] = uSrc1.au8[10];
14271 puDst->au16[11] = uSrc1.au8[11];
14272 puDst->au16[12] = uSrc1.au8[12];
14273 puDst->au16[13] = uSrc1.au8[13];
14274 puDst->au16[14] = uSrc1.au8[14];
14275 puDst->au16[15] = uSrc1.au8[15];
14276}
14277
14278
14279/*
14280 * PMOVZXBD / VPMOVZXBD
14281 */
14282IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14283{
14284 RTUINT32U uSrc1 = { uSrc };
14285 puDst->au32[0] = uSrc1.au8[0];
14286 puDst->au32[1] = uSrc1.au8[1];
14287 puDst->au32[2] = uSrc1.au8[2];
14288 puDst->au32[3] = uSrc1.au8[3];
14289}
14290
14291
14292IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14293{
14294 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14295 puDst->au32[0] = uSrc1.au8[0];
14296 puDst->au32[1] = uSrc1.au8[1];
14297 puDst->au32[2] = uSrc1.au8[2];
14298 puDst->au32[3] = uSrc1.au8[3];
14299 puDst->au32[4] = uSrc1.au8[4];
14300 puDst->au32[5] = uSrc1.au8[5];
14301 puDst->au32[6] = uSrc1.au8[6];
14302 puDst->au32[7] = uSrc1.au8[7];
14303}
14304
14305
14306/*
14307 * PMOVZXBQ / VPMOVZXBQ
14308 */
14309IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14310{
14311 RTUINT16U uSrc1 = { uSrc };
14312 puDst->au64[0] = uSrc1.au8[0];
14313 puDst->au64[1] = uSrc1.au8[1];
14314}
14315
14316
14317IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14318{
14319 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14320 puDst->au64[0] = uSrc1.au8[0];
14321 puDst->au64[1] = uSrc1.au8[1];
14322 puDst->au64[2] = uSrc1.au8[2];
14323 puDst->au64[3] = uSrc1.au8[3];
14324}
14325
14326
14327/*
14328 * PMOVZXWD / VPMOVZXWD
14329 */
14330IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14331{
14332 RTUINT64U uSrc1 = { uSrc };
14333 puDst->au32[0] = uSrc1.au16[0];
14334 puDst->au32[1] = uSrc1.au16[1];
14335 puDst->au32[2] = uSrc1.au16[2];
14336 puDst->au32[3] = uSrc1.au16[3];
14337}
14338
14339
14340IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14341{
14342 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14343 puDst->au32[0] = uSrc1.au16[0];
14344 puDst->au32[1] = uSrc1.au16[1];
14345 puDst->au32[2] = uSrc1.au16[2];
14346 puDst->au32[3] = uSrc1.au16[3];
14347 puDst->au32[4] = uSrc1.au16[4];
14348 puDst->au32[5] = uSrc1.au16[5];
14349 puDst->au32[6] = uSrc1.au16[6];
14350 puDst->au32[7] = uSrc1.au16[7];
14351}
14352
14353
14354/*
14355 * PMOVZXWQ / VPMOVZXWQ
14356 */
14357IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14358{
14359 RTUINT32U uSrc1 = { uSrc };
14360 puDst->au64[0] = uSrc1.au16[0];
14361 puDst->au64[1] = uSrc1.au16[1];
14362}
14363
14364
14365IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14366{
14367 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14368 puDst->au64[0] = uSrc1.au16[0];
14369 puDst->au64[1] = uSrc1.au16[1];
14370 puDst->au64[2] = uSrc1.au16[2];
14371 puDst->au64[3] = uSrc1.au16[3];
14372}
14373
14374
14375/*
14376 * PMOVZXDQ / VPMOVZXDQ
14377 */
14378IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14379{
14380 RTUINT64U uSrc1 = { uSrc };
14381 puDst->au64[0] = uSrc1.au32[0];
14382 puDst->au64[1] = uSrc1.au32[1];
14383}
14384
14385
14386IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14387{
14388 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14389 puDst->au64[0] = uSrc1.au32[0];
14390 puDst->au64[1] = uSrc1.au32[1];
14391 puDst->au64[2] = uSrc1.au32[2];
14392 puDst->au64[3] = uSrc1.au32[3];
14393}
14394
14395/**
14396 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14397 * the SoftFloat 32-bit floating point format (float32_t).
14398 *
14399 * This is only a structure format conversion, nothing else.
14400 */
14401DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14402{
14403 float32_t Tmp;
14404 Tmp.v = pr32Val->u;
14405 return Tmp;
14406}
14407
14408
14409/**
14410 * Converts from SoftFloat 32-bit floating point format (float32_t)
14411 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14412 *
14413 * This is only a structure format conversion, nothing else.
14414 */
14415DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14416{
14417 pr32Dst->u = r32XSrc.v;
14418 return pr32Dst;
14419}
14420
14421
14422/**
14423 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14424 * the SoftFloat 64-bit floating point format (float64_t).
14425 *
14426 * This is only a structure format conversion, nothing else.
14427 */
14428DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14429{
14430 float64_t Tmp;
14431 Tmp.v = pr64Val->u;
14432 return Tmp;
14433}
14434
14435
14436/**
14437 * Converts from SoftFloat 64-bit floating point format (float64_t)
14438 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14439 *
14440 * This is only a structure format conversion, nothing else.
14441 */
14442DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14443{
14444 pr64Dst->u = r64XSrc.v;
14445 return pr64Dst;
14446}
14447
14448
14449/** Initializer for the SoftFloat state structure. */
14450# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
14451 { \
14452 softfloat_tininess_afterRounding, \
14453 ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
14454 : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP ? (uint8_t)softfloat_round_max \
14455 : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN ? (uint8_t)softfloat_round_min \
14456 : (uint8_t)softfloat_round_minMag, \
14457 0, \
14458 (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */\
14459 32 /* Rounding precision, not relevant for SIMD. */ \
14460 }
14461
14462#ifdef IEM_WITHOUT_ASSEMBLY
14463
14464/**
14465 * Helper for transfering exception to MXCSR and setting the result value
14466 * accordingly.
14467 *
14468 * @returns Updated MXCSR.
14469 * @param pSoftState The SoftFloat state following the operation.
14470 * @param r32Result The result of the SoftFloat operation.
14471 * @param pr32Result Where to store the result for IEM.
14472 * @param fMxcsr The original MXCSR value.
14473 */
14474DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14475 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14476{
14477 iemFpSoftF32ToIprt(pr32Result, r32Result);
14478
14479 uint8_t fXcpt = pSoftState->exceptionFlags;
14480 if ( (fMxcsr & X86_MXCSR_FZ)
14481 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14482 {
14483 /* Underflow masked and flush to zero is set. */
14484 pr32Result->s.uFraction = 0;
14485 pr32Result->s.uExponent = 0;
14486 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14487 }
14488
14489 /* If DAZ is set \#DE is never set. */
14490 if ( fMxcsr & X86_MXCSR_DAZ
14491 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14492 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14493 fXcpt &= ~X86_MXCSR_DE;
14494
14495 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14496}
14497
14498
14499/**
14500 * Helper for transfering exception to MXCSR and setting the result value
14501 * accordingly - ignores Flush-to-Zero.
14502 *
14503 * @returns Updated MXCSR.
14504 * @param pSoftState The SoftFloat state following the operation.
14505 * @param r32Result The result of the SoftFloat operation.
14506 * @param pr32Result Where to store the result for IEM.
14507 * @param fMxcsr The original MXCSR value.
14508 */
14509DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14510 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14511{
14512 iemFpSoftF32ToIprt(pr32Result, r32Result);
14513
14514 uint8_t fXcpt = pSoftState->exceptionFlags;
14515 /* If DAZ is set \#DE is never set. */
14516 if ( fMxcsr & X86_MXCSR_DAZ
14517 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14518 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14519 fXcpt &= ~X86_MXCSR_DE;
14520
14521 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14522}
14523
14524
14525/**
14526 * Helper for transfering exception to MXCSR and setting the result value
14527 * accordingly.
14528 *
14529 * @returns Updated MXCSR.
14530 * @param pSoftState The SoftFloat state following the operation.
14531 * @param r64Result The result of the SoftFloat operation.
14532 * @param pr64Result Where to store the result for IEM.
14533 * @param fMxcsr The original MXCSR value.
14534 */
14535DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14536 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14537{
14538 iemFpSoftF64ToIprt(pr64Result, r64Result);
14539 uint8_t fXcpt = pSoftState->exceptionFlags;
14540 if ( (fMxcsr & X86_MXCSR_FZ)
14541 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14542 {
14543 /* Underflow masked and flush to zero is set. */
14544 iemFpSoftF64ToIprt(pr64Result, r64Result);
14545 pr64Result->s.uFractionHigh = 0;
14546 pr64Result->s.uFractionLow = 0;
14547 pr64Result->s.uExponent = 0;
14548 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14549 }
14550
14551 /* If DAZ is set \#DE is never set. */
14552 if ( fMxcsr & X86_MXCSR_DAZ
14553 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14554 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14555 fXcpt &= ~X86_MXCSR_DE;
14556
14557 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14558}
14559
14560
14561/**
14562 * Helper for transfering exception to MXCSR and setting the result value
14563 * accordingly - ignores Flush-to-Zero.
14564 *
14565 * @returns Updated MXCSR.
14566 * @param pSoftState The SoftFloat state following the operation.
14567 * @param r64Result The result of the SoftFloat operation.
14568 * @param pr64Result Where to store the result for IEM.
14569 * @param fMxcsr The original MXCSR value.
14570 */
14571DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14572 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14573{
14574 iemFpSoftF64ToIprt(pr64Result, r64Result);
14575
14576 uint8_t fXcpt = pSoftState->exceptionFlags;
14577 /* If DAZ is set \#DE is never set. */
14578 if ( fMxcsr & X86_MXCSR_DAZ
14579 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14580 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14581 fXcpt &= ~X86_MXCSR_DE;
14582
14583 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14584}
14585
14586#endif /* IEM_WITHOUT_ASSEMBLY */
14587
14588
14589/**
14590 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14591 * in MXCSR into account.
14592 *
14593 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14594 * @param pr32Val Where to store the result.
14595 * @param fMxcsr The input MXCSR value.
14596 * @param pr32Src The value to use.
14597 */
14598DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14599{
14600 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14601 {
14602 if (fMxcsr & X86_MXCSR_DAZ)
14603 {
14604 /* De-normals are changed to 0. */
14605 pr32Val->s.fSign = pr32Src->s.fSign;
14606 pr32Val->s.uFraction = 0;
14607 pr32Val->s.uExponent = 0;
14608 return 0;
14609 }
14610
14611 *pr32Val = *pr32Src;
14612 return X86_MXCSR_DE;
14613 }
14614
14615 *pr32Val = *pr32Src;
14616 return 0;
14617}
14618
14619
14620/**
14621 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14622 * in MXCSR into account.
14623 *
14624 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14625 * @param pr64Val Where to store the result.
14626 * @param fMxcsr The input MXCSR value.
14627 * @param pr64Src The value to use.
14628 */
14629DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14630{
14631 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14632 {
14633 if (fMxcsr & X86_MXCSR_DAZ)
14634 {
14635 /* De-normals are changed to 0. */
14636 pr64Val->s64.fSign = pr64Src->s.fSign;
14637 pr64Val->s64.uFraction = 0;
14638 pr64Val->s64.uExponent = 0;
14639 return 0;
14640 }
14641
14642 *pr64Val = *pr64Src;
14643 return X86_MXCSR_DE;
14644 }
14645
14646 *pr64Val = *pr64Src;
14647 return 0;
14648}
14649
14650#ifdef IEM_WITHOUT_ASSEMBLY
14651
14652/**
14653 * Validates the given input operands returning whether the operation can continue or whether one
14654 * of the source operands contains a NaN value, setting the output accordingly.
14655 *
14656 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14657 * @param pr32Res Where to store the result in case the operation can't continue.
14658 * @param pr32Val1 The first input operand.
14659 * @param pr32Val2 The second input operand.
14660 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14661 */
14662DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14663{
14664 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14665 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14666 if (cSNan + cQNan == 2)
14667 {
14668 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14669 *pr32Res = *pr32Val1;
14670 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14671 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14672 return true;
14673 }
14674 if (cSNan)
14675 {
14676 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14677 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14678 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14679 *pfMxcsr |= X86_MXCSR_IE;
14680 return true;
14681 }
14682 if (cQNan)
14683 {
14684 /* The QNan operand is placed into the result. */
14685 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14686 return true;
14687 }
14688
14689 Assert(!cQNan && !cSNan);
14690 return false;
14691}
14692
14693
14694/**
14695 * Validates the given double precision input operands returning whether the operation can continue or whether one
14696 * of the source operands contains a NaN value, setting the output accordingly.
14697 *
14698 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14699 * @param pr64Res Where to store the result in case the operation can't continue.
14700 * @param pr64Val1 The first input operand.
14701 * @param pr64Val2 The second input operand.
14702 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14703 */
14704DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14705{
14706 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14707 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14708 if (cSNan + cQNan == 2)
14709 {
14710 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14711 *pr64Res = *pr64Val1;
14712 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14713 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14714 return true;
14715 }
14716 if (cSNan)
14717 {
14718 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14719 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14720 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14721 *pfMxcsr |= X86_MXCSR_IE;
14722 return true;
14723 }
14724 if (cQNan)
14725 {
14726 /* The QNan operand is placed into the result. */
14727 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14728 return true;
14729 }
14730
14731 Assert(!cQNan && !cSNan);
14732 return false;
14733}
14734
14735
14736/**
14737 * Validates the given single input operand returning whether the operation can continue or whether
14738 * contains a NaN value, setting the output accordingly.
14739 *
14740 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14741 * @param pr32Res Where to store the result in case the operation can't continue.
14742 * @param pr32Val The input operand.
14743 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14744 */
14745DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14746{
14747 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14748 {
14749 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14750 *pr32Res = *pr32Val;
14751 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14752 *pfMxcsr |= X86_MXCSR_IE;
14753 return true;
14754 }
14755 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14756 {
14757 /* The QNan operand is placed into the result. */
14758 *pr32Res = *pr32Val;
14759 return true;
14760 }
14761
14762 return false;
14763}
14764
14765
14766/**
14767 * Validates the given double input operand returning whether the operation can continue or whether
14768 * contains a NaN value, setting the output accordingly.
14769 *
14770 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14771 * @param pr64Res Where to store the result in case the operation can't continue.
14772 * @param pr64Val The input operand.
14773 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14774 */
14775DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14776{
14777 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14778 {
14779 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14780 *pr64Res = *pr64Val;
14781 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14782 *pfMxcsr |= X86_MXCSR_IE;
14783 return true;
14784 }
14785 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14786 {
14787 /* The QNan operand is placed into the result. */
14788 *pr64Res = *pr64Val;
14789 return true;
14790 }
14791
14792 return false;
14793}
14794
14795#endif /* IEM_WITHOUT_ASSEMBLY */
14796
14797/**
14798 * ADDPS
14799 */
14800#ifdef IEM_WITHOUT_ASSEMBLY
14801static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14802{
14803 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14804 return fMxcsr;
14805
14806 RTFLOAT32U r32Src1, r32Src2;
14807 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14808 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14809 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14810 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14811 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14812}
14813
14814
14815IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14816{
14817 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14818 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14819 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14820 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14821}
14822#endif
14823
14824
14825/**
14826 * ADDSS
14827 */
14828#ifdef IEM_WITHOUT_ASSEMBLY
14829IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14830{
14831 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14832 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14833 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14834 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14835}
14836#endif
14837
14838
14839/**
14840 * ADDPD
14841 */
14842#ifdef IEM_WITHOUT_ASSEMBLY
14843static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14844{
14845 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14846 return fMxcsr;
14847
14848 RTFLOAT64U r64Src1, r64Src2;
14849 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14850 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14851 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14852 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14853 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14854}
14855
14856
14857IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14858{
14859 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14860 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14861}
14862#endif
14863
14864
14865/**
14866 * ADDSD
14867 */
14868#ifdef IEM_WITHOUT_ASSEMBLY
14869IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14870{
14871 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14872 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14873}
14874#endif
14875
14876
14877/**
14878 * MULPS
14879 */
14880#ifdef IEM_WITHOUT_ASSEMBLY
14881static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14882{
14883 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14884 return fMxcsr;
14885
14886 RTFLOAT32U r32Src1, r32Src2;
14887 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14888 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14889 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14890 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14891 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14892}
14893
14894
14895IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14896{
14897 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14898 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14899 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14900 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14901}
14902#endif
14903
14904
14905/**
14906 * MULSS
14907 */
14908#ifdef IEM_WITHOUT_ASSEMBLY
14909IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14910{
14911 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14912 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14913 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14914 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14915}
14916#endif
14917
14918
14919/**
14920 * MULPD
14921 */
14922#ifdef IEM_WITHOUT_ASSEMBLY
14923static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14924{
14925 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14926 return fMxcsr;
14927
14928 RTFLOAT64U r64Src1, r64Src2;
14929 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14930 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14931 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14932 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14933 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14934}
14935
14936
14937IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14938{
14939 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14940 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14941}
14942#endif
14943
14944
14945/**
14946 * MULSD
14947 */
14948#ifdef IEM_WITHOUT_ASSEMBLY
14949IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14950{
14951 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14952 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14953}
14954#endif
14955
14956
14957/**
14958 * SUBPS
14959 */
14960#ifdef IEM_WITHOUT_ASSEMBLY
14961static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14962{
14963 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14964 return fMxcsr;
14965
14966 RTFLOAT32U r32Src1, r32Src2;
14967 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14968 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14969 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14970 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14971 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14972}
14973
14974
14975IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14976{
14977 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14978 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14979 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14980 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14981}
14982#endif
14983
14984
14985/**
14986 * SUBSS
14987 */
14988#ifdef IEM_WITHOUT_ASSEMBLY
14989IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14990{
14991 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14992 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14993 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14994 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14995}
14996#endif
14997
14998
14999/**
15000 * SUBPD
15001 */
15002#ifdef IEM_WITHOUT_ASSEMBLY
15003static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15004{
15005 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15006 return fMxcsr;
15007
15008 RTFLOAT64U r64Src1, r64Src2;
15009 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15010 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15011 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15012 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15013 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15014}
15015
15016
15017IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15018{
15019 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15020 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15021}
15022#endif
15023
15024
15025/**
15026 * SUBSD
15027 */
15028#ifdef IEM_WITHOUT_ASSEMBLY
15029IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15030{
15031 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15032 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15033}
15034#endif
15035
15036
15037/**
15038 * MINPS
15039 */
15040#ifdef IEM_WITHOUT_ASSEMBLY
15041static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15042{
15043 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15044 {
15045 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15046 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15047 return fMxcsr | X86_MXCSR_IE;
15048 }
15049
15050 RTFLOAT32U r32Src1, r32Src2;
15051 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15052 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15053 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15054 {
15055 *pr32Res = r32Src2;
15056 return fMxcsr;
15057 }
15058
15059 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15060 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15061 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15062 fLe
15063 ? iemFpSoftF32FromIprt(&r32Src1)
15064 : iemFpSoftF32FromIprt(&r32Src2),
15065 pr32Res, fMxcsr);
15066}
15067
15068
15069IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15070{
15071 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15072 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15073 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15074 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15075}
15076#endif
15077
15078
15079/**
15080 * MINSS
15081 */
15082#ifdef IEM_WITHOUT_ASSEMBLY
15083IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15084{
15085 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15086 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15087 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15088 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15089}
15090#endif
15091
15092
15093/**
15094 * MINPD
15095 */
15096#ifdef IEM_WITHOUT_ASSEMBLY
15097static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15098{
15099 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15100 {
15101 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15102 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15103 return fMxcsr | X86_MXCSR_IE;
15104 }
15105
15106 RTFLOAT64U r64Src1, r64Src2;
15107 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15108 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15109 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15110 {
15111 *pr64Res = r64Src2;
15112 return fMxcsr;
15113 }
15114
15115 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15116 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15117 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15118 fLe
15119 ? iemFpSoftF64FromIprt(&r64Src1)
15120 : iemFpSoftF64FromIprt(&r64Src2),
15121 pr64Res, fMxcsr);
15122}
15123
15124
15125IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15126{
15127 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15128 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15129}
15130#endif
15131
15132
15133/**
15134 * MINSD
15135 */
15136#ifdef IEM_WITHOUT_ASSEMBLY
15137IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15138{
15139 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15140 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15141}
15142#endif
15143
15144
15145/**
15146 * DIVPS
15147 */
15148#ifdef IEM_WITHOUT_ASSEMBLY
15149static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15150{
15151 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15152 return fMxcsr;
15153
15154 RTFLOAT32U r32Src1, r32Src2;
15155 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15156 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15157 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15158 {
15159 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15160 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15161 {
15162 *pr32Res = g_ar32QNaN[1];
15163 return fMxcsr | X86_MXCSR_IE;
15164 }
15165 else if (RTFLOAT32U_IS_INF(&r32Src1))
15166 {
15167 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15168 return fMxcsr;
15169 }
15170 else
15171 {
15172 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15173 return fMxcsr | X86_MXCSR_ZE;
15174 }
15175 }
15176
15177 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15178 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15179 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15180}
15181
15182
15183IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15184{
15185 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15186 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15187 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15188 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15189}
15190#endif
15191
15192
15193/**
15194 * DIVSS
15195 */
15196#ifdef IEM_WITHOUT_ASSEMBLY
15197IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15198{
15199 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15200 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15201 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15202 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15203}
15204#endif
15205
15206
15207/**
15208 * DIVPD
15209 */
15210#ifdef IEM_WITHOUT_ASSEMBLY
15211static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15212{
15213 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15214 return fMxcsr;
15215
15216 RTFLOAT64U r64Src1, r64Src2;
15217 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15218 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15219 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15220 {
15221 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15222 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15223 {
15224 *pr64Res = g_ar64QNaN[1];
15225 return fMxcsr | X86_MXCSR_IE;
15226 }
15227 else if (RTFLOAT64U_IS_INF(&r64Src1))
15228 {
15229 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15230 return fMxcsr;
15231 }
15232 else
15233 {
15234 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15235 return fMxcsr | X86_MXCSR_ZE;
15236 }
15237 }
15238
15239 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15240 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15241 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15242}
15243
15244
15245IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15246{
15247 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15248 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15249}
15250#endif
15251
15252
15253/**
15254 * DIVSD
15255 */
15256#ifdef IEM_WITHOUT_ASSEMBLY
15257IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15258{
15259 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15260 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15261}
15262#endif
15263
15264
15265/**
15266 * MAXPS
15267 */
15268#ifdef IEM_WITHOUT_ASSEMBLY
15269static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15270{
15271 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15272 {
15273 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15274 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15275 return fMxcsr | X86_MXCSR_IE;
15276 }
15277
15278 RTFLOAT32U r32Src1, r32Src2;
15279 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15280 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15281 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15282 {
15283 *pr32Res = r32Src2;
15284 return fMxcsr;
15285 }
15286
15287 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15288 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15289 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15290 fLe
15291 ? iemFpSoftF32FromIprt(&r32Src2)
15292 : iemFpSoftF32FromIprt(&r32Src1),
15293 pr32Res, fMxcsr);
15294}
15295
15296
15297IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15298{
15299 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15300 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15301 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15302 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15303}
15304#endif
15305
15306
15307/**
15308 * MAXSS
15309 */
15310#ifdef IEM_WITHOUT_ASSEMBLY
15311IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15312{
15313 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15314 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15315 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15316 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15317}
15318#endif
15319
15320
15321/**
15322 * MAXPD
15323 */
15324#ifdef IEM_WITHOUT_ASSEMBLY
15325static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15326{
15327 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15328 {
15329 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15330 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15331 return fMxcsr | X86_MXCSR_IE;
15332 }
15333
15334 RTFLOAT64U r64Src1, r64Src2;
15335 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15336 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15337 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15338 {
15339 *pr64Res = r64Src2;
15340 return fMxcsr;
15341 }
15342
15343 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15344 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15345 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15346 fLe
15347 ? iemFpSoftF64FromIprt(&r64Src2)
15348 : iemFpSoftF64FromIprt(&r64Src1),
15349 pr64Res, fMxcsr);
15350}
15351
15352
15353IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15354{
15355 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15356 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15357}
15358#endif
15359
15360
15361/**
15362 * MAXSD
15363 */
15364#ifdef IEM_WITHOUT_ASSEMBLY
15365IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15366{
15367 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15368 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15369}
15370#endif
15371
15372
15373/**
15374 * CVTSS2SD
15375 */
15376#ifdef IEM_WITHOUT_ASSEMBLY
15377static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15378{
15379 RTFLOAT32U r32Src1;
15380 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15381
15382 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15383 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15384 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15385}
15386
15387
15388IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15389{
15390 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15391 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15392}
15393#endif
15394
15395
15396/**
15397 * CVTSD2SS
15398 */
15399#ifdef IEM_WITHOUT_ASSEMBLY
15400static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15401{
15402 RTFLOAT64U r64Src1;
15403 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15404
15405 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15406 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15407 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15408}
15409
15410
15411IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15412{
15413 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15414 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15415 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15416 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15417}
15418#endif
15419
15420
15421/**
15422 * HADDPS
15423 */
15424#ifdef IEM_WITHOUT_ASSEMBLY
15425IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15426{
15427 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15428 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15429 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15430 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15431}
15432#endif
15433
15434
15435/**
15436 * HADDPD
15437 */
15438#ifdef IEM_WITHOUT_ASSEMBLY
15439IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15440{
15441 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15442 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15443}
15444#endif
15445
15446
15447/**
15448 * HSUBPS
15449 */
15450#ifdef IEM_WITHOUT_ASSEMBLY
15451IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15452{
15453 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15454 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15455 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15456 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15457}
15458#endif
15459
15460
15461/**
15462 * HSUBPD
15463 */
15464#ifdef IEM_WITHOUT_ASSEMBLY
15465IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15466{
15467 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15468 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15469}
15470#endif
15471
15472
15473/**
15474 * SQRTPS
15475 */
15476#ifdef IEM_WITHOUT_ASSEMBLY
15477static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15478{
15479 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15480 return fMxcsr;
15481
15482 RTFLOAT32U r32Src;
15483 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15484 if (RTFLOAT32U_IS_ZERO(&r32Src))
15485 {
15486 *pr32Res = r32Src;
15487 return fMxcsr;
15488 }
15489 else if (r32Src.s.fSign)
15490 {
15491 *pr32Res = g_ar32QNaN[1];
15492 return fMxcsr | X86_MXCSR_IE;
15493 }
15494
15495 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15496 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15497 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15498}
15499
15500
15501IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15502{
15503 RT_NOREF(puSrc1);
15504
15505 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15506 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15507 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15508 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15509}
15510#endif
15511
15512
15513/**
15514 * SQRTSS
15515 */
15516#ifdef IEM_WITHOUT_ASSEMBLY
15517IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15518{
15519 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15520 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15521 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15522 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15523}
15524#endif
15525
15526
15527/**
15528 * SQRTPD
15529 */
15530#ifdef IEM_WITHOUT_ASSEMBLY
15531static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15532{
15533 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15534 return fMxcsr;
15535
15536 RTFLOAT64U r64Src;
15537 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15538 if (RTFLOAT64U_IS_ZERO(&r64Src))
15539 {
15540 *pr64Res = r64Src;
15541 return fMxcsr;
15542 }
15543 else if (r64Src.s.fSign)
15544 {
15545 *pr64Res = g_ar64QNaN[1];
15546 return fMxcsr | X86_MXCSR_IE;
15547 }
15548
15549 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15550 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15551 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15552}
15553
15554
15555IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15556{
15557 RT_NOREF(puSrc1);
15558
15559 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15560 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15561}
15562#endif
15563
15564
15565/**
15566 * SQRTSD
15567 */
15568#ifdef IEM_WITHOUT_ASSEMBLY
15569IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15570{
15571 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15572 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15573}
15574#endif
15575
15576
15577#ifdef IEM_WITHOUT_ASSEMBLY
15578/**
15579 * RSQRTPS
15580 */
15581static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15582{
15583 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15584 return fMxcsr;
15585
15586 RTFLOAT32U r32Src;
15587 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
15588 if (RTFLOAT32U_IS_ZERO(&r32Src))
15589 {
15590 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
15591 return fMxcsr;
15592 }
15593 else if (r32Src.s.fSign)
15594 {
15595 *pr32Res = g_ar32QNaN[1];
15596 return fMxcsr | X86_MXCSR_IE;
15597 }
15598
15599 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15600 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15601 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15602}
15603
15604
15605IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15606{
15607 RT_NOREF(puSrc1);
15608
15609 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15610 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15611 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15612 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15613}
15614
15615
15616/**
15617 * RSQRTSS
15618 */
15619IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15620{
15621 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15622 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15623 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15624 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15625}
15626#endif
15627
15628
15629/**
15630 * ADDSUBPS
15631 */
15632#ifdef IEM_WITHOUT_ASSEMBLY
15633IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15634{
15635 RT_NOREF(puSrc1);
15636
15637 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15638 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15639 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15640 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15641}
15642#endif
15643
15644
15645/**
15646 * ADDSUBPD
15647 */
15648#ifdef IEM_WITHOUT_ASSEMBLY
15649IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15650{
15651 RT_NOREF(puSrc1);
15652
15653 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15654 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15655}
15656#endif
15657
15658
15659/**
15660 * CVTPD2PS
15661 */
15662#ifdef IEM_WITHOUT_ASSEMBLY
15663static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15664{
15665 RTFLOAT64U r64Src1;
15666 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15667
15668 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15669 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15670 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15671}
15672
15673
15674IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15675{
15676 RT_NOREF(puSrc1);
15677
15678 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15679 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15680 pResult->uResult.au32[2] = 0;
15681 pResult->uResult.au32[3] = 0;
15682}
15683#endif
15684
15685
15686/**
15687 * CVTPS2PD
15688 */
15689#ifdef IEM_WITHOUT_ASSEMBLY
15690static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15691{
15692 RTFLOAT32U r32Src1;
15693 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15694
15695 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15696 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15697 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15698}
15699
15700
15701IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15702{
15703 RT_NOREF(puSrc1);
15704
15705 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15706 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15707}
15708#endif
15709
15710
15711/**
15712 * CVTDQ2PS
15713 */
15714#ifdef IEM_WITHOUT_ASSEMBLY
15715static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15716{
15717 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15718 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15719 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15720}
15721
15722
15723IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15724{
15725 RT_NOREF(puSrc1);
15726
15727 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15728 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15729 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
15730 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
15731}
15732#endif
15733
15734
15735/**
15736 * CVTPS2DQ
15737 */
15738#ifdef IEM_WITHOUT_ASSEMBLY
15739static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15740{
15741 RTFLOAT32U r32Src;
15742 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15743
15744 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15745 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15746 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15747}
15748
15749
15750IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15751{
15752 RT_NOREF(puSrc1);
15753
15754 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15755 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15756 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15757 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15758}
15759#endif
15760
15761
15762/**
15763 * CVTTPS2DQ
15764 */
15765#ifdef IEM_WITHOUT_ASSEMBLY
15766static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15767{
15768 RTFLOAT32U r32Src;
15769 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15770
15771 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15772 SoftState.roundingMode = softfloat_round_minMag;
15773 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
15774 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15775}
15776
15777
15778IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15779{
15780 RT_NOREF(puSrc1);
15781
15782 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15783 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15784 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15785 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15786}
15787#endif
15788
15789
15790/**
15791 * CVTTPD2DQ
15792 */
15793#ifdef IEM_WITHOUT_ASSEMBLY
15794static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15795{
15796 RTFLOAT64U r64Src;
15797 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15798
15799 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15800 SoftState.roundingMode = softfloat_round_minMag;
15801 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15802 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15803}
15804
15805
15806IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15807{
15808 RT_NOREF(puSrc1);
15809
15810 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15811 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15812 pResult->uResult.au64[1] = 0;
15813}
15814#endif
15815
15816
15817/**
15818 * CVTDQ2PD
15819 */
15820#ifdef IEM_WITHOUT_ASSEMBLY
15821static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
15822{
15823 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15824 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
15825 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15826}
15827
15828
15829IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15830{
15831 RT_NOREF(puSrc1);
15832
15833 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15834 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15835}
15836#endif
15837
15838
15839/**
15840 * CVTPD2DQ
15841 */
15842#ifdef IEM_WITHOUT_ASSEMBLY
15843static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15844{
15845 RTFLOAT64U r64Src;
15846 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15847
15848 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15849 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15850 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15851}
15852
15853
15854IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15855{
15856 RT_NOREF(puSrc1);
15857
15858 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15859 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15860 pResult->uResult.au64[1] = 0;
15861}
15862#endif
15863
15864
15865/**
15866 * [V]SHUFPS
15867 */
15868#ifdef IEM_WITHOUT_ASSEMBLY
15869IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15870{
15871 RTUINT128U const uSrc1 = *puDst;
15872 RTUINT128U const uSrc2 = *puSrc;
15873 ASMCompilerBarrier();
15874 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15875 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15876 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15877 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15878}
15879#endif
15880
15881
15882IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15883{
15884 RTUINT128U const uSrc1 = *puSrc1;
15885 RTUINT128U const uSrc2 = *puSrc2;
15886 ASMCompilerBarrier();
15887 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15888 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15889 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15890 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15891}
15892
15893
15894IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15895{
15896 RTUINT256U const uSrc1 = *puSrc1;
15897 RTUINT256U const uSrc2 = *puSrc2;
15898 ASMCompilerBarrier();
15899 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15900 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15901 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15902 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15903
15904 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15905 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15906 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15907 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
15908}
15909
15910
15911/**
15912 * [V]SHUFPD
15913 */
15914#ifdef IEM_WITHOUT_ASSEMBLY
15915IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15916{
15917 RTUINT128U const uSrc1 = *puDst;
15918 RTUINT128U const uSrc2 = *puSrc;
15919 ASMCompilerBarrier();
15920 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15921 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15922}
15923#endif
15924
15925
15926IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15927{
15928 RTUINT128U const uSrc1 = *puSrc1;
15929 RTUINT128U const uSrc2 = *puSrc2;
15930 ASMCompilerBarrier();
15931 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15932 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15933}
15934
15935
15936IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15937{
15938 RTUINT256U const uSrc1 = *puSrc1;
15939 RTUINT256U const uSrc2 = *puSrc2;
15940 ASMCompilerBarrier();
15941 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15942 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15943 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
15944 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
15945}
15946
15947
15948/*
15949 * PHMINPOSUW / VPHMINPOSUW
15950 */
15951IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15952{
15953 uint16_t u16Min = puSrc->au16[0];
15954 uint8_t idxMin = 0;
15955
15956 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
15957 if (puSrc->au16[i] < u16Min)
15958 {
15959 u16Min = puSrc->au16[i];
15960 idxMin = i;
15961 }
15962
15963 puDst->au64[0] = 0;
15964 puDst->au64[1] = 0;
15965 puDst->au16[0] = u16Min;
15966 puDst->au16[1] = idxMin;
15967}
15968
15969
15970IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15971{
15972 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
15973}
15974
15975
15976/*
15977 * [V]PBLENDVB
15978 */
15979IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15980{
15981 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15982 if (puMask->au8[i] & RT_BIT(7))
15983 puDst->au8[i] = puSrc->au8[i];
15984}
15985
15986
15987IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15988{
15989 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15990 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15991}
15992
15993
15994IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15995{
15996 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15997 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15998}
15999
16000
16001/*
16002 * [V]BLENDVPS
16003 */
16004IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16005{
16006 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16007 if (puMask->au32[i] & RT_BIT_32(31))
16008 puDst->au32[i] = puSrc->au32[i];
16009}
16010
16011
16012IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16013{
16014 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16015 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16016}
16017
16018
16019IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16020{
16021 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16022 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16023}
16024
16025
16026/*
16027 * [V]BLENDVPD
16028 */
16029IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16030{
16031 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
16032 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
16033}
16034
16035
16036IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16037{
16038 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16039 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16040}
16041
16042
16043IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16044{
16045 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16046 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16047}
16048
16049
16050/**
16051 * [V]PALIGNR
16052 */
16053IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
16054{
16055 uint64_t const u64Src1 = *pu64Dst;
16056 ASMCompilerBarrier();
16057
16058 if (bEvil >= 16)
16059 *pu64Dst = 0;
16060 else if (bEvil >= 8)
16061 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16062 else
16063 {
16064 uint8_t cShift = bEvil * 8;
16065 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16066 | (u64Src2 >> cShift);
16067 }
16068}
16069
16070
16071IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16072{
16073 RTUINT128U const uSrc1 = *puDst;
16074 RTUINT128U const uSrc2 = *puSrc;
16075 ASMCompilerBarrier();
16076
16077 puDst->au64[0] = 0;
16078 puDst->au64[1] = 0;
16079 if (bEvil >= 32)
16080 { /* Everything stays 0. */ }
16081 else if (bEvil >= 16)
16082 {
16083 bEvil -= 16;
16084 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16085 puDst->au8[i - bEvil] = uSrc1.au8[i];
16086 }
16087 else
16088 {
16089 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16090 puDst->au8[i] = uSrc2.au8[i + bEvil];
16091 for (uint8_t i = 0; i < bEvil; i++)
16092 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16093 }
16094}
16095
16096
16097IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16098{
16099 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16100 RTUINT128U const uSrc2 = *puSrc2;
16101 ASMCompilerBarrier();
16102
16103 puDst->au64[0] = 0;
16104 puDst->au64[1] = 0;
16105 if (bEvil >= 32)
16106 { /* Everything stays 0. */ }
16107 else if (bEvil >= 16)
16108 {
16109 bEvil -= 16;
16110 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16111 puDst->au8[i - bEvil] = uSrc1.au8[i];
16112 }
16113 else
16114 {
16115 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16116 puDst->au8[i] = uSrc2.au8[i + bEvil];
16117 for (uint8_t i = 0; i < bEvil; i++)
16118 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16119 }
16120}
16121
16122
16123IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16124{
16125 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16126 RTUINT256U const uSrc2 = *puSrc2;
16127 ASMCompilerBarrier();
16128
16129 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16130 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16131}
16132
16133
16134/**
16135 * [V]PBLENDW
16136 */
16137IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16138{
16139 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16140 if (bEvil & RT_BIT(i))
16141 puDst->au16[i] = puSrc->au16[i];
16142}
16143
16144
16145IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16146{
16147 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16148 if (bEvil & RT_BIT(i))
16149 puDst->au16[i] = puSrc2->au16[i];
16150 else
16151 puDst->au16[i] = puSrc1->au16[i];
16152}
16153
16154
16155IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16156{
16157 for (uint8_t i = 0; i < 8; i++)
16158 if (bEvil & RT_BIT(i))
16159 {
16160 puDst->au16[ i] = puSrc2->au16[ i];
16161 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16162 }
16163 else
16164 {
16165 puDst->au16[ i] = puSrc1->au16[ i];
16166 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16167 }
16168}
16169
16170
16171/**
16172 * [V]BLENDPS
16173 */
16174IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16175{
16176 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16177 if (bEvil & RT_BIT(i))
16178 puDst->au32[i] = puSrc->au32[i];
16179}
16180
16181
16182IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16183{
16184 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16185 if (bEvil & RT_BIT(i))
16186 puDst->au32[i] = puSrc2->au32[i];
16187 else
16188 puDst->au32[i] = puSrc1->au32[i];
16189}
16190
16191
16192IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16193{
16194 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16195 if (bEvil & RT_BIT(i))
16196 puDst->au32[i] = puSrc2->au32[i];
16197 else
16198 puDst->au32[i] = puSrc1->au32[i];
16199}
16200
16201
16202/**
16203 * [V]BLENDPD
16204 */
16205IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16206{
16207 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16208 if (bEvil & RT_BIT(i))
16209 puDst->au64[i] = puSrc->au64[i];
16210}
16211
16212
16213IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16214{
16215 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16216 if (bEvil & RT_BIT(i))
16217 puDst->au64[i] = puSrc2->au64[i];
16218 else
16219 puDst->au64[i] = puSrc1->au64[i];
16220}
16221
16222
16223IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16224{
16225 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16226 if (bEvil & RT_BIT(i))
16227 puDst->au64[i] = puSrc2->au64[i];
16228 else
16229 puDst->au64[i] = puSrc1->au64[i];
16230}
16231
16232
16233/**
16234 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16235 */
16236
16237static uint8_t iemAImpl_aes_sbox[] = {
16238 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
16239 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
16240 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
16241 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
16242 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
16243 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
16244 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
16245 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
16246 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
16247 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
16248 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
16249 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
16250 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
16251 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
16252 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
16253 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
16254};
16255
16256/* The InvS-Box lookup table. */
16257static uint8_t iemAImpl_aes_inv_sbox[] = {
16258 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
16259 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
16260 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
16261 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
16262 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
16263 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
16264 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
16265 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
16266 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
16267 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
16268 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
16269 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
16270 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
16271 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
16272 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
16273 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
16274};
16275
16276/* The ShiftRows lookup table. */
16277static uint8_t iemAImpl_aes_shift_rows_tbl[] = {
16278 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
16279};
16280
16281/* The InvShiftRows lookup table. */
16282static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] = {
16283 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
16284};
16285
16286static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16287{
16288 RTUINT128U uVal;
16289 int i;
16290
16291 for (i = 0; i < 16; ++i)
16292 uVal.au8[i] = abSubst[puSrc->au8[i]];
16293
16294 return uVal;
16295}
16296
16297static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
16298{
16299 return (u << 1) ^ (((u >> 7) & 1) * 27);
16300}
16301
16302static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16303{
16304 RTUINT128U uVal;
16305 int i;
16306 uint8_t tmp;
16307
16308 for (i = 0; i < 16; i += 4) {
16309 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16310 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16311 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16312 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16313 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16314 }
16315
16316 return uVal;
16317}
16318
16319static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16320{
16321 RTUINT128U uVal;
16322 int i;
16323
16324 for (i = 0; i < 16; ++i)
16325 uVal.au8[i] = puSrc->au8[abShift[i]];
16326
16327 return uVal;
16328}
16329
16330static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16331{
16332 uint8_t val;
16333
16334 val = ((b >> 0) & 1) * a;
16335 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16336 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16337 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16338 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16339
16340 return val;
16341}
16342
16343static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16344{
16345 RTUINT128U uVal;
16346 int i;
16347
16348 for (i = 0; i < 16; i += 4) {
16349 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16350 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16351 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16352 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16353 }
16354
16355 return uVal;
16356}
16357
16358static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16359{
16360 RTUINT32U uTmp;
16361
16362 uTmp.au32[0] = w;
16363 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16364 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16365 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16366 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16367
16368 return uTmp.au32[0];
16369}
16370
16371static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
16372{
16373 return (w << 24) | (w >> 8);
16374}
16375
16376/**
16377 * [V]AESKEYGENASSIST
16378 */
16379IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16380{
16381 RTUINT128U uTmp;
16382 uint32_t uRCon = bImm; /* Round constant. */
16383
16384 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16385 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16386 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16387 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16388
16389 *puDst = uTmp;
16390}
16391
16392
16393/**
16394 * [V]AESIMC
16395 */
16396IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16397{
16398 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16399}
16400
16401
16402/**
16403 * [V]AESENC
16404 */
16405IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16406{
16407 RTUINT128U uTmp;
16408
16409 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16410 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16411 uTmp = iemAImpl_aes_mix_col(&uTmp);
16412 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16413 uTmp.au64[1] ^= puSrc->au64[1];
16414
16415 *puDst = uTmp;
16416}
16417
16418
16419/**
16420 * [V]AESENCLAST
16421 */
16422IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16423{
16424 RTUINT128U uTmp;
16425
16426 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16427 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16428 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16429 uTmp.au64[1] ^= puSrc->au64[1];
16430
16431 *puDst = uTmp;
16432}
16433
16434
16435/**
16436 * [V]AESDEC
16437 */
16438IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16439{
16440 RTUINT128U uTmp;
16441
16442 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16443 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16444 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16445 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16446 uTmp.au64[1] ^= puSrc->au64[1];
16447
16448 *puDst = uTmp;
16449}
16450
16451
16452/**
16453 * [V]AESDECLAST
16454 */
16455IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16456{
16457 RTUINT128U uTmp;
16458
16459 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16460 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16461 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16462 uTmp.au64[1] ^= puSrc->au64[1];
16463
16464 *puDst = uTmp;
16465}
16466
16467
16468/**
16469 * [V]PCMPISTRI
16470 */
16471
16472/**
16473 * Does the comparisons based on the mode and source input format.
16474 */
16475static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
16476{
16477#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
16478 do \
16479 { \
16480 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
16481 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
16482 { \
16483 switch (a_bAggOp) \
16484 { \
16485 case 0: \
16486 case 2: \
16487 case 3: \
16488 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16489 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16490 break; \
16491 case 1: \
16492 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16493 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16494 break; \
16495 default: \
16496 AssertReleaseFailed(); \
16497 } \
16498 } \
16499 } while(0)
16500
16501 uint8_t bAggOp = (bImm >> 2) & 0x3;
16502 switch (bImm & 0x3)
16503 {
16504 case 0:
16505 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
16506 break;
16507 case 1:
16508 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
16509 break;
16510 case 2:
16511 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
16512 break;
16513 case 3:
16514 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
16515 break;
16516 default:
16517 AssertReleaseFailed();
16518 }
16519#undef PCMPXSTRX_CMP_CASE
16520}
16521
16522static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
16523{
16524 if (bImm & 0x1)
16525 {
16526 /* Words -> 8 elements. */
16527 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
16528 if (puSrc->au16[i] == 0)
16529 return i;
16530
16531 return 8;
16532 }
16533 else
16534 {
16535 /* Bytes -> 16 elements. */
16536 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
16537 if (puSrc->au8[i] == 0)
16538 return i;
16539
16540 return 16;
16541 }
16542}
16543
16544static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
16545{
16546 if (bImm & 0x1)
16547 {
16548 if (i64Len > -8 && i64Len < 8)
16549 return RT_ABS(i64Len);
16550
16551 return 8;
16552 }
16553 else
16554 {
16555 if (i64Len > -16 && i64Len < 16)
16556 return RT_ABS(i64Len);
16557
16558 return 16;
16559 }
16560}
16561
16562/**
16563 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
16564 */
16565static const bool g_afCmpOverride[4][3] =
16566{
16567 /* xmm1 AND xmm2/m128 invalid xmm1 invalid, xmm2/m128 valid xmm1 valid, xmm2/m128 invalid */
16568 { false, false, false }, /* Imm8[3:2] = 00b (equal any) */
16569 { false, false, false }, /* Imm8[3:2] = 01b (ranges) */
16570 { true, false, false }, /* Imm8[3:2] = 10b (equal each) */
16571 { true, true, false }, /* Imm8[3:2] = 11b (equal ordered) */
16572};
16573
16574DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
16575{
16576 if (fSrc1Valid && fSrc2Valid)
16577 return fCmpRes;
16578
16579 uint8_t bSrc1Valid = fSrc1Valid ? 2 : 0;
16580 uint8_t bSrc2Valid = fSrc2Valid ? 1 : 0;
16581 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
16582}
16583
16584static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
16585{
16586 uint8_t bAggOp = (bImm >> 2) & 0x3;
16587 uint16_t u16Result = 0;
16588
16589 switch (bAggOp)
16590 {
16591 case 0: /* Equal any */
16592 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16593 {
16594 uint16_t u16Res = 0;
16595 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
16596 {
16597 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16598 idxSrc1 < idxLen1,
16599 idxSrc2 < idxLen2,
16600 bAggOp))
16601 {
16602 u16Res = RT_BIT(idxSrc2);
16603 break;
16604 }
16605 }
16606
16607 u16Result |= u16Res;
16608 }
16609 break;
16610
16611 case 1: /* Ranges */
16612 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16613 {
16614 uint16_t u16Res = 0;
16615 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
16616 {
16617 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16618 idxSrc1 < idxLen1,
16619 idxSrc2 < idxLen2,
16620 bAggOp)
16621 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
16622 (idxSrc1 + 1) < idxLen1,
16623 idxSrc2 < idxLen2,
16624 bAggOp))
16625 {
16626 u16Res = RT_BIT(idxSrc2);
16627 break;
16628 }
16629 }
16630
16631 u16Result |= u16Res;
16632 }
16633 break;
16634
16635 case 2: /* Equal each */
16636 for (uint8_t i = 0; i < cElems; i++)
16637 {
16638 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
16639 i < idxLen1,
16640 i < idxLen2,
16641 bAggOp))
16642 u16Result |= RT_BIT(i);
16643 }
16644 break;
16645
16646 case 3: /* Equal ordered */
16647 u16Result = 0;
16648 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16649 {
16650 uint16_t u16Res = RT_BIT(idxSrc2);
16651 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
16652 {
16653 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
16654 idxSrc1 < idxLen1,
16655 k < idxLen2,
16656 bAggOp))
16657 {
16658 u16Res = 0;
16659 break;
16660 }
16661 }
16662
16663 u16Result |= u16Res;
16664 }
16665 break;
16666 }
16667
16668 /* Polarity selection. */
16669 switch ((bImm >> 4) & 0x3)
16670 {
16671 case 0:
16672 case 2:
16673 /* Nothing to do. */
16674 break;
16675 case 1:
16676 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
16677 break;
16678 case 3:
16679 u16Result ^= RT_BIT(idxLen2) - 1;
16680 break;
16681 default:
16682 AssertReleaseFailed();
16683 }
16684
16685 return u16Result;
16686}
16687
16688DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
16689{
16690 uint32_t fEFlags = 0;
16691
16692 if (u16Result)
16693 fEFlags |= X86_EFL_CF;
16694 if (cLen2 < cElems)
16695 fEFlags |= X86_EFL_ZF;
16696 if (cLen1 < cElems)
16697 fEFlags |= X86_EFL_SF;
16698 if (u16Result & 0x1)
16699 fEFlags |= X86_EFL_OF;
16700 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
16701}
16702
16703DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
16704 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
16705{
16706 bool afCmpRes[16][16];
16707 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16708
16709 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
16710 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
16711 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
16712
16713 return u16Result;
16714}
16715
16716DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
16717{
16718 if (bImm & RT_BIT(6))
16719 {
16720 /* Index for MSB set. */
16721 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
16722 if (idxMsb)
16723 *pu32Ecx = idxMsb - 1;
16724 else
16725 *pu32Ecx = cElems;
16726 }
16727 else
16728 {
16729 /* Index for LSB set. */
16730 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
16731 if (idxLsb)
16732 *pu32Ecx = idxLsb - 1;
16733 else
16734 *pu32Ecx = cElems;
16735 }
16736}
16737
16738IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
16739{
16740 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16741 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
16742 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
16743
16744 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16745 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
16746}
16747
16748
16749/**
16750 * [V]PCMPESTRI
16751 */
16752IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
16753{
16754 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16755 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
16756 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
16757
16758 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16759 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
16760}
16761
16762
16763/**
16764 * [V]PCMPISTRM
16765 */
16766DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
16767{
16768 if (bImm & RT_BIT(6))
16769 {
16770 /* Generate a mask. */
16771 if (cElems == 8)
16772 {
16773 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16774 if (u16Result & RT_BIT(i))
16775 puDst->au16[i] = 0xffff;
16776 else
16777 puDst->au16[i] = 0;
16778 }
16779 else
16780 {
16781 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16782 if (u16Result & RT_BIT(i))
16783 puDst->au8[i] = 0xff;
16784 else
16785 puDst->au8[i] = 0;
16786 }
16787 }
16788 else
16789 {
16790 /* Store the result. */
16791 puDst->au64[0] = u16Result;
16792 puDst->au64[1] = 0;
16793 }
16794}
16795
16796IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
16797{
16798 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16799 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
16800 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
16801
16802 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16803 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
16804}
16805
16806
16807/**
16808 * [V]PCMPESTRM
16809 */
16810IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
16811{
16812 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16813 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
16814 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
16815
16816 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
16817 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
16818}
16819
16820
16821/*
16822 * [V]PCLMULQDQ
16823 */
16824IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16825{
16826 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
16827}
16828
16829
16830IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16831{
16832 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
16833 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
16834
16835 puDst->au64[0] = 0;
16836 puDst->au64[1] = 0;
16837
16838 /*
16839 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
16840 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
16841 * and squeeze out some optimizations.
16842 */
16843 if (uSrc1 & 0x1)
16844 puDst->au64[0] = uSrc2;
16845
16846 uSrc1 >>= 1;
16847
16848 uint8_t iDigit = 1;
16849 while (uSrc1)
16850 {
16851 if (uSrc1 & 0x1)
16852 {
16853 puDst->au64[0] ^= (uSrc2 << iDigit);
16854 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
16855 }
16856
16857 uSrc1 >>= 1;
16858 iDigit++;
16859 }
16860}
16861
16862
16863/**
16864 * [V]PINSRW
16865 */
16866#ifdef IEM_WITHOUT_ASSEMBLY
16867IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
16868{
16869 uint8_t cShift = (bEvil & 0x3) * 16;
16870 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
16871}
16872
16873
16874IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
16875{
16876 puDst->au16[bEvil & 0x7] = u16Src;
16877}
16878#endif
16879
16880
16881IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
16882{
16883 *puDst = *puSrc;
16884 puDst->au16[bEvil & 0x7] = u16Src;
16885}
16886
16887
16888/**
16889 * [V]PEXTRW
16890 */
16891#ifdef IEM_WITHOUT_ASSEMBLY
16892IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
16893{
16894 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
16895}
16896
16897
16898IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16899{
16900 *pu16Dst = puSrc->au16[bEvil & 0x7];
16901}
16902
16903#endif
16904
16905IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16906{
16907 *pu16Dst = puSrc->au16[bEvil & 0x7];
16908}
16909
16910
16911/**
16912 * [V]MOVMSKPS
16913 */
16914#ifdef IEM_WITHOUT_ASSEMBLY
16915IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16916{
16917 *pu8Dst = puSrc->au32[0] >> 31;
16918 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16919 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16920 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16921}
16922
16923#endif
16924
16925IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16926{
16927 *pu8Dst = puSrc->au32[0] >> 31;
16928 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16929 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16930 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16931}
16932
16933
16934IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16935{
16936 *pu8Dst = puSrc->au32[0] >> 31;
16937 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16938 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16939 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16940 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
16941 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
16942 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
16943 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
16944}
16945
16946
16947/**
16948 * [V]MOVMSKPD
16949 */
16950#ifdef IEM_WITHOUT_ASSEMBLY
16951IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16952{
16953 *pu8Dst = puSrc->au64[0] >> 63;
16954 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16955}
16956
16957#endif
16958
16959IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16960{
16961 *pu8Dst = puSrc->au64[0] >> 63;
16962 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16963}
16964
16965
16966IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16967{
16968 *pu8Dst = puSrc->au64[0] >> 63;
16969 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16970 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
16971 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
16972}
16973
16974
16975/**
16976 * CVTTSD2SI
16977 */
16978#ifdef IEM_WITHOUT_ASSEMBLY
16979IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16980{
16981 RTFLOAT64U r64Src;
16982
16983 r64Src.u = *pu64Src;
16984 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16985
16986 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16987 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16988 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16989}
16990
16991
16992IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16993{
16994 RTFLOAT64U r64Src;
16995
16996 r64Src.u = *pu64Src;
16997 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16998
16999 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17000 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17001 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17002}
17003#endif
17004
17005
17006/**
17007 * CVTSD2SI
17008 */
17009#ifdef IEM_WITHOUT_ASSEMBLY
17010IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17011{
17012 RTFLOAT64U r64Src;
17013
17014 r64Src.u = *pu64Src;
17015 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17016
17017 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17018 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17019 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17020}
17021
17022
17023IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17024{
17025 RTFLOAT64U r64Src;
17026
17027 r64Src.u = *pu64Src;
17028 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17029
17030 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17031 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17032 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17033}
17034#endif
17035
17036
17037/**
17038 * CVTTSS2SI
17039 */
17040#ifdef IEM_WITHOUT_ASSEMBLY
17041IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17042{
17043 RTFLOAT32U r32Src;
17044
17045 r32Src.u = *pu32Src;
17046 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17047
17048 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17049 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17050 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17051}
17052
17053
17054IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17055{
17056 RTFLOAT32U r32Src;
17057
17058 r32Src.u = *pu32Src;
17059 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17060
17061 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17062 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17063 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17064}
17065#endif
17066
17067
17068/**
17069 * CVTSS2SI
17070 */
17071#ifdef IEM_WITHOUT_ASSEMBLY
17072IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17073{
17074 RTFLOAT32U r32Src;
17075
17076 r32Src.u = *pu32Src;
17077 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17078
17079 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17080 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17081 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17082}
17083
17084
17085IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17086{
17087 RTFLOAT32U r32Src;
17088
17089 r32Src.u = *pu32Src;
17090 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17091
17092 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17093 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17094 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17095}
17096#endif
17097
17098
17099/**
17100 * CVTSI2SD
17101 */
17102#ifdef IEM_WITHOUT_ASSEMBLY
17103IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
17104{
17105 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17106 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
17107 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17108}
17109
17110
17111IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
17112{
17113 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17114 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
17115 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17116}
17117#endif
17118
17119
17120/**
17121 * CVTSI2SS
17122 */
17123#ifdef IEM_WITHOUT_ASSEMBLY
17124IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
17125{
17126 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17127 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
17128 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17129}
17130
17131
17132IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
17133{
17134 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17135 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
17136 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17137}
17138#endif
17139
17140
17141/**
17142 * [V]UCOMISS
17143 */
17144#ifdef IEM_WITHOUT_ASSEMBLY
17145IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17146{
17147 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17148
17149 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
17150 {
17151 *pfMxcsr |= X86_MXCSR_IE;
17152 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17153 }
17154 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17155 {
17156 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17157 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17158 }
17159 else
17160 {
17161 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17162
17163 RTFLOAT32U r32Src1, r32Src2;
17164 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17165 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17166
17167 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17168 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17169 if (f32_eq(f32Src1, f32Src2, &SoftState))
17170 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17171 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17172 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17173 /* else: GREATER_THAN 000 */
17174
17175 *pfMxcsr |= fDe;
17176 }
17177
17178 *pfEFlags = fEFlagsNew;
17179}
17180#endif
17181
17182IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17183{
17184 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17185}
17186
17187
17188/**
17189 * [V]UCOMISD
17190 */
17191#ifdef IEM_WITHOUT_ASSEMBLY
17192IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17193{
17194 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17195
17196 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
17197 {
17198 *pfMxcsr |= X86_MXCSR_IE;
17199 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17200 }
17201 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17202 {
17203 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17204 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17205 }
17206 else
17207 {
17208 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17209
17210 RTFLOAT64U r64Src1, r64Src2;
17211 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17212 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17213
17214 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17215 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17216 if (f64_eq(f64Src1, f64Src2, &SoftState))
17217 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17218 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17219 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17220 /* else: GREATER_THAN 000 */
17221
17222 *pfMxcsr |= fDe;
17223 }
17224
17225 *pfEFlags = fEFlagsNew;
17226}
17227#endif
17228
17229IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17230{
17231 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17232}
17233
17234
17235/**
17236 * [V]COMISS
17237 */
17238#ifdef IEM_WITHOUT_ASSEMBLY
17239IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17240{
17241 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17242
17243 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
17244 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17245 {
17246 *pfMxcsr |= X86_MXCSR_IE;
17247 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17248 }
17249 else
17250 {
17251 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17252
17253 RTFLOAT32U r32Src1, r32Src2;
17254 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17255 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17256
17257 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17258 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17259 if (f32_eq(f32Src1, f32Src2, &SoftState))
17260 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17261 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17262 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17263 /* else: GREATER_THAN 000 */
17264
17265 *pfMxcsr |= fDe;
17266 }
17267
17268 *pfEFlags = fEFlagsNew;
17269}
17270#endif
17271
17272
17273IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17274{
17275 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17276}
17277
17278
17279/**
17280 * [V]COMISD
17281 */
17282#ifdef IEM_WITHOUT_ASSEMBLY
17283IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17284{
17285 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17286
17287 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
17288 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17289 {
17290 *pfMxcsr |= X86_MXCSR_IE;
17291 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17292 }
17293 else
17294 {
17295 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17296
17297 RTFLOAT64U r64Src1, r64Src2;
17298 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17299 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17300
17301 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17302 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17303 if (f64_eq(f64Src1, f64Src2, &SoftState))
17304 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17305 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17306 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17307 /* else: GREATER_THAN 000 */
17308
17309 *pfMxcsr |= fDe;
17310 }
17311
17312 *pfEFlags = fEFlagsNew;
17313}
17314#endif
17315
17316IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17317{
17318 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17319}
17320
17321
17322/**
17323 * CMPPS / CMPPD / CMPSS / CMPSD
17324 */
17325#ifdef IEM_WITHOUT_ASSEMBLY
17326/**
17327 * A compare truth table entry.
17328 */
17329typedef struct CMPTRUTHTBLENTRY
17330{
17331 /** Flag whether the \#IA is signalled when one of the source oeprans is a QNaN */
17332 bool fSignalsOnQNan;
17333 /** The boolean result when the input operands are unordered. */
17334 bool fUnordered;
17335 /** The boolean result when A = B. */
17336 bool fEqual;
17337 /** The boolean result when A < B. */
17338 bool fLowerThan;
17339 /** The boolean result when A > B. */
17340 bool fGreaterThan;
17341} CMPTRUTHTBLENTRY;
17342/** Pointer to a const truth table entry. */
17343typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
17344
17345
17346/** The compare truth table (indexed by immediate). */
17347static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
17348{
17349 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
17350 /* 00H (EQ_OQ) */ { false, false, true, false, false },
17351 /* 01H (LT_OS) */ { true, false, false, true, false },
17352 /* 02H (LE_OS) */ { true, false, true, true, false },
17353 /* 03H (UNORD_Q) */ { false, true, false, false, false },
17354 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
17355 /* 05H (NLT_US) */ { true, true, true, false, true },
17356 /* 06H (NLE_US) */ { true, true, false, false, true },
17357 /* 07H (ORQ_Q) */ { false, false, true, true, true },
17358 /** @todo AVX variants. */
17359};
17360
17361
17362static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
17363{
17364 bool fRes;
17365 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17366
17367 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
17368 {
17369 *pfMxcsr |= X86_MXCSR_IE;
17370 fRes = g_aCmpTbl[bEvil].fUnordered;
17371 }
17372 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
17373 {
17374 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17375 *pfMxcsr |= X86_MXCSR_IE;
17376 fRes = g_aCmpTbl[bEvil].fUnordered;
17377 }
17378 else
17379 {
17380 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17381
17382 RTFLOAT32U r32Src1, r32Src2;
17383 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
17384 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
17385
17386 *pfMxcsr |= fDe;
17387 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17388 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17389 if (f32_eq(f32Src1, f32Src2, &SoftState))
17390 fRes = g_aCmpTbl[bEvil].fEqual;
17391 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17392 fRes = g_aCmpTbl[bEvil].fLowerThan;
17393 else
17394 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17395 }
17396
17397 return fRes;
17398}
17399
17400
17401static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
17402{
17403 bool fRes;
17404 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17405
17406 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
17407 {
17408 *pfMxcsr |= X86_MXCSR_IE;
17409 fRes = g_aCmpTbl[bEvil].fUnordered;
17410 }
17411 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
17412 {
17413 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17414 *pfMxcsr |= X86_MXCSR_IE;
17415 fRes = g_aCmpTbl[bEvil].fUnordered;
17416 }
17417 else
17418 {
17419 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17420
17421 RTFLOAT64U r64Src1, r64Src2;
17422 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
17423 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17424
17425 *pfMxcsr |= fDe;
17426 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17427 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17428 if (f64_eq(f64Src1, f64Src2, &SoftState))
17429 fRes = g_aCmpTbl[bEvil].fEqual;
17430 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17431 fRes = g_aCmpTbl[bEvil].fLowerThan;
17432 else
17433 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17434 }
17435
17436 return fRes;
17437}
17438
17439
17440IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17441{
17442 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17443 {
17444 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
17445 puDst->au32[i] = UINT32_MAX;
17446 else
17447 puDst->au32[i] = 0;
17448 }
17449}
17450
17451
17452IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17453{
17454 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17455 {
17456 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
17457 puDst->au64[i] = UINT64_MAX;
17458 else
17459 puDst->au64[i] = 0;
17460 }
17461}
17462
17463
17464IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17465{
17466 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17467 puDst->au32[0] = UINT32_MAX;
17468 else
17469 puDst->au32[0] = 0;
17470
17471 puDst->au32[1] = pSrc->uSrc1.au32[1];
17472 puDst->au64[1] = pSrc->uSrc1.au64[1];
17473}
17474
17475
17476IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17477{
17478 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17479 puDst->au64[0] = UINT64_MAX;
17480 else
17481 puDst->au64[0] = 0;
17482
17483 puDst->au64[1] = pSrc->uSrc1.au64[1];
17484}
17485#endif
17486
17487
17488/**
17489 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
17490 */
17491
17492#define X86_SSE_ROUNDXX_IMM_RC_MASK UINT8_C(0x03)
17493#define X86_SSE_ROUNDXX_IMM_ROUND_SEL UINT8_C(0x04)
17494#define X86_SSE_ROUNDXX_IMM_PRECISION UINT8_C(0x08)
17495
17496#define X86_SSE_ROUNDXX_IMM_MASK UINT8_C(0x0F)
17497
17498DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
17499{
17500 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
17501 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17502
17503 fMxcsr &= ~X86_MXCSR_RC_MASK;
17504 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
17505 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17506}
17507
17508static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
17509{
17510 RTFLOAT32U r32Src, r32Dst;
17511 float32_t f32Src;
17512 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17513 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17514
17515 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
17516 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
17517
17518 iemFpSoftF32ToIprt(&r32Dst, f32Src);
17519 return r32Dst;
17520}
17521
17522static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
17523{
17524 RTFLOAT64U r64Src, r64Dst;
17525 float64_t f64Src;
17526 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17527 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17528
17529 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
17530 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
17531
17532 iemFpSoftF64ToIprt(&r64Dst, f64Src);
17533 return r64Dst;
17534}
17535
17536#ifdef IEM_WITHOUT_ASSEMBLY
17537IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17538{
17539 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17540 puDst->au32[1] = pSrc->uSrc1.au32[1];
17541 puDst->au64[1] = pSrc->uSrc1.au64[1];
17542}
17543
17544
17545IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17546{
17547 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17548 puDst->au64[1] = pSrc->uSrc1.au64[1];
17549}
17550#endif
17551
17552IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17553{
17554 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17555 {
17556 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17557 }
17558}
17559
17560
17561IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17562{
17563 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17564 {
17565 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17566 }
17567}
17568
17569/**
17570 * CVTPD2PI
17571 */
17572#ifdef IEM_WITHOUT_ASSEMBLY
17573static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17574{
17575 RTFLOAT64U r64Src;
17576 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17577
17578 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17579 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17580 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17581}
17582
17583
17584IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17585{
17586 RTUINT64U u64Res;
17587 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17588 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17589
17590 *pu64Dst = u64Res.u;
17591 *pfMxcsr = fMxcsrOut;
17592}
17593#endif
17594
17595
17596/**
17597 * CVTTPD2PI
17598 */
17599#ifdef IEM_WITHOUT_ASSEMBLY
17600static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17601{
17602 RTFLOAT64U r64Src;
17603 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17604
17605 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17606 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17607 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17608}
17609
17610
17611IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17612{
17613 RTUINT64U u64Res;
17614 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17615 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17616
17617 *pu64Dst = u64Res.u;
17618 *pfMxcsr = fMxcsrOut;
17619}
17620#endif
17621
17622
17623/**
17624 * CVTPI2PS
17625 */
17626#ifdef IEM_WITHOUT_ASSEMBLY
17627static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17628{
17629 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17630 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
17631 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
17632}
17633
17634
17635IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17636{
17637 RTUINT64U uSrc = { u64Src };
17638 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
17639 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
17640 *pfMxcsr = fMxcsrOut;
17641}
17642#endif
17643
17644
17645/**
17646 * CVTPI2PD
17647 */
17648#ifdef IEM_WITHOUT_ASSEMBLY
17649static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
17650{
17651 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17652 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
17653 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
17654}
17655
17656
17657IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17658{
17659 RTUINT64U uSrc = { u64Src };
17660 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
17661 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
17662 *pfMxcsr = fMxcsrOut;
17663}
17664#endif
17665
17666
17667/**
17668 * CVTPS2PI
17669 */
17670#ifdef IEM_WITHOUT_ASSEMBLY
17671static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17672{
17673 RTFLOAT32U r32Src;
17674 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17675
17676 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17677 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17678 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17679}
17680
17681
17682IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17683{
17684 RTUINT64U uDst;
17685 RTUINT64U uSrc = { u64Src };
17686 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17687 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17688 *pu64Dst = uDst.u;
17689 *pfMxcsr = fMxcsrOut;
17690}
17691#endif
17692
17693
17694/**
17695 * CVTTPS2PI
17696 */
17697#ifdef IEM_WITHOUT_ASSEMBLY
17698static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17699{
17700 RTFLOAT32U r32Src;
17701 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17702
17703 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17704 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17705 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17706}
17707
17708
17709IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17710{
17711 RTUINT64U uDst;
17712 RTUINT64U uSrc = { u64Src };
17713 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17714 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17715 *pu64Dst = uDst.u;
17716 *pfMxcsr = fMxcsrOut;
17717}
17718#endif
17719
17720/**
17721 * RDRAND
17722 */
17723IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
17724{
17725 *puDst = 0;
17726 *pEFlags &= ~X86_EFL_STATUS_BITS;
17727 *pEFlags |= X86_EFL_CF;
17728}
17729
17730IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
17731{
17732 *puDst = 0;
17733 *pEFlags &= ~X86_EFL_STATUS_BITS;
17734 *pEFlags |= X86_EFL_CF;
17735}
17736
17737IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
17738{
17739 *puDst = 0;
17740 *pEFlags &= ~X86_EFL_STATUS_BITS;
17741 *pEFlags |= X86_EFL_CF;
17742}
17743
17744/**
17745 * RDSEED
17746 */
17747IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
17748{
17749 *puDst = 0;
17750 *pEFlags &= ~X86_EFL_STATUS_BITS;
17751 *pEFlags |= X86_EFL_CF;
17752}
17753
17754IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
17755{
17756 *puDst = 0;
17757 *pEFlags &= ~X86_EFL_STATUS_BITS;
17758 *pEFlags |= X86_EFL_CF;
17759}
17760
17761IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
17762{
17763 *puDst = 0;
17764 *pEFlags &= ~X86_EFL_STATUS_BITS;
17765 *pEFlags |= X86_EFL_CF;
17766}
17767
17768
17769/**
17770 * SHA1NEXTE
17771 */
17772IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17773{
17774 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
17775
17776 puDst->au32[0] = puSrc->au32[0];
17777 puDst->au32[1] = puSrc->au32[1];
17778 puDst->au32[2] = puSrc->au32[2];
17779 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
17780}
17781
17782/**
17783 * SHA1MSG1
17784 */
17785IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17786{
17787 uint32_t u32W0 = puDst->au32[3];
17788 uint32_t u32W1 = puDst->au32[2];
17789 uint32_t u32W2 = puDst->au32[1];
17790 uint32_t u32W3 = puDst->au32[0];
17791 uint32_t u32W4 = puSrc->au32[3];
17792 uint32_t u32W5 = puSrc->au32[2];
17793
17794 puDst->au32[3] = u32W2 ^ u32W0;
17795 puDst->au32[2] = u32W3 ^ u32W1;
17796 puDst->au32[1] = u32W4 ^ u32W2;
17797 puDst->au32[0] = u32W5 ^ u32W3;
17798}
17799
17800/**
17801 * SHA1MSG2
17802 */
17803IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17804{
17805 uint32_t u32W13 = puSrc->au32[2];
17806 uint32_t u32W14 = puSrc->au32[1];
17807 uint32_t u32W15 = puSrc->au32[0];
17808 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
17809 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
17810 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
17811 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
17812
17813 puDst->au32[3] = u32W16;
17814 puDst->au32[2] = u32W17;
17815 puDst->au32[1] = u32W18;
17816 puDst->au32[0] = u32W19;
17817}
17818
17819/**
17820 * SHA1RNDS4
17821 */
17822typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
17823typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
17824
17825static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D)
17826{
17827 return (u32B & u32C) ^ (~u32B & u32D);
17828}
17829
17830static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D)
17831{
17832 return u32B ^ u32C ^ u32D;
17833}
17834
17835static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D)
17836{
17837 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
17838}
17839
17840static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D)
17841{
17842 return u32B ^ u32C ^ u32D;
17843}
17844
17845IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17846{
17847 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
17848 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
17849
17850 uint32_t au32A[5];
17851 uint32_t au32B[5];
17852 uint32_t au32C[5];
17853 uint32_t au32D[5];
17854 uint32_t au32E[5];
17855 uint32_t au32W[4];
17856 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
17857 uint32_t u32K = s_au32K[bEvil & 0x3];
17858
17859 au32A[0] = puDst->au32[3];
17860 au32B[0] = puDst->au32[2];
17861 au32C[0] = puDst->au32[1];
17862 au32D[0] = puDst->au32[0];
17863 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
17864 au32W[i] = puSrc->au32[3 - i];
17865
17866 /* Round 0 is a bit different than the other rounds. */
17867 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
17868 au32B[1] = au32A[0];
17869 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
17870 au32D[1] = au32C[0];
17871 au32E[1] = au32D[0];
17872
17873 for (uint32_t i = 1; i <= 3; i++)
17874 {
17875 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
17876 au32B[i + 1] = au32A[i];
17877 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
17878 au32D[i + 1] = au32C[i];
17879 au32E[i + 1] = au32D[i];
17880 }
17881
17882 puDst->au32[3] = au32A[4];
17883 puDst->au32[2] = au32B[4];
17884 puDst->au32[1] = au32C[4];
17885 puDst->au32[0] = au32D[4];
17886}
17887
17888
17889/**
17890 * SHA256MSG1
17891 */
17892DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
17893{
17894 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
17895}
17896
17897IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17898{
17899 uint32_t u32W4 = puSrc->au32[0];
17900 uint32_t u32W3 = puDst->au32[3];
17901 uint32_t u32W2 = puDst->au32[2];
17902 uint32_t u32W1 = puDst->au32[1];
17903 uint32_t u32W0 = puDst->au32[0];
17904
17905 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
17906 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
17907 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
17908 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
17909}
17910
17911/**
17912 * SHA256MSG2
17913 */
17914DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
17915{
17916 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
17917}
17918
17919IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17920{
17921 uint32_t u32W14 = puSrc->au32[2];
17922 uint32_t u32W15 = puSrc->au32[3];
17923 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
17924 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
17925 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
17926 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
17927
17928 puDst->au32[3] = u32W19;
17929 puDst->au32[2] = u32W18;
17930 puDst->au32[1] = u32W17;
17931 puDst->au32[0] = u32W16;
17932}
17933
17934/**
17935 * SHA256RNDS2
17936 */
17937DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
17938{
17939 return (u32X & u32Y) ^ (~u32X & u32Z);
17940}
17941
17942DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
17943{
17944 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
17945}
17946
17947DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
17948{
17949 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
17950}
17951
17952DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
17953{
17954 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
17955}
17956
17957IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
17958{
17959 uint32_t au32A[3];
17960 uint32_t au32B[3];
17961 uint32_t au32C[3];
17962 uint32_t au32D[3];
17963 uint32_t au32E[3];
17964 uint32_t au32F[3];
17965 uint32_t au32G[3];
17966 uint32_t au32H[3];
17967 uint32_t au32WK[2];
17968
17969 au32A[0] = puSrc->au32[3];
17970 au32B[0] = puSrc->au32[2];
17971 au32C[0] = puDst->au32[3];
17972 au32D[0] = puDst->au32[2];
17973 au32E[0] = puSrc->au32[1];
17974 au32F[0] = puSrc->au32[0];
17975 au32G[0] = puDst->au32[1];
17976 au32H[0] = puDst->au32[0];
17977
17978 au32WK[0] = puXmm0Constants->au32[0];
17979 au32WK[1] = puXmm0Constants->au32[1];
17980
17981 for (uint32_t i = 0; i < 2; i++)
17982 {
17983 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
17984 + iemAImpl_sha256_upper_sigma1(au32E[i])
17985 + au32WK[i]
17986 + au32H[i]
17987 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
17988 + iemAImpl_sha256_upper_sigma0(au32A[i]);
17989 au32B[i + 1] = au32A[i];
17990 au32C[i + 1] = au32B[i];
17991 au32D[i + 1] = au32C[i];
17992 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
17993 + iemAImpl_sha256_upper_sigma1(au32E[i])
17994 + au32WK[i]
17995 + au32H[i]
17996 + au32D[i];
17997 au32F[i + 1] = au32E[i];
17998 au32G[i + 1] = au32F[i];
17999 au32H[i + 1] = au32G[i];
18000 }
18001
18002 puDst->au32[3] = au32A[2];
18003 puDst->au32[2] = au32B[2];
18004 puDst->au32[1] = au32E[2];
18005 puDst->au32[0] = au32F[2];
18006}
18007
18008
18009/**
18010 * ADCX
18011 */
18012#define ADX_EMIT(a_Flag, a_Type, a_Max) \
18013 do \
18014 { \
18015 bool f = RT_BOOL(*pfEFlags & (a_Flag)); \
18016 a_Type uTmp = *puDst + uSrc; \
18017 if (uTmp < uSrc) \
18018 *pfEFlags |= (a_Flag); \
18019 else \
18020 *pfEFlags &= ~(a_Flag); \
18021 if ( uTmp == a_Max \
18022 && f) \
18023 *pfEFlags |= (a_Flag); \
18024 if (f) \
18025 uTmp++; \
18026 *puDst = uTmp; \
18027 } \
18028 while (0)
18029
18030IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18031{
18032 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
18033}
18034
18035IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18036{
18037 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
18038}
18039
18040
18041/**
18042 * ADOX
18043 */
18044IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18045{
18046 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
18047}
18048
18049IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18050{
18051 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
18052}
18053
18054
18055/**
18056 * MPSADBW
18057 */
18058IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18059{
18060 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18061 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18062 int16_t ai16Src1[11];
18063 int16_t ai16Src2[4];
18064
18065 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18066 ai16Src1[i] = puDst->au8[idxSrc1 + i];
18067
18068 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18069 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
18070
18071 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18072 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18073 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18074 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18075 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18076}
18077
18078
18079/**
18080 * DPPS
18081 */
18082IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18083{
18084 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18085 AssertReleaseFailed();
18086}
18087
18088
18089/**
18090 * DPPD
18091 */
18092IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18093{
18094 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18095 AssertReleaseFailed();
18096}
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
ContactPrivacy policyTerms of Use