VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp

Last change on this file was 104439, checked in by vboxsync, 3 weeks ago

VMM/IEM: Implement maskmovq, [v]maskmovdqu instruction decoding, dispatch & emulation, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 730.0 KB
Line 
1/* $Id: IEMAllAImplC.cpp 104439 2024-04-26 10:30:18Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defines it, it is switched on automatically for
 * ARM32 and ARM64 targets and when doxygen is running.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif

/* For tstIEMAImplAsm purposes IEM_WITH_ASSEMBLY takes precedence over
   IEM_WITHOUT_ASSEMBLY, no matter how the latter got defined. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag (SF) value given a result and its bit width.
 *
 * SF duplicates the most significant bit of the result, so the result is
 * shifted right until that bit sits at the SF position and then masked.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) )
73
/**
 * Calculates the zero flag (ZF) value given a result.
 *
 * ZF tells whether the result is zero: the logical negation of the result
 * (1 for zero, 0 otherwise) is moved to the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( ((uint32_t)!(a_uResult)) << X86_EFL_ZF_BIT )
84
/**
 * Calculates the parity flag (PF) from the low byte of a result.
 *
 * Two implementations exist: a lookup in g_afParity and an XOR-folding
 * helper.  The '|| 1' in the condition below currently forces the table
 * lookup on every architecture; the helper is kept for future profiling.
 *
 * @returns X86_EFL_PF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#if !defined(RT_ARCH_ARM64) || 1 /** @todo profile this... micro benching in tstIEMAImpl indicates no gain, but it may be skewed. */
# define IEM_EFL_CALC_PARITY(a_uResult) (g_afParity[(a_uResult) & 0xff])
#else
# define IEM_EFL_CALC_PARITY(a_uResult) iemAImplCalcParity(a_uResult)
DECL_FORCE_INLINE(uint32_t) iemAImplCalcParity(uint32_t uResult)
{
    /* Emulate an 8-bit pop count by folding the byte onto its lowest bit.
       This translates to 4 EOR instructions on ARM64 as they can shift the
       2nd source operand. */
    uint8_t bFolded = (uint8_t)(uResult ^ (uResult >> 4));
    bFolded ^= bFolded >> 2;
    bFolded ^= bFolded >> 1;

    /* Bit 0 is now set for an odd number of one bits; PF wants the inverse. */
    return (uint32_t)((bFolded ^ 1) & 1) << X86_EFL_PF_BIT;
}
#endif
106
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * These are typically used by concatenating the macro name with a bit count.
 * A separate macro per width is needed because the sign bit of an 8-bit value
 * sits below the OF bit and must be shifted left, whereas for the wider types
 * it sits above and must be shifted right.
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        Value whose most significant bit (for the given
 *                          width) carries the overflow indication.
 */
#define X86_EFL_GET_OF_8(a_uValue) \
    ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) )
#define X86_EFL_GET_OF_16(a_uValue) \
    ( X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) )
#define X86_EFL_GET_OF_32(a_uValue) \
    ( X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) )
#define X86_EFL_GET_OF_64(a_uValue) \
    ( X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) )
117
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro: it modifies @a a_fEFlagsVar in place and yields
 * no value.  The arguments are expanded several times, so they must be free
 * of side effects.
 *
 * @param   a_fEFlagsVar    The 32-bit EFLAGS variable to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF+OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be
 *                          a literal as it is token-pasted into type and
 *                          macro names.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_fEFlagsVar, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        /* Drop the stale status bits, then merge in CF, PF, AF, ZF and SF. */ \
        a_fEFlagsVar &= ~X86_EFL_STATUS_BITS; \
        a_fEFlagsVar |=   ((a_CfExpr) << X86_EFL_CF_BIT) \
                        | IEM_EFL_CALC_PARITY(a_uResult) \
                        | (((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF) \
                        | X86_EFL_CALC_ZF(a_uResult) \
                        | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit must differ from the first \
           input.  That is why callers pass a separate a_uSrcOf. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        a_fEFlagsVar |= X86_EFL_GET_OF_ ## a_cBitsWidth( \
                            (   ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                              & RT_BIT_64(a_cBitsWidth - 1)) \
                          & ((a_uResult) ^ (a_uDst)) ); \
    } while (0)
152
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We clear AF, as that seems to make the most sense and also
 * seems to be the correct behavior on current CPUs.  All three end up zero
 * simply because every status bit is cleared first and only PF, ZF, SF and
 * the caller supplied extras are merged back in.
 *
 * This is a statement macro: it modifies @a a_fEFlagsVar in place and yields
 * no value.
 *
 * @param   a_fEFlagsVar    The 32-bit EFLAGS variable to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_fEFlagsVar, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        a_fEFlagsVar &= ~X86_EFL_STATUS_BITS; \
        a_fEFlagsVar |=   IEM_EFL_CALC_PARITY(a_uResult) \
                        | X86_EFL_CALC_ZF(a_uResult) \
                        | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                        | (a_fExtra); \
    } while (0)
174
175
176/*********************************************************************************************************************************
177* Global Variables *
178*********************************************************************************************************************************/
179/**
180 * Parity calculation table.
181 *
182 * This is also used by iemAllAImpl.asm.
183 *
184 * The generator code:
185 * @code
186 * #include <stdio.h>
187 *
188 * int main()
189 * {
190 * unsigned b;
191 * for (b = 0; b < 256; b++)
192 * {
193 * int cOnes = ( b & 1)
194 * + ((b >> 1) & 1)
195 * + ((b >> 2) & 1)
196 * + ((b >> 3) & 1)
197 * + ((b >> 4) & 1)
198 * + ((b >> 5) & 1)
199 * + ((b >> 6) & 1)
200 * + ((b >> 7) & 1);
201 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
202 * b,
203 * (b >> 7) & 1,
204 * (b >> 6) & 1,
205 * (b >> 5) & 1,
206 * (b >> 4) & 1,
207 * (b >> 3) & 1,
208 * (b >> 2) & 1,
209 * (b >> 1) & 1,
210 * b & 1,
211 * cOnes & 1 ? "0" : "X86_EFL_PF");
212 * }
213 * return 0;
214 * }
215 * @endcode
216 */
217uint8_t const g_afParity[256] =
218{
219 /* 0000 = 00000000b */ X86_EFL_PF,
220 /* 0x01 = 00000001b */ 0,
221 /* 0x02 = 00000010b */ 0,
222 /* 0x03 = 00000011b */ X86_EFL_PF,
223 /* 0x04 = 00000100b */ 0,
224 /* 0x05 = 00000101b */ X86_EFL_PF,
225 /* 0x06 = 00000110b */ X86_EFL_PF,
226 /* 0x07 = 00000111b */ 0,
227 /* 0x08 = 00001000b */ 0,
228 /* 0x09 = 00001001b */ X86_EFL_PF,
229 /* 0x0a = 00001010b */ X86_EFL_PF,
230 /* 0x0b = 00001011b */ 0,
231 /* 0x0c = 00001100b */ X86_EFL_PF,
232 /* 0x0d = 00001101b */ 0,
233 /* 0x0e = 00001110b */ 0,
234 /* 0x0f = 00001111b */ X86_EFL_PF,
235 /* 0x10 = 00010000b */ 0,
236 /* 0x11 = 00010001b */ X86_EFL_PF,
237 /* 0x12 = 00010010b */ X86_EFL_PF,
238 /* 0x13 = 00010011b */ 0,
239 /* 0x14 = 00010100b */ X86_EFL_PF,
240 /* 0x15 = 00010101b */ 0,
241 /* 0x16 = 00010110b */ 0,
242 /* 0x17 = 00010111b */ X86_EFL_PF,
243 /* 0x18 = 00011000b */ X86_EFL_PF,
244 /* 0x19 = 00011001b */ 0,
245 /* 0x1a = 00011010b */ 0,
246 /* 0x1b = 00011011b */ X86_EFL_PF,
247 /* 0x1c = 00011100b */ 0,
248 /* 0x1d = 00011101b */ X86_EFL_PF,
249 /* 0x1e = 00011110b */ X86_EFL_PF,
250 /* 0x1f = 00011111b */ 0,
251 /* 0x20 = 00100000b */ 0,
252 /* 0x21 = 00100001b */ X86_EFL_PF,
253 /* 0x22 = 00100010b */ X86_EFL_PF,
254 /* 0x23 = 00100011b */ 0,
255 /* 0x24 = 00100100b */ X86_EFL_PF,
256 /* 0x25 = 00100101b */ 0,
257 /* 0x26 = 00100110b */ 0,
258 /* 0x27 = 00100111b */ X86_EFL_PF,
259 /* 0x28 = 00101000b */ X86_EFL_PF,
260 /* 0x29 = 00101001b */ 0,
261 /* 0x2a = 00101010b */ 0,
262 /* 0x2b = 00101011b */ X86_EFL_PF,
263 /* 0x2c = 00101100b */ 0,
264 /* 0x2d = 00101101b */ X86_EFL_PF,
265 /* 0x2e = 00101110b */ X86_EFL_PF,
266 /* 0x2f = 00101111b */ 0,
267 /* 0x30 = 00110000b */ X86_EFL_PF,
268 /* 0x31 = 00110001b */ 0,
269 /* 0x32 = 00110010b */ 0,
270 /* 0x33 = 00110011b */ X86_EFL_PF,
271 /* 0x34 = 00110100b */ 0,
272 /* 0x35 = 00110101b */ X86_EFL_PF,
273 /* 0x36 = 00110110b */ X86_EFL_PF,
274 /* 0x37 = 00110111b */ 0,
275 /* 0x38 = 00111000b */ 0,
276 /* 0x39 = 00111001b */ X86_EFL_PF,
277 /* 0x3a = 00111010b */ X86_EFL_PF,
278 /* 0x3b = 00111011b */ 0,
279 /* 0x3c = 00111100b */ X86_EFL_PF,
280 /* 0x3d = 00111101b */ 0,
281 /* 0x3e = 00111110b */ 0,
282 /* 0x3f = 00111111b */ X86_EFL_PF,
283 /* 0x40 = 01000000b */ 0,
284 /* 0x41 = 01000001b */ X86_EFL_PF,
285 /* 0x42 = 01000010b */ X86_EFL_PF,
286 /* 0x43 = 01000011b */ 0,
287 /* 0x44 = 01000100b */ X86_EFL_PF,
288 /* 0x45 = 01000101b */ 0,
289 /* 0x46 = 01000110b */ 0,
290 /* 0x47 = 01000111b */ X86_EFL_PF,
291 /* 0x48 = 01001000b */ X86_EFL_PF,
292 /* 0x49 = 01001001b */ 0,
293 /* 0x4a = 01001010b */ 0,
294 /* 0x4b = 01001011b */ X86_EFL_PF,
295 /* 0x4c = 01001100b */ 0,
296 /* 0x4d = 01001101b */ X86_EFL_PF,
297 /* 0x4e = 01001110b */ X86_EFL_PF,
298 /* 0x4f = 01001111b */ 0,
299 /* 0x50 = 01010000b */ X86_EFL_PF,
300 /* 0x51 = 01010001b */ 0,
301 /* 0x52 = 01010010b */ 0,
302 /* 0x53 = 01010011b */ X86_EFL_PF,
303 /* 0x54 = 01010100b */ 0,
304 /* 0x55 = 01010101b */ X86_EFL_PF,
305 /* 0x56 = 01010110b */ X86_EFL_PF,
306 /* 0x57 = 01010111b */ 0,
307 /* 0x58 = 01011000b */ 0,
308 /* 0x59 = 01011001b */ X86_EFL_PF,
309 /* 0x5a = 01011010b */ X86_EFL_PF,
310 /* 0x5b = 01011011b */ 0,
311 /* 0x5c = 01011100b */ X86_EFL_PF,
312 /* 0x5d = 01011101b */ 0,
313 /* 0x5e = 01011110b */ 0,
314 /* 0x5f = 01011111b */ X86_EFL_PF,
315 /* 0x60 = 01100000b */ X86_EFL_PF,
316 /* 0x61 = 01100001b */ 0,
317 /* 0x62 = 01100010b */ 0,
318 /* 0x63 = 01100011b */ X86_EFL_PF,
319 /* 0x64 = 01100100b */ 0,
320 /* 0x65 = 01100101b */ X86_EFL_PF,
321 /* 0x66 = 01100110b */ X86_EFL_PF,
322 /* 0x67 = 01100111b */ 0,
323 /* 0x68 = 01101000b */ 0,
324 /* 0x69 = 01101001b */ X86_EFL_PF,
325 /* 0x6a = 01101010b */ X86_EFL_PF,
326 /* 0x6b = 01101011b */ 0,
327 /* 0x6c = 01101100b */ X86_EFL_PF,
328 /* 0x6d = 01101101b */ 0,
329 /* 0x6e = 01101110b */ 0,
330 /* 0x6f = 01101111b */ X86_EFL_PF,
331 /* 0x70 = 01110000b */ 0,
332 /* 0x71 = 01110001b */ X86_EFL_PF,
333 /* 0x72 = 01110010b */ X86_EFL_PF,
334 /* 0x73 = 01110011b */ 0,
335 /* 0x74 = 01110100b */ X86_EFL_PF,
336 /* 0x75 = 01110101b */ 0,
337 /* 0x76 = 01110110b */ 0,
338 /* 0x77 = 01110111b */ X86_EFL_PF,
339 /* 0x78 = 01111000b */ X86_EFL_PF,
340 /* 0x79 = 01111001b */ 0,
341 /* 0x7a = 01111010b */ 0,
342 /* 0x7b = 01111011b */ X86_EFL_PF,
343 /* 0x7c = 01111100b */ 0,
344 /* 0x7d = 01111101b */ X86_EFL_PF,
345 /* 0x7e = 01111110b */ X86_EFL_PF,
346 /* 0x7f = 01111111b */ 0,
347 /* 0x80 = 10000000b */ 0,
348 /* 0x81 = 10000001b */ X86_EFL_PF,
349 /* 0x82 = 10000010b */ X86_EFL_PF,
350 /* 0x83 = 10000011b */ 0,
351 /* 0x84 = 10000100b */ X86_EFL_PF,
352 /* 0x85 = 10000101b */ 0,
353 /* 0x86 = 10000110b */ 0,
354 /* 0x87 = 10000111b */ X86_EFL_PF,
355 /* 0x88 = 10001000b */ X86_EFL_PF,
356 /* 0x89 = 10001001b */ 0,
357 /* 0x8a = 10001010b */ 0,
358 /* 0x8b = 10001011b */ X86_EFL_PF,
359 /* 0x8c = 10001100b */ 0,
360 /* 0x8d = 10001101b */ X86_EFL_PF,
361 /* 0x8e = 10001110b */ X86_EFL_PF,
362 /* 0x8f = 10001111b */ 0,
363 /* 0x90 = 10010000b */ X86_EFL_PF,
364 /* 0x91 = 10010001b */ 0,
365 /* 0x92 = 10010010b */ 0,
366 /* 0x93 = 10010011b */ X86_EFL_PF,
367 /* 0x94 = 10010100b */ 0,
368 /* 0x95 = 10010101b */ X86_EFL_PF,
369 /* 0x96 = 10010110b */ X86_EFL_PF,
370 /* 0x97 = 10010111b */ 0,
371 /* 0x98 = 10011000b */ 0,
372 /* 0x99 = 10011001b */ X86_EFL_PF,
373 /* 0x9a = 10011010b */ X86_EFL_PF,
374 /* 0x9b = 10011011b */ 0,
375 /* 0x9c = 10011100b */ X86_EFL_PF,
376 /* 0x9d = 10011101b */ 0,
377 /* 0x9e = 10011110b */ 0,
378 /* 0x9f = 10011111b */ X86_EFL_PF,
379 /* 0xa0 = 10100000b */ X86_EFL_PF,
380 /* 0xa1 = 10100001b */ 0,
381 /* 0xa2 = 10100010b */ 0,
382 /* 0xa3 = 10100011b */ X86_EFL_PF,
383 /* 0xa4 = 10100100b */ 0,
384 /* 0xa5 = 10100101b */ X86_EFL_PF,
385 /* 0xa6 = 10100110b */ X86_EFL_PF,
386 /* 0xa7 = 10100111b */ 0,
387 /* 0xa8 = 10101000b */ 0,
388 /* 0xa9 = 10101001b */ X86_EFL_PF,
389 /* 0xaa = 10101010b */ X86_EFL_PF,
390 /* 0xab = 10101011b */ 0,
391 /* 0xac = 10101100b */ X86_EFL_PF,
392 /* 0xad = 10101101b */ 0,
393 /* 0xae = 10101110b */ 0,
394 /* 0xaf = 10101111b */ X86_EFL_PF,
395 /* 0xb0 = 10110000b */ 0,
396 /* 0xb1 = 10110001b */ X86_EFL_PF,
397 /* 0xb2 = 10110010b */ X86_EFL_PF,
398 /* 0xb3 = 10110011b */ 0,
399 /* 0xb4 = 10110100b */ X86_EFL_PF,
400 /* 0xb5 = 10110101b */ 0,
401 /* 0xb6 = 10110110b */ 0,
402 /* 0xb7 = 10110111b */ X86_EFL_PF,
403 /* 0xb8 = 10111000b */ X86_EFL_PF,
404 /* 0xb9 = 10111001b */ 0,
405 /* 0xba = 10111010b */ 0,
406 /* 0xbb = 10111011b */ X86_EFL_PF,
407 /* 0xbc = 10111100b */ 0,
408 /* 0xbd = 10111101b */ X86_EFL_PF,
409 /* 0xbe = 10111110b */ X86_EFL_PF,
410 /* 0xbf = 10111111b */ 0,
411 /* 0xc0 = 11000000b */ X86_EFL_PF,
412 /* 0xc1 = 11000001b */ 0,
413 /* 0xc2 = 11000010b */ 0,
414 /* 0xc3 = 11000011b */ X86_EFL_PF,
415 /* 0xc4 = 11000100b */ 0,
416 /* 0xc5 = 11000101b */ X86_EFL_PF,
417 /* 0xc6 = 11000110b */ X86_EFL_PF,
418 /* 0xc7 = 11000111b */ 0,
419 /* 0xc8 = 11001000b */ 0,
420 /* 0xc9 = 11001001b */ X86_EFL_PF,
421 /* 0xca = 11001010b */ X86_EFL_PF,
422 /* 0xcb = 11001011b */ 0,
423 /* 0xcc = 11001100b */ X86_EFL_PF,
424 /* 0xcd = 11001101b */ 0,
425 /* 0xce = 11001110b */ 0,
426 /* 0xcf = 11001111b */ X86_EFL_PF,
427 /* 0xd0 = 11010000b */ 0,
428 /* 0xd1 = 11010001b */ X86_EFL_PF,
429 /* 0xd2 = 11010010b */ X86_EFL_PF,
430 /* 0xd3 = 11010011b */ 0,
431 /* 0xd4 = 11010100b */ X86_EFL_PF,
432 /* 0xd5 = 11010101b */ 0,
433 /* 0xd6 = 11010110b */ 0,
434 /* 0xd7 = 11010111b */ X86_EFL_PF,
435 /* 0xd8 = 11011000b */ X86_EFL_PF,
436 /* 0xd9 = 11011001b */ 0,
437 /* 0xda = 11011010b */ 0,
438 /* 0xdb = 11011011b */ X86_EFL_PF,
439 /* 0xdc = 11011100b */ 0,
440 /* 0xdd = 11011101b */ X86_EFL_PF,
441 /* 0xde = 11011110b */ X86_EFL_PF,
442 /* 0xdf = 11011111b */ 0,
443 /* 0xe0 = 11100000b */ 0,
444 /* 0xe1 = 11100001b */ X86_EFL_PF,
445 /* 0xe2 = 11100010b */ X86_EFL_PF,
446 /* 0xe3 = 11100011b */ 0,
447 /* 0xe4 = 11100100b */ X86_EFL_PF,
448 /* 0xe5 = 11100101b */ 0,
449 /* 0xe6 = 11100110b */ 0,
450 /* 0xe7 = 11100111b */ X86_EFL_PF,
451 /* 0xe8 = 11101000b */ X86_EFL_PF,
452 /* 0xe9 = 11101001b */ 0,
453 /* 0xea = 11101010b */ 0,
454 /* 0xeb = 11101011b */ X86_EFL_PF,
455 /* 0xec = 11101100b */ 0,
456 /* 0xed = 11101101b */ X86_EFL_PF,
457 /* 0xee = 11101110b */ X86_EFL_PF,
458 /* 0xef = 11101111b */ 0,
459 /* 0xf0 = 11110000b */ X86_EFL_PF,
460 /* 0xf1 = 11110001b */ 0,
461 /* 0xf2 = 11110010b */ 0,
462 /* 0xf3 = 11110011b */ X86_EFL_PF,
463 /* 0xf4 = 11110100b */ 0,
464 /* 0xf5 = 11110101b */ X86_EFL_PF,
465 /* 0xf6 = 11110110b */ X86_EFL_PF,
466 /* 0xf7 = 11110111b */ 0,
467 /* 0xf8 = 11111000b */ 0,
468 /* 0xf9 = 11111001b */ X86_EFL_PF,
469 /* 0xfa = 11111010b */ X86_EFL_PF,
470 /* 0xfb = 11111011b */ 0,
471 /* 0xfc = 11111100b */ X86_EFL_PF,
472 /* 0xfd = 11111101b */ 0,
473 /* 0xfe = 11111110b */ 0,
474 /* 0xff = 11111111b */ X86_EFL_PF,
475};
476
477/* for clang: */
478extern const RTFLOAT32U g_ar32Zero[];
479extern const RTFLOAT64U g_ar64Zero[];
480extern const RTFLOAT80U g_ar80Zero[];
481extern const RTFLOAT32U g_ar32One[];
482extern const RTFLOAT80U g_ar80One[];
483extern const RTFLOAT80U g_r80Indefinite;
484extern const RTFLOAT32U g_ar32Infinity[];
485extern const RTFLOAT64U g_ar64Infinity[];
486extern const RTFLOAT80U g_ar80Infinity[];
487extern const RTFLOAT128U g_r128Ln2;
488extern const RTUINT128U g_u128Ln2Mantissa;
489extern const RTUINT128U g_u128Ln2MantissaIntel;
490extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
491extern const RTFLOAT32U g_ar32QNaN[];
492extern const RTFLOAT64U g_ar64QNaN[];
493
494/** Zero values (indexed by fSign). */
495RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
496RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
497RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
498
499/** One values (indexed by fSign). */
500RTFLOAT32U const g_ar32One[] =
501{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
502RTFLOAT80U const g_ar80One[] =
503{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
504
505/** Indefinite (negative). */
506RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
507
508/** Infinities (indexed by fSign). */
509RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
510RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
511RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
512
513/** Default QNaNs (indexed by fSign). */
514RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
515RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
516
517
518#if 0
519/** 128-bit floating point constant: 2.0 */
520const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
521#endif
522
523
524/* The next section is generated by tools/IEMGenFpuConstants: */
525
526/** The ln2 constant as 128-bit floating point value.
527 * base-10: 6.93147180559945309417232121458176575e-1
528 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
529 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
530//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
531const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
532/** High precision ln2 value.
533 * base-10: 6.931471805599453094172321214581765680747e-1
534 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
535 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
536const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
537/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
538 * base-10: 6.931471805599453094151379470289064954613e-1
539 * base-16: b.17217f7d1cf79abc0000000000000000@-1
540 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
541const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
542
543/** Horner constants for f2xm1 */
544const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
545{
546 /* a0
547 * base-10: 1.00000000000000000000000000000000000e0
548 * base-16: 1.0000000000000000000000000000@0
549 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
550 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
551 /* a1
552 * base-10: 5.00000000000000000000000000000000000e-1
553 * base-16: 8.0000000000000000000000000000@-1
554 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
555 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
556 /* a2
557 * base-10: 1.66666666666666666666666666666666658e-1
558 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
559 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
560 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
561 /* a3
562 * base-10: 4.16666666666666666666666666666666646e-2
563 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
564 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
565 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
566 /* a4
567 * base-10: 8.33333333333333333333333333333333323e-3
568 * base-16: 2.2222222222222222222222222222@-2
569 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
570 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
571 /* a5
572 * base-10: 1.38888888888888888888888888888888874e-3
573 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
574 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
575 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
576 /* a6
577 * base-10: 1.98412698412698412698412698412698412e-4
578 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
579 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
580 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
581 /* a7
582 * base-10: 2.48015873015873015873015873015873015e-5
583 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
584 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
585 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
586 /* a8
587 * base-10: 2.75573192239858906525573192239858902e-6
588 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
589 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
590 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
591 /* a9
592 * base-10: 2.75573192239858906525573192239858865e-7
593 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
594 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
595 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
596 /* a10
597 * base-10: 2.50521083854417187750521083854417184e-8
598 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
599 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
600 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
601 /* a11
602 * base-10: 2.08767569878680989792100903212014296e-9
603 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
604 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
605 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
606 /* a12
607 * base-10: 1.60590438368216145993923771701549472e-10
608 * base-16: b.092309d43684be51c198e91d7b40@-9
609 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
610 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
611 /* a13
612 * base-10: 1.14707455977297247138516979786821043e-11
613 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
614 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
615 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
616 /* a14
617 * base-10: 7.64716373181981647590113198578806964e-13
618 * base-16: d.73f9f399dc0f88ec32b587746578@-11
619 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
620 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
621 /* a15
622 * base-10: 4.77947733238738529743820749111754352e-14
623 * base-16: d.73f9f399dc0f88ec32b587746578@-12
624 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
625 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
626 /* a16
627 * base-10: 2.81145725434552076319894558301031970e-15
628 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
629 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
630 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
631 /* a17
632 * base-10: 1.56192069685862264622163643500573321e-16
633 * base-16: b.413c31dcbecbbdd8024435161550@-14
634 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
635 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
636 /* a18
637 * base-10: 8.22063524662432971695598123687227980e-18
638 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
639 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
640 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
641 /* a19
642 * base-10: 4.11031762331216485847799061843614006e-19
643 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
644 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
645 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
646 /* a20
647 * base-10: 1.95729410633912612308475743735054143e-20
648 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
649 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
650 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
651 /* a21
652 * base-10: 8.89679139245057328674889744250246106e-22
653 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
654 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
655 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
656};
657
658
659/*
660 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
661 * it all in C is probably safer atm., optimize what's necessary later, maybe.
662 */
663#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
664
665
666/*********************************************************************************************************************************
667* Binary Operations *
668*********************************************************************************************************************************/
669
670/*
671 * ADD
672 */
673
674IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
675{
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
680 return fEFlags;
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
686{
687 uint32_t uDst = *puDst;
688 uint32_t uResult = uDst + uSrc;
689 *puDst = uResult;
690 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
691 return fEFlags;
692}
693
694
695IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
696{
697 uint16_t uDst = *puDst;
698 uint16_t uResult = uDst + uSrc;
699 *puDst = uResult;
700 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
701 return fEFlags;
702}
703
704
705IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
706{
707 uint8_t uDst = *puDst;
708 uint8_t uResult = uDst + uSrc;
709 *puDst = uResult;
710 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
711 return fEFlags;
712}
713
714# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
715
716/*
717 * ADC
718 */
719
720IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
721{
722 if (!(fEFlags & X86_EFL_CF))
723 fEFlags = iemAImpl_add_u64(fEFlags, puDst, uSrc);
724 else
725 {
726 uint64_t uDst = *puDst;
727 uint64_t uResult = uDst + uSrc + 1;
728 *puDst = uResult;
729 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
730 }
731 return fEFlags;
732}
733
734# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
735
736IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
737{
738 if (!(fEFlags & X86_EFL_CF))
739 fEFlags = iemAImpl_add_u32(fEFlags, puDst, uSrc);
740 else
741 {
742 uint32_t uDst = *puDst;
743 uint32_t uResult = uDst + uSrc + 1;
744 *puDst = uResult;
745 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
746 }
747 return fEFlags;
748}
749
750
751IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
752{
753 if (!(fEFlags & X86_EFL_CF))
754 fEFlags = iemAImpl_add_u16(fEFlags, puDst, uSrc);
755 else
756 {
757 uint16_t uDst = *puDst;
758 uint16_t uResult = uDst + uSrc + 1;
759 *puDst = uResult;
760 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
761 }
762 return fEFlags;
763}
764
765
766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
767{
768 if (!(fEFlags & X86_EFL_CF))
769 fEFlags = iemAImpl_add_u8(fEFlags, puDst, uSrc);
770 else
771 {
772 uint8_t uDst = *puDst;
773 uint8_t uResult = uDst + uSrc + 1;
774 *puDst = uResult;
775 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
776 }
777 return fEFlags;
778}
779
780# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
781
782/*
783 * SUB
784 */
785# if !defined(RT_ARCH_ARM64)
786
787IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
788{
789 uint64_t uDst = *puDst;
790 uint64_t uResult = uDst - uSrc;
791 *puDst = uResult;
792 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
793 return fEFlags;
794}
795
796# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
797
798IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
799{
800 uint32_t uDst = *puDst;
801 uint32_t uResult = uDst - uSrc;
802 *puDst = uResult;
803 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
804 return fEFlags;
805}
806
807
808IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
809{
810 uint16_t uDst = *puDst;
811 uint16_t uResult = uDst - uSrc;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
814 return fEFlags;
815}
816
817
818IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
819{
820 uint8_t uDst = *puDst;
821 uint8_t uResult = uDst - uSrc;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
824 return fEFlags;
825}
826
827# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
828# endif /* !RT_ARCH_ARM64 */
829
830/*
831 * SBB
832 */
833
834IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
835{
836 if (!(fEFlags & X86_EFL_CF))
837 fEFlags = iemAImpl_sub_u64(fEFlags, puDst, uSrc);
838 else
839 {
840 uint64_t uDst = *puDst;
841 uint64_t uResult = uDst - uSrc - 1;
842 *puDst = uResult;
843 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
844 }
845 return fEFlags;
846}
847
848# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
849
850IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
851{
852 if (!(fEFlags & X86_EFL_CF))
853 fEFlags = iemAImpl_sub_u32(fEFlags, puDst, uSrc);
854 else
855 {
856 uint32_t uDst = *puDst;
857 uint32_t uResult = uDst - uSrc - 1;
858 *puDst = uResult;
859 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
860 }
861 return fEFlags;
862}
863
864
865IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
866{
867 if (!(fEFlags & X86_EFL_CF))
868 fEFlags = iemAImpl_sub_u16(fEFlags, puDst, uSrc);
869 else
870 {
871 uint16_t uDst = *puDst;
872 uint16_t uResult = uDst - uSrc - 1;
873 *puDst = uResult;
874 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
875 }
876 return fEFlags;
877}
878
879
880IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
881{
882 if (!(fEFlags & X86_EFL_CF))
883 fEFlags = iemAImpl_sub_u8(fEFlags, puDst, uSrc);
884 else
885 {
886 uint8_t uDst = *puDst;
887 uint8_t uResult = uDst - uSrc - 1;
888 *puDst = uResult;
889 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
890 }
891 return fEFlags;
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896
897/*
898 * OR
899 */
900
901IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
902{
903 uint64_t uResult = *puDst | uSrc;
904 *puDst = uResult;
905 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
906 return fEFlags;
907}
908
909# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
910
911IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
912{
913 uint32_t uResult = *puDst | uSrc;
914 *puDst = uResult;
915 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
916 return fEFlags;
917}
918
919
920IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
921{
922 uint16_t uResult = *puDst | uSrc;
923 *puDst = uResult;
924 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
925 return fEFlags;
926}
927
928
929IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
930{
931 uint8_t uResult = *puDst | uSrc;
932 *puDst = uResult;
933 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
934 return fEFlags;
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * XOR
941 */
942
943IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
944{
945 uint64_t uResult = *puDst ^ uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
948 return fEFlags;
949}
950
951# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
952
953IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
954{
955 uint32_t uResult = *puDst ^ uSrc;
956 *puDst = uResult;
957 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
958 return fEFlags;
959}
960
961
962IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
963{
964 uint16_t uResult = *puDst ^ uSrc;
965 *puDst = uResult;
966 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
967 return fEFlags;
968}
969
970
971IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
972{
973 uint8_t uResult = *puDst ^ uSrc;
974 *puDst = uResult;
975 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
976 return fEFlags;
977}
978
979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
980
981/*
982 * AND
983 */
984
985IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
986{
987 uint64_t const uResult = *puDst & uSrc;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
990 return fEFlags;
991}
992
993# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994
995IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
996{
997 uint32_t const uResult = *puDst & uSrc;
998 *puDst = uResult;
999 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1000 return fEFlags;
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1005{
1006 uint16_t const uResult = *puDst & uSrc;
1007 *puDst = uResult;
1008 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
1009 return fEFlags;
1010}
1011
1012
1013IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
1014{
1015 uint8_t const uResult = *puDst & uSrc;
1016 *puDst = uResult;
1017 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
1018 return fEFlags;
1019}
1020
1021# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1022#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1023
1024/*
1025 * ANDN (BMI1 instruction)
1026 */
1027
1028IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1029{
1030 uint64_t const uResult = ~uSrc1 & uSrc2;
1031 *puDst = uResult;
1032 uint32_t fEFlags = *pfEFlags;
1033 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
1034 *pfEFlags = fEFlags;
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1039{
1040 uint32_t const uResult = ~uSrc1 & uSrc2;
1041 *puDst = uResult;
1042 uint32_t fEFlags = *pfEFlags;
1043 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1044 *pfEFlags = fEFlags;
1045}
1046
1047
1048#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1049IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1050{
1051 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1052}
1053#endif
1054
1055
1056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1057IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1058{
1059 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1060}
1061#endif
1062
1063#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1064
1065/*
1066 * CMP
1067 */
1068
1069IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1070{
1071 uint64_t uDstTmp = *puDst;
1072 return iemAImpl_sub_u64(fEFlags, &uDstTmp, uSrc);
1073}
1074
1075# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1076
1077IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1078{
1079 uint32_t uDstTmp = *puDst;
1080 return iemAImpl_sub_u32(fEFlags, &uDstTmp, uSrc);
1081}
1082
1083
1084IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1085{
1086 uint16_t uDstTmp = *puDst;
1087 return iemAImpl_sub_u16(fEFlags, &uDstTmp, uSrc);
1088}
1089
1090
1091IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u8,(uint32_t fEFlags, uint8_t const *puDst, uint8_t uSrc))
1092{
1093 uint8_t uDstTmp = *puDst;
1094 return iemAImpl_sub_u8(fEFlags, &uDstTmp, uSrc);
1095}
1096
1097# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1098
1099/*
1100 * TEST
1101 */
1102
1103IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1104{
1105 uint64_t uResult = *puDst & uSrc;
1106 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
1107 return fEFlags;
1108}
1109
1110# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1111
1112IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1113{
1114 uint32_t uResult = *puDst & uSrc;
1115 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1116 return fEFlags;
1117}
1118
1119
1120IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1121{
1122 uint16_t uResult = *puDst & uSrc;
1123 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
1124 return fEFlags;
1125}
1126
1127
1128IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u8,(uint32_t fEFlags, uint8_t const *puDst, uint8_t uSrc))
1129{
1130 uint8_t uResult = *puDst & uSrc;
1131 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
1132 return fEFlags;
1133}
1134
1135# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1136
1137
1138/*
1139 * LOCK prefixed variants of the above
1140 */
1141
/**
 * Locked binary operand operation for an operand of a_cBitsWidth bits.
 *
 * Reads *puDst, applies the non-locked iemAImpl_<mnemonic>_u<width> worker to
 * a scratch copy and retries until the compare-exchange on puDst succeeds,
 * then returns the EFLAGS produced by the last attempt.  Expects fEFlagsIn,
 * puDst and uSrc to be in scope at the expansion site.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uDesired; \
        uint32_t                   fEflOut; \
        for (;;) \
        { \
            uDesired = uExpected; \
            fEflOut  = iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(fEFlagsIn, &uDesired, uSrc); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uDesired, uExpected, &uExpected)) \
                break; \
        } \
        return fEflOut; \
    } while (0)
1155
1156
1157#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1158 IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint32_t fEFlagsIn, \
1159 uint ## a_cBitsWidth ## _t *puDst, \
1160 uint ## a_cBitsWidth ## _t uSrc)) \
1161 { \
1162 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1163 }
1164
1165EMIT_LOCKED_BIN_OP(add, 64)
1166EMIT_LOCKED_BIN_OP(adc, 64)
1167EMIT_LOCKED_BIN_OP(sub, 64)
1168EMIT_LOCKED_BIN_OP(sbb, 64)
1169EMIT_LOCKED_BIN_OP(or, 64)
1170EMIT_LOCKED_BIN_OP(xor, 64)
1171EMIT_LOCKED_BIN_OP(and, 64)
1172# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1173EMIT_LOCKED_BIN_OP(add, 32)
1174EMIT_LOCKED_BIN_OP(adc, 32)
1175EMIT_LOCKED_BIN_OP(sub, 32)
1176EMIT_LOCKED_BIN_OP(sbb, 32)
1177EMIT_LOCKED_BIN_OP(or, 32)
1178EMIT_LOCKED_BIN_OP(xor, 32)
1179EMIT_LOCKED_BIN_OP(and, 32)
1180
1181EMIT_LOCKED_BIN_OP(add, 16)
1182EMIT_LOCKED_BIN_OP(adc, 16)
1183EMIT_LOCKED_BIN_OP(sub, 16)
1184EMIT_LOCKED_BIN_OP(sbb, 16)
1185EMIT_LOCKED_BIN_OP(or, 16)
1186EMIT_LOCKED_BIN_OP(xor, 16)
1187EMIT_LOCKED_BIN_OP(and, 16)
1188
1189EMIT_LOCKED_BIN_OP(add, 8)
1190EMIT_LOCKED_BIN_OP(adc, 8)
1191EMIT_LOCKED_BIN_OP(sub, 8)
1192EMIT_LOCKED_BIN_OP(sbb, 8)
1193EMIT_LOCKED_BIN_OP(or, 8)
1194EMIT_LOCKED_BIN_OP(xor, 8)
1195EMIT_LOCKED_BIN_OP(and, 8)
1196# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1197
1198
1199/*
1200 * Bit operations (same signature as above).
1201 */
1202
1203/*
1204 * BT
1205 */
1206
1207IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bt_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1208{
1209 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1210 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1211 Assert(uSrc < 64);
1212 uint64_t uDst = *puDst;
1213 if (uDst & RT_BIT_64(uSrc))
1214 fEFlags |= X86_EFL_CF;
1215 else
1216 fEFlags &= ~X86_EFL_CF;
1217 return fEFlags;
1218}
1219
1220# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1221
1222IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bt_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1223{
1224 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1225 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1226 Assert(uSrc < 32);
1227 uint32_t uDst = *puDst;
1228 if (uDst & RT_BIT_32(uSrc))
1229 fEFlags |= X86_EFL_CF;
1230 else
1231 fEFlags &= ~X86_EFL_CF;
1232 return fEFlags;
1233}
1234
1235IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bt_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1236{
1237 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1238 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1239 Assert(uSrc < 16);
1240 uint16_t uDst = *puDst;
1241 if (uDst & RT_BIT_32(uSrc))
1242 fEFlags |= X86_EFL_CF;
1243 else
1244 fEFlags &= ~X86_EFL_CF;
1245 return fEFlags;
1246}
1247
1248# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1249
1250/*
1251 * BTC
1252 */
1253
1254IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btc_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1255{
1256 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1257 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1258 Assert(uSrc < 64);
1259 uint64_t fMask = RT_BIT_64(uSrc);
1260 uint64_t uDst = *puDst;
1261 if (uDst & fMask)
1262 {
1263 uDst &= ~fMask;
1264 *puDst = uDst;
1265 fEFlags |= X86_EFL_CF;
1266 }
1267 else
1268 {
1269 uDst |= fMask;
1270 *puDst = uDst;
1271 fEFlags &= ~X86_EFL_CF;
1272 }
1273 return fEFlags;
1274}
1275
1276# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1277
1278IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btc_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1279{
1280 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1281 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1282 Assert(uSrc < 32);
1283 uint32_t fMask = RT_BIT_32(uSrc);
1284 uint32_t uDst = *puDst;
1285 if (uDst & fMask)
1286 {
1287 uDst &= ~fMask;
1288 *puDst = uDst;
1289 fEFlags |= X86_EFL_CF;
1290 }
1291 else
1292 {
1293 uDst |= fMask;
1294 *puDst = uDst;
1295 fEFlags &= ~X86_EFL_CF;
1296 }
1297 return fEFlags;
1298}
1299
1300
1301IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btc_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1302{
1303 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1304 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1305 Assert(uSrc < 16);
1306 uint16_t fMask = RT_BIT_32(uSrc);
1307 uint16_t uDst = *puDst;
1308 if (uDst & fMask)
1309 {
1310 uDst &= ~fMask;
1311 *puDst = uDst;
1312 fEFlags |= X86_EFL_CF;
1313 }
1314 else
1315 {
1316 uDst |= fMask;
1317 *puDst = uDst;
1318 fEFlags &= ~X86_EFL_CF;
1319 }
1320 return fEFlags;
1321}
1322
1323# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1324
1325/*
1326 * BTR
1327 */
1328
1329IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btr_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1330{
1331 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1332 logical operation (AND/OR/whatever). */
1333 Assert(uSrc < 64);
1334 uint64_t fMask = RT_BIT_64(uSrc);
1335 uint64_t uDst = *puDst;
1336 if (uDst & fMask)
1337 {
1338 uDst &= ~fMask;
1339 *puDst = uDst;
1340 fEFlags |= X86_EFL_CF;
1341 }
1342 else
1343 fEFlags &= ~X86_EFL_CF;
1344 return fEFlags;
1345}
1346
1347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1348
1349IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btr_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1350{
1351 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1352 logical operation (AND/OR/whatever). */
1353 Assert(uSrc < 32);
1354 uint32_t fMask = RT_BIT_32(uSrc);
1355 uint32_t uDst = *puDst;
1356 if (uDst & fMask)
1357 {
1358 uDst &= ~fMask;
1359 *puDst = uDst;
1360 fEFlags |= X86_EFL_CF;
1361 }
1362 else
1363 fEFlags &= ~X86_EFL_CF;
1364 return fEFlags;
1365}
1366
1367
1368IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btr_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1369{
1370 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1371 logical operation (AND/OR/whatever). */
1372 Assert(uSrc < 16);
1373 uint16_t fMask = RT_BIT_32(uSrc);
1374 uint16_t uDst = *puDst;
1375 if (uDst & fMask)
1376 {
1377 uDst &= ~fMask;
1378 *puDst = uDst;
1379 fEFlags |= X86_EFL_CF;
1380 }
1381 else
1382 fEFlags &= ~X86_EFL_CF;
1383 return fEFlags;
1384}
1385
1386# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1387
1388/*
1389 * BTS
1390 */
1391
1392IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bts_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1393{
1394 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1395 logical operation (AND/OR/whatever). */
1396 Assert(uSrc < 64);
1397 uint64_t fMask = RT_BIT_64(uSrc);
1398 uint64_t uDst = *puDst;
1399 if (uDst & fMask)
1400 fEFlags |= X86_EFL_CF;
1401 else
1402 {
1403 uDst |= fMask;
1404 *puDst = uDst;
1405 fEFlags &= ~X86_EFL_CF;
1406 }
1407 return fEFlags;
1408}
1409
1410# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1411
1412IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bts_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1413{
1414 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1415 logical operation (AND/OR/whatever). */
1416 Assert(uSrc < 32);
1417 uint32_t fMask = RT_BIT_32(uSrc);
1418 uint32_t uDst = *puDst;
1419 if (uDst & fMask)
1420 fEFlags |= X86_EFL_CF;
1421 else
1422 {
1423 uDst |= fMask;
1424 *puDst = uDst;
1425 fEFlags &= ~X86_EFL_CF;
1426 }
1427 return fEFlags;
1428}
1429
1430
1431IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bts_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1432{
1433 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1434 logical operation (AND/OR/whatever). */
1435 Assert(uSrc < 16);
1436 uint16_t fMask = RT_BIT_32(uSrc);
1437 uint32_t uDst = *puDst;
1438 if (uDst & fMask)
1439 fEFlags |= X86_EFL_CF;
1440 else
1441 {
1442 uDst |= fMask;
1443 *puDst = uDst;
1444 fEFlags &= ~X86_EFL_CF;
1445 }
1446 return fEFlags;
1447}
1448
1449# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1450
1451EMIT_LOCKED_BIN_OP(btc, 64)
1452EMIT_LOCKED_BIN_OP(btr, 64)
1453EMIT_LOCKED_BIN_OP(bts, 64)
1454# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1455EMIT_LOCKED_BIN_OP(btc, 32)
1456EMIT_LOCKED_BIN_OP(btr, 32)
1457EMIT_LOCKED_BIN_OP(bts, 32)
1458
1459EMIT_LOCKED_BIN_OP(btc, 16)
1460EMIT_LOCKED_BIN_OP(btr, 16)
1461EMIT_LOCKED_BIN_OP(bts, 16)
1462# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1463
1464#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1465
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves
 *       to emulating these recent ones.
 */
/**
 * Stores a BSF/BSR search result, Intel flavour.
 *
 * OF, SF, ZF, AF, PF and CF are cleared first.  A zero a_iBit leaves the
 * destination untouched and sets ZF and PF; otherwise a_iBit - 1 is stored
 * and the parity of that value is merged into the flags.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_fEFlagsVar, a_iBit) do { \
        unsigned const iBitPlusOne = (a_iBit); \
        a_fEFlagsVar &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (!iBitPlusOne) \
            a_fEFlagsVar |= X86_EFL_ZF | X86_EFL_PF; \
        else \
        { \
            unsigned const iBitIndex = iBitPlusOne - 1; \
            *(a_puDst) = iBitIndex; \
            a_fEFlagsVar |= IEM_EFL_CALC_PARITY(iBitIndex); \
        } \
    } while (0)
/**
 * Stores a BSF/BSR search result, AMD flavour.
 *
 * Only ZF is touched: it is set when a_iBit is zero (the destination is then
 * left alone) and cleared otherwise, in which case a_iBit - 1 is stored.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_fEFlagsVar, a_iBit) do { \
        unsigned const iBitPlusOne = (a_iBit); \
        if (!iBitPlusOne) \
            a_fEFlagsVar |= X86_EFL_ZF; \
        else \
        { \
            *(a_puDst) = iBitPlusOne - 1; \
            a_fEFlagsVar &= ~X86_EFL_ZF; \
        } \
    } while (0)
1495
1496/*
1497 * BSF - first (least significant) bit set
1498 */
1499#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1500IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1501{
1502 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU64(uSrc));
1503 return fEFlags;
1504}
1505#endif
1506
1507IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU64(uSrc));
1510 return fEFlags;
1511}
1512
1513IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1514{
1515 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitFirstSetU64(uSrc));
1516 return fEFlags;
1517}
1518
1519#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1520IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1521{
1522 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU32(uSrc));
1523 return fEFlags;
1524}
1525#endif
1526
1527IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1528{
1529 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU32(uSrc));
1530 return fEFlags;
1531}
1532
1533IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1534{
1535 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitFirstSetU32(uSrc));
1536 return fEFlags;
1537}
1538
1539
1540#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1541IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1542{
1543 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU16(uSrc));
1544 return fEFlags;
1545}
1546#endif
1547
1548IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1549{
1550 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU16(uSrc));
1551 return fEFlags;
1552}
1553
1554IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1555{
1556 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitFirstSetU16(uSrc));
1557 return fEFlags;
1558}
1559
1560
1561
1562/*
1563 * BSR - last (most significant) bit set
1564 */
1565#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1566IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1567{
1568 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU64(uSrc));
1569 return fEFlags;
1570}
1571#endif
1572
1573IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1574{
1575 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU64(uSrc));
1576 return fEFlags;
1577}
1578
1579IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1580{
1581 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitLastSetU64(uSrc));
1582 return fEFlags;
1583}
1584
1585
1586#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1587IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1588{
1589 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU32(uSrc));
1590 return fEFlags;
1591}
1592#endif
1593
1594IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1595{
1596 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU32(uSrc));
1597 return fEFlags;
1598}
1599
1600IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1601{
1602 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitLastSetU32(uSrc));
1603 return fEFlags;
1604}
1605
1606
1607#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1608IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1609{
1610 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU16(uSrc));
1611 return fEFlags;
1612}
1613#endif
1614
1615IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1616{
1617 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU16(uSrc));
1618 return fEFlags;
1619}
1620
1621IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1622{
1623 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitLastSetU16(uSrc));
1624 return fEFlags;
1625}
1626
1627
1628/*
1629 * Helpers for LZCNT and TZCNT.
1630 */
/**
 * Stores an LZCNT/TZCNT result and updates the flags, Intel flavour.
 *
 * The count is always stored.  OF, SF, ZF, AF, PF and CF are cleared, then
 * either the parity of a non-zero count is merged in or ZF and PF are set
 * for a zero count, and CF is set when the source value is zero.
 *
 * All macro arguments are parenthesized so that expression arguments (e.g. a
 * masked source value) are evaluated as a unit.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_fEFlagsVar, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        (a_fEFlagsVar) &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            (a_fEFlagsVar) |= IEM_EFL_CALC_PARITY(uResult); \
        else \
            (a_fEFlagsVar) |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            (a_fEFlagsVar) |= X86_EFL_CF; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and updates the flags, AMD flavour.
 *
 * The count is always stored.  Only ZF and CF are touched: both are cleared,
 * then ZF is set for a zero count and CF is set when the source value is
 * zero.
 *
 * All macro arguments are parenthesized so that expression arguments (e.g. a
 * masked source value) are evaluated as a unit.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_fEFlagsVar, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        (a_fEFlagsVar) &= ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            (a_fEFlagsVar) |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            (a_fEFlagsVar) |= X86_EFL_CF; \
    } while (0)
1651
1652
1653/*
1654 * LZCNT - count leading zero bits.
1655 */
1656#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1657IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1658{
1659 return iemAImpl_lzcnt_u64_intel(fEFlags, puDst, uSrc);
1660}
1661#endif
1662
1663IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountLeadingZerosU64(uSrc));
1666 return fEFlags;
1667}
1668
1669IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1670{
1671 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountLeadingZerosU64(uSrc));
1672 return fEFlags;
1673}
1674
1675
1676#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1677IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1678{
1679 return iemAImpl_lzcnt_u32_intel(fEFlags, puDst, uSrc);
1680}
1681#endif
1682
1683IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1684{
1685 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountLeadingZerosU32(uSrc));
1686 return fEFlags;
1687}
1688
1689IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1690{
1691 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountLeadingZerosU32(uSrc));
1692 return fEFlags;
1693}
1694
1695
1696#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1697IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1698{
1699 return iemAImpl_lzcnt_u16_intel(fEFlags, puDst, uSrc);
1700}
1701#endif
1702
1703IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1704{
1705 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountLeadingZerosU16(uSrc));
1706 return fEFlags;
1707}
1708
1709IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1710{
1711 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountLeadingZerosU16(uSrc));
1712 return fEFlags;
1713}
1714
1715
/*
 * TZCNT - count trailing zero bits.
 */
1719#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1720IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1721{
1722 return iemAImpl_tzcnt_u64_intel(fEFlags, puDst, uSrc);
1723}
1724#endif
1725
1726IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1727{
1728 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountTrailingZerosU64(uSrc));
1729 return fEFlags;
1730}
1731
1732IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1733{
1734 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountTrailingZerosU64(uSrc));
1735 return fEFlags;
1736}
1737
1738
1739#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1740IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1741{
1742 return iemAImpl_tzcnt_u32_intel(fEFlags, puDst, uSrc);
1743}
1744#endif
1745
1746IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1747{
1748 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountTrailingZerosU32(uSrc));
1749 return fEFlags;
1750}
1751
1752IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1753{
1754 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountTrailingZerosU32(uSrc));
1755 return fEFlags;
1756}
1757
1758
1759#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1760IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1761{
1762 return iemAImpl_tzcnt_u16_intel(fEFlags, puDst, uSrc);
1763}
1764#endif
1765
1766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1767{
1768 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountTrailingZerosU16(uSrc));
1769 return fEFlags;
1770}
1771
1772IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1773{
1774 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountTrailingZerosU16(uSrc));
1775 return fEFlags;
1776}
1777
1778
1779
1780/*
1781 * BEXTR (BMI1 instruction)
1782 */
1783#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1784IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1785 a_Type uSrc2, uint32_t *pfEFlags)) \
1786{ \
1787 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1788 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1789 a_Type uResult; \
1790 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1791 if (iFirstBit < a_cBits) \
1792 { \
1793 uResult = uSrc1 >> iFirstBit; \
1794 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1795 if (cBits < a_cBits) \
1796 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1797 *puDst = uResult; \
1798 if (!uResult) \
1799 fEfl |= X86_EFL_ZF; \
1800 } \
1801 else \
1802 { \
1803 *puDst = uResult = 0; \
1804 fEfl |= X86_EFL_ZF; \
1805 } \
1806 /** @todo complete flag calculations. */ \
1807 *pfEFlags = fEfl; \
1808}
1809
1810EMIT_BEXTR(64, uint64_t, _fallback)
1811EMIT_BEXTR(32, uint32_t, _fallback)
1812#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1813EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1814#endif
1815#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1816EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1817#endif
1818
1819/*
1820 * BLSR (BMI1 instruction)
1821 */
1822#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1823IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1824{ \
1825 *puDst = uSrc; \
1826 uint32_t fEfl1 = iemAImpl_sub_u ## a_cBits(fEFlags, &uSrc, 1); \
1827 uint32_t fEfl2 = iemAImpl_and_u ## a_cBits(fEFlags, puDst, uSrc); \
1828 \
1829 /* AMD: The carry flag is from the SUB operation. */ \
1830 /* 10890xe: PF always cleared? */ \
1831 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1832 fEfl2 |= fEfl1 & X86_EFL_CF; \
1833 return fEfl2; \
1834}
1835
1836EMIT_BLSR(64, uint64_t, _fallback)
1837EMIT_BLSR(32, uint32_t, _fallback)
1838#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1839EMIT_BLSR(64, uint64_t, RT_NOTHING)
1840#endif
1841#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1842EMIT_BLSR(32, uint32_t, RT_NOTHING)
1843#endif
1844
1845/*
1846 * BLSMSK (BMI1 instruction)
1847 */
1848#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1849IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1850{ \
1851 *puDst = uSrc; \
1852 uint32_t fEfl1 = iemAImpl_sub_u ## a_cBits(fEFlags, &uSrc, 1); \
1853 uint32_t fEfl2 = iemAImpl_xor_u ## a_cBits(fEFlags, puDst, uSrc); \
1854 \
1855 /* AMD: The carry flag is from the SUB operation. */ \
1856 /* 10890xe: PF always cleared? */ \
1857 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1858 fEfl2 |= fEfl1 & X86_EFL_CF; \
1859 return fEfl2; \
1860}
1861
1862EMIT_BLSMSK(64, uint64_t, _fallback)
1863EMIT_BLSMSK(32, uint32_t, _fallback)
1864#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1865EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1866#endif
1867#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1868EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1869#endif
1870
1871/*
1872 * BLSI (BMI1 instruction)
1873 */
1874#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1875IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1876{ \
1877 uint32_t fEfl1 = fEFlags; \
1878 *puDst = uSrc; \
1879 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1880 uint32_t fEfl2 = iemAImpl_and_u ## a_cBits(fEFlags, puDst, uSrc); \
1881 \
1882 /* AMD: The carry flag is from the SUB operation. */ \
1883 /* 10890xe: PF always cleared? */ \
1884 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1885 fEfl2 |= fEfl1 & X86_EFL_CF; \
1886 return fEfl2; \
1887}
1888
1889EMIT_BLSI(64, uint64_t, _fallback)
1890EMIT_BLSI(32, uint32_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_BLSI(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_BLSI(32, uint32_t, RT_NOTHING)
1896#endif
1897
1898/*
1899 * BZHI (BMI2 instruction)
1900 */
1901#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1902IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1903 a_Type uSrc2, uint32_t *pfEFlags)) \
1904{ \
1905 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1906 a_Type uResult; \
1907 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1908 if (iFirstBit < a_cBits) \
1909 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1910 else \
1911 { \
1912 uResult = uSrc1; \
1913 fEfl |= X86_EFL_CF; \
1914 } \
1915 *puDst = uResult; \
1916 fEfl |= X86_EFL_CALC_ZF(uResult); \
1917 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1918 *pfEFlags = fEfl; \
1919}
1920
1921EMIT_BZHI(64, uint64_t, _fallback)
1922EMIT_BZHI(32, uint32_t, _fallback)
1923#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1924EMIT_BZHI(64, uint64_t, RT_NOTHING)
1925#endif
1926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1927EMIT_BZHI(32, uint32_t, RT_NOTHING)
1928#endif
1929
1930/*
1931 * POPCNT
1932 */
1933RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1934{
1935 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1936 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1937 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1938 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1939};
1940
1941/** @todo Use native popcount where possible and employ some more efficient
1942 * algorithm here (or in asm.h fallback)! */
1943
1944DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1945{
1946 return g_abBitCounts6[ u16 & 0x3f]
1947 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1948 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1949}
1950
1951DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1952{
1953 return g_abBitCounts6[ u32 & 0x3f]
1954 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1955 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1956 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1957 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1958 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1959}
1960
1961DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1962{
1963 return g_abBitCounts6[ u64 & 0x3f]
1964 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1965 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1966 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1967 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1968 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1969 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1970 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1971 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1972 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1973 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1974}
1975
1976#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1977IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1978{ \
1979 fEFlags &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1980 a_Type uResult; \
1981 if (uSrc) \
1982 uResult = iemPopCountU ## a_cBits(uSrc); \
1983 else \
1984 { \
1985 fEFlags |= X86_EFL_ZF; \
1986 uResult = 0; \
1987 } \
1988 *puDst = uResult; \
1989 return fEFlags; \
1990}
1991
1992EMIT_POPCNT(64, uint64_t, _fallback)
1993EMIT_POPCNT(32, uint32_t, _fallback)
1994EMIT_POPCNT(16, uint16_t, _fallback)
1995#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1996EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1997#endif
1998#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1999EMIT_POPCNT(32, uint32_t, RT_NOTHING)
2000EMIT_POPCNT(16, uint16_t, RT_NOTHING)
2001#endif
2002
2003
2004#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2005
2006/*
2007 * XCHG
2008 */
2009
2010IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
2011{
2012#if ARCH_BITS >= 64
2013 *puReg = ASMAtomicXchgU64(puMem, *puReg);
2014#else
2015 uint64_t uOldMem = *puMem;
2016 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
2017 ASMNopPause();
2018 *puReg = uOldMem;
2019#endif
2020}
2021
2022# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2023
2024IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
2025{
2026 *puReg = ASMAtomicXchgU32(puMem, *puReg);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
2031{
2032 *puReg = ASMAtomicXchgU16(puMem, *puReg);
2033}
2034
2035
2036IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
2037{
2038 *puReg = ASMAtomicXchgU8(puMem, *puReg);
2039}
2040
2041# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2042
2043
2044/* Unlocked variants for fDisregardLock mode: */
2045
2046IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
2047{
2048 uint64_t const uOld = *puMem;
2049 *puMem = *puReg;
2050 *puReg = uOld;
2051}
2052
2053# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2054
2055IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
2056{
2057 uint32_t const uOld = *puMem;
2058 *puMem = *puReg;
2059 *puReg = uOld;
2060}
2061
2062
2063IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
2064{
2065 uint16_t const uOld = *puMem;
2066 *puMem = *puReg;
2067 *puReg = uOld;
2068}
2069
2070
2071IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
2072{
2073 uint8_t const uOld = *puMem;
2074 *puMem = *puReg;
2075 *puReg = uOld;
2076}
2077
2078# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2079
2080
2081/*
2082 * XADD and LOCK XADD.
2083 */
2084#define EMIT_XADD(a_cBitsWidth, a_Type) \
2085IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2086{ \
2087 a_Type uDst = *puDst; \
2088 a_Type uResult = uDst; \
2089 *pfEFlags = iemAImpl_add_u ## a_cBitsWidth(*pfEFlags, &uResult, *puReg); \
2090 *puDst = uResult; \
2091 *puReg = uDst; \
2092} \
2093\
2094IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2095{ \
2096 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2097 a_Type uResult; \
2098 uint32_t fEflTmp; \
2099 do \
2100 { \
2101 uResult = uOld; \
2102 fEflTmp = iemAImpl_add_u ## a_cBitsWidth(*pfEFlags, &uResult, *puReg); \
2103 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2104 *puReg = uOld; \
2105 *pfEFlags = fEflTmp; \
2106}
2107EMIT_XADD(64, uint64_t)
2108# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2109EMIT_XADD(32, uint32_t)
2110EMIT_XADD(16, uint16_t)
2111EMIT_XADD(8, uint8_t)
2112# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2113
2114#endif
2115
2116/*
2117 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2118 *
2119 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2120 * instructions are emulated as locked.
2121 */
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2125{
2126 uint8_t uOld = *puAl;
2127 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2128 Assert(*puAl == uOld);
2129 *pEFlags = iemAImpl_cmp_u8(*pEFlags, &uOld, *puAl);
2130}
2131
2132
2133IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2134{
2135 uint16_t uOld = *puAx;
2136 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2137 Assert(*puAx == uOld);
2138 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, *puAx);
2139}
2140
2141
2142IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2143{
2144 uint32_t uOld = *puEax;
2145 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2146 Assert(*puEax == uOld);
2147 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, *puEax);
2148}
2149
2150
2151# if ARCH_BITS == 32
2152IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2153# else
2154IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2155# endif
2156{
2157# if ARCH_BITS == 32
2158 uint64_t const uSrcReg = *puSrcReg;
2159# endif
2160 uint64_t uOld = *puRax;
2161 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2162 Assert(*puRax == uOld);
2163 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, *puRax);
2164}
2165
2166
2167IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2168 uint32_t *pEFlags))
2169{
2170 uint64_t const uNew = pu64EbxEcx->u;
2171 uint64_t const uOld = pu64EaxEdx->u;
2172 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2173 {
2174 Assert(pu64EaxEdx->u == uOld);
2175 *pEFlags |= X86_EFL_ZF;
2176 }
2177 else
2178 *pEFlags &= ~X86_EFL_ZF;
2179}
2180
2181
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * CMPXCHG16B, locked variant (AMD64 and ARM64 hosts only).
 *
 * Uses ASMAtomicCmpXchgU128v2 on AMD64 and ASMAtomicCmpXchgU128 elsewhere to
 * store RCX:RBX in memory if memory equals RDX:RAX; both calls are handed
 * &pu128RaxRdx->u as their last argument.  ZF is set on a successful exchange
 * and cleared otherwise, no other flag is touched.
 *
 * @param   pu128Dst        The memory operand.
 * @param   pu128RaxRdx     Comparand; also passed as the output location.
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         EFLAGS, only ZF is updated.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uOld = *pu128RaxRdx; /* Only needed by the assertion below. */
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                 pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fSwapped)
    {
        *pEFlags &= ~X86_EFL_ZF;
        return;
    }
    Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
    *pEFlags |= X86_EFL_ZF;
}
# endif
2203
2204#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2205
2206# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2207IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2208 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2209{
2210 RTUINT128U u128Tmp = *pu128Dst;
2211 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2212 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2213 {
2214 *pu128Dst = *pu128RbxRcx;
2215 *pEFlags |= X86_EFL_ZF;
2216 }
2217 else
2218 {
2219 *pu128RaxRdx = u128Tmp;
2220 *pEFlags &= ~X86_EFL_ZF;
2221 }
2222}
2223#endif /* !RT_ARCH_ARM64 */
2224
2225#if defined(IEM_WITHOUT_ASSEMBLY)
2226
/* Unlocked versions: the 8-bit one maps onto the locked worker, the wider ones are emulated with plain loads and stores: */
2228
2229IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2230{
2231 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2232}
2233
2234
2235IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2236{
2237# if 0
2238 /* If correctly aligned, used the locked variation. */
2239 if (!((uintptr_t)pu16Dst & 1))
2240 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2241 else
2242# endif
2243 {
2244 /* Otherwise emulate it as best as we can. */
2245 uint16_t const uOld = *puAx;
2246 uint16_t const uDst = *pu16Dst;
2247 if (uOld == uDst)
2248 {
2249 *pu16Dst = uSrcReg;
2250 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, uOld);
2251 }
2252 else
2253 {
2254 *puAx = uDst;
2255 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, uDst);
2256 }
2257 }
2258}
2259
2260
2261IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2262{
2263# if 0
2264 /* If correctly aligned, used the locked variation. */
2265 if (!((uintptr_t)pu32Dst & 3))
2266 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2267 else
2268# endif
2269 {
2270 /* Otherwise emulate it as best as we can. */
2271 uint32_t const uOld = *puEax;
2272 uint32_t const uDst = *pu32Dst;
2273 if (uOld == uDst)
2274 {
2275 *pu32Dst = uSrcReg;
2276 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, uOld);
2277 }
2278 else
2279 {
2280 *puEax = uDst;
2281 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, uDst);
2282 }
2283 }
2284}
2285
2286
2287# if ARCH_BITS == 32
2288IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2289{
2290# if 0
2291 /* If correctly aligned, used the locked variation. */
2292 if (!((uintptr_t)pu32Dst & 7))
2293 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2294 else
2295# endif
2296 {
2297 /* Otherwise emulate it as best as we can. */
2298 uint64_t const uOld = *puRax;
2299 uint64_t const uSrc = *puSrcReg;
2300 uint64_t const uDst = *pu64Dst;
2301 if (uOld == uDst)
2302 {
2303 *pu64Dst = uSrc;
2304 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uOld);
2305 }
2306 else
2307 {
2308 *puRax = uDst;
2309 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uDst);
2310 }
2311 }
2312}
2313# else /* ARCH_BITS != 32 */
2314IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2315{
2316# if 0
2317 /* If correctly aligned, used the locked variation. */
2318 if (!((uintptr_t)pu64Dst & 7))
2319 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2320 else
2321# endif
2322 {
2323 /* Otherwise emulate it as best as we can. */
2324 uint64_t const uOld = *puRax;
2325 uint64_t const uDst = *pu64Dst;
2326 if (uOld == uDst)
2327 {
2328 *pu64Dst = uSrcReg;
2329 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uOld);
2330 }
2331 else
2332 {
2333 *puRax = uDst;
2334 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uDst);
2335 }
2336 }
2337}
2338# endif /* ARCH_BITS != 32 */
2339
2340
2341IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2342{
2343# if 0
2344 /* If correctly aligned, used the locked variation. */
2345 if (!((uintptr_t)pu64Dst & 7))
2346 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2347 else
2348# endif
2349 {
2350 /* Otherwise emulate it as best as we can. */
2351 uint64_t const uNew = pu64EbxEcx->u;
2352 uint64_t const uOld = pu64EaxEdx->u;
2353 uint64_t const uDst = *pu64Dst;
2354 if (uDst == uOld)
2355 {
2356 *pu64Dst = uNew;
2357 *pEFlags |= X86_EFL_ZF;
2358 }
2359 else
2360 {
2361 pu64EaxEdx->u = uDst;
2362 *pEFlags &= ~X86_EFL_ZF;
2363 }
2364 }
2365}
2366
2367
2368IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2369 uint32_t *pEFlags))
2370{
2371# if 0
2372 /* If correctly aligned, used the locked variation. */
2373 if (!((uintptr_t)pu64Dst & 15))
2374 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2375 else
2376# endif
2377 {
2378 /* Otherwise emulate it as best as we can. */
2379# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2380 uint128_t const uNew = pu128RbxRcx->u;
2381 uint128_t const uOld = pu128RaxRdx->u;
2382 uint128_t const uDst = pu128Dst->u;
2383 if (uDst == uOld)
2384 {
2385 pu128Dst->u = uNew;
2386 *pEFlags |= X86_EFL_ZF;
2387 }
2388 else
2389 {
2390 pu128RaxRdx->u = uDst;
2391 *pEFlags &= ~X86_EFL_ZF;
2392 }
2393# else
2394 RTUINT128U const uNew = *pu128RbxRcx;
2395 RTUINT128U const uOld = *pu128RaxRdx;
2396 RTUINT128U const uDst = *pu128Dst;
2397 if ( uDst.s.Lo == uOld.s.Lo
2398 && uDst.s.Hi == uOld.s.Hi)
2399 {
2400 *pu128Dst = uNew;
2401 *pEFlags |= X86_EFL_ZF;
2402 }
2403 else
2404 {
2405 *pu128RaxRdx = uDst;
2406 *pEFlags &= ~X86_EFL_ZF;
2407 }
2408# endif
2409 }
2410}
2411
2412#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2413
2414#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2415 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2416
2417/*
2418 * MUL, IMUL, DIV and IDIV helpers.
2419 *
2420 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2421 * division step so we can select between using C operators and
2422 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2423 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, and
 *   IDIV/DIV take all their input from AX too.  This means we have to abstract
 *   the input loads and the result storing.
2427 */
2428
2429DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2430{
2431# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2432 pQuotient->s.Lo = 0;
2433 pQuotient->s.Hi = 0;
2434# endif
2435 RTUINT128U Divisor;
2436 Divisor.s.Lo = u64Divisor;
2437 Divisor.s.Hi = 0;
2438 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2439}
2440
/*
 * Operand access helpers for the MUL/IMUL/DIV/IDIV emitters.
 *
 * They expand inside the emitted functions and reference that function's
 * parameters by name: puA / puD for the xAX / xDX pair, puAX for the 8-bit
 * forms.  Union arguments must provide a full width member 'u' and the halves
 * 's.Lo' / 's.Hi'.  All macro arguments are parenthesized so that expressions
 * like '*pValue' can be passed safely.
 */

/** Loads the dividend from *puD (high half) and *puA (low half). */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
/** Loads the 16-bit dividend of the 8-bit forms from *puAX. */
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/** Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uRemainder)     *puA  = (a_Quotient), *puD = (a_uRemainder)
/** Stores the quotient in the low byte and the remainder in the high byte of *puAX. */
# define DIV_STORE_U8(a_Quotient, a_uRemainder)  *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/** Fetches the implicit first factor from *puA. */
# define MUL_LOAD_F1()                           *puA
/** Fetches the implicit first factor of the 8-bit forms (low byte of *puAX). */
# define MUL_LOAD_F1_U8()                        ((uint8_t)*puAX)

/** Stores the low half of the product in *puA and the high half in *puD. */
# define MUL_STORE(a_Result)                     *puA  = (a_Result).s.Lo, *puD = (a_Result).s.Hi
/** Stores the whole 16-bit product of the 8-bit forms in *puAX. */
# define MUL_STORE_U8(a_Result)                  *puAX = (a_Result).u

/** Negates a double width value in place using native integer arithmetic. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
/** Negates a 128-bit value in place via RTUInt128AssignNeg. */
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/** Multiplies two factors into a double width result using native arithmetic. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
/** Multiplies two 64-bit factors into a 128-bit result via RTUInt128MulU64ByU64.
 * @note The expansion ends with a semicolon of its own; this is kept so that
 *       existing expansions stay unchanged (users add another, harmless one). */
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);

/** Computes quotient and remainder using native integer arithmetic. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
/** Computes 128-bit quotient and remainder via RTUInt128DivRemByU64. */
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2470
2471
2472/*
2473 * MUL
2474 */
2475# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2476IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2477{ \
2478 RTUINT ## a_cBitsWidth2x ## U Result; \
2479 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2480 a_fnStore(Result); \
2481 \
2482 /* Calc EFLAGS: */ \
2483 uint32_t fEfl = *pfEFlags; \
2484 if (a_fIntelFlags) \
2485 { /* Intel: 6700K and 10980XE behavior */ \
2486 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2487 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2488 fEfl |= X86_EFL_SF; \
2489 fEfl |= IEM_EFL_CALC_PARITY(Result.s.Lo); \
2490 if (Result.s.Hi != 0) \
2491 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2492 } \
2493 else \
2494 { /* AMD: 3990X */ \
2495 if (Result.s.Hi != 0) \
2496 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2497 else \
2498 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2499 } \
2500 *pfEFlags = fEfl; \
2501 return 0; \
2502} \
2503
2504# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2505 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2506 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2507 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2508
2509# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2510EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2511 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2512# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2513EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2514 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2515EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2516 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2517EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2518 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2519# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2520# endif /* !DOXYGEN_RUNNING */
2521
2522/*
2523 * MULX
2524 */
2525# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2526IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2527 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2528{ \
2529 RTUINT ## a_cBitsWidth2x ## U Result; \
2530 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2531 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2532 *puDst1 = Result.s.Hi; \
2533} \
2534
2535# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2536EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2537EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2538# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2539EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2540EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2541# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2542# endif /* !DOXYGEN_RUNNING */
2543
2544
2545/*
2546 * IMUL
2547 *
 * The SF, ZF, AF and PF flags are "undefined".  AMD (3990X) leaves these
 * flags as is, whereas Intel (6700K (Skylake) and 10980XE (Cascade Lake))
 * always clears AF and ZF and calculates SF and PF from the lower half of the
 * result.
2551 */
2552# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2553 a_Suffix, a_fIntelFlags) \
2554IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2555{ \
2556 RTUINT ## a_cBitsWidth2x ## U Result; \
2557 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2558 \
2559 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2560 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2561 { \
2562 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2563 { \
2564 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2565 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2566 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2567 } \
2568 else \
2569 { \
2570 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2571 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2572 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2573 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2574 a_fnNeg(Result, a_cBitsWidth2x); \
2575 } \
2576 } \
2577 else \
2578 { \
2579 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2580 { \
2581 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2582 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2583 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2584 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2585 a_fnNeg(Result, a_cBitsWidth2x); \
2586 } \
2587 else \
2588 { \
2589 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2590 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2591 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2592 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2593 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2594 } \
2595 } \
2596 a_fnStore(Result); \
2597 \
2598 if (a_fIntelFlags) \
2599 { \
2600 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2601 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2602 fEfl |= X86_EFL_SF; \
2603 fEfl |= IEM_EFL_CALC_PARITY(Result.s.Lo & 0xff); \
2604 } \
2605 *pfEFlags = fEfl; \
2606 return 0; \
2607}
2608# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2609 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2610 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2611 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2612
2613# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2614EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2615 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2616# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2617EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2618 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2619EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2620 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2621EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2622 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2623# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2624# endif /* !DOXYGEN_RUNNING */
2625
2626
2627/*
2628 * IMUL with two operands are mapped onto the three operand variant, ignoring
2629 * the high part of the product.
2630 */
2631# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2632IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_imul_two_u ## a_cBits,(uint32_t fEFlags, a_uType *puDst, a_uType uSrc)) \
2633{ \
2634 a_uType uIgn; \
2635 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, &fEFlags); \
2636 return fEFlags; \
2637} \
2638\
2639IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_imul_two_u ## a_cBits ## _intel,(uint32_t fEFlags, a_uType *puDst, a_uType uSrc)) \
2640{ \
2641 a_uType uIgn; \
2642 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, &fEFlags); \
2643 return fEFlags; \
2644} \
2645\
2646IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_imul_two_u ## a_cBits ## _amd,(uint32_t fEFlags, a_uType *puDst, a_uType uSrc)) \
2647{ \
2648 a_uType uIgn; \
2649 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, &fEFlags); \
2650 return fEFlags; \
2651}
2652
2653EMIT_IMUL_TWO(64, uint64_t)
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655EMIT_IMUL_TWO(32, uint32_t)
2656EMIT_IMUL_TWO(16, uint16_t)
2657# endif
2658
2659
2660/*
2661 * DIV
2662 */
2663# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2664 a_Suffix, a_fIntelFlags) \
2665IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2666{ \
2667 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2668 a_fnLoad(Dividend); \
2669 if ( uDivisor != 0 \
2670 && Dividend.s.Hi < uDivisor) \
2671 { \
2672 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2673 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2674 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2675 \
2676 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2677 if (!a_fIntelFlags) \
2678 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2679 return 0; \
2680 } \
2681 /* #DE */ \
2682 return -1; \
2683}
2684# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2685 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2686 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2687 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2688
2689# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2690EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2691 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2692# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2693EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2694 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2695EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2696 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2697EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2698 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2699# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2700# endif /* !DOXYGEN_RUNNING */
2701
2702
2703/*
2704 * IDIV
2705 *
2706 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2707 * set AF and clear PF, ZF and SF just like it does for DIV.
2708 *
2709 */
/*
 * Emits one signed division worker named iemAImpl_idiv_u<a_cBitsWidth><a_Suffix>.
 *
 * The division is carried out on the magnitudes of dividend and divisor, and
 * the signs are applied afterwards: the quotient is negative when exactly one
 * of the inputs is negative, and the remainder takes the sign of the dividend.
 * The function returns 0 on success and -1 when the caller should raise #DE
 * (zero divisor, or quotient magnitude outside the signed a_cBitsWidth range).
 *
 * EFLAGS are left alone when a_fIntelFlags is non-zero; otherwise AF is set and
 * PF, ZF and SF are cleared.  a_CallArgs is not used by the expansion.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fSignedDividend) \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    \
    bool const fSignedDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t const uDivisorPositive = fSignedDivisor \
        ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uDivisor) \
        : uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
    \
    /* \
     * Check for overflow.  A negative quotient may have a magnitude of up to \
     * 2^(width-1), a positive one only up to the signed maximum. \
     */ \
    bool const     fNegQuotient  = fSignedDividend != fSignedDivisor; \
    uint64_t const uMaxMagnitude = fNegQuotient \
                                 ? RT_BIT_64(a_cBitsWidth - 1) \
                                 : (uint64_t)(uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX; \
    if (   Quotient.s.Hi != 0 \
        || Quotient.s.Lo > uMaxMagnitude) \
        return -1; /* #DE */ \
    \
    /* \
     * Apply the signs and store the result. \
     */ \
    uint ## a_cBitsWidth ## _t const uQuotientOut  = fNegQuotient \
        ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo) \
        : (uint ## a_cBitsWidth ## _t)Quotient.s.Lo; \
    uint ## a_cBitsWidth ## _t const uRemainderOut = fSignedDividend \
        ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo) \
        : (uint ## a_cBitsWidth ## _t)Remainder.s.Lo; \
    a_fnStore(uQuotientOut, uRemainderOut); \
    \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
2794# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2795 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2796 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2797 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2798
2799# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2800EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2801 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2802# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2803EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2804 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2805EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2806 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2807EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2808 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2809# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2810# endif /* !DOXYGEN_RUNNING */
2811
2812#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2813
2814
2815/*********************************************************************************************************************************
2816* Unary operations. *
2817*********************************************************************************************************************************/
2818#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2819
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Recalculates PF, AF, ZF, SF and OF in *a_pfEFlags after an INC or DEC
 * instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro; it yields no value.  The arguments are evaluated
 * more than once.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Drop all status bits except CF, which INC/DEC preserve. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= IEM_EFL_CALC_PARITY(a_uResult); \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        if ((a_OfMethod) == 0) /* INC: sign bit clear before, set after. */ \
            fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)); \
        else                   /* DEC: sign bit set before, clear after. */ \
            fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2845
2846/*
2847 * INC
2848 */
2849
2850IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2851{
2852 uint64_t uDst = *puDst;
2853 uint64_t uResult = uDst + 1;
2854 *puDst = uResult;
2855 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2856}
2857
2858# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2859
2860IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2861{
2862 uint32_t uDst = *puDst;
2863 uint32_t uResult = uDst + 1;
2864 *puDst = uResult;
2865 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2866}
2867
2868
2869IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2870{
2871 uint16_t uDst = *puDst;
2872 uint16_t uResult = uDst + 1;
2873 *puDst = uResult;
2874 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2875}
2876
2877IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2878{
2879 uint8_t uDst = *puDst;
2880 uint8_t uResult = uDst + 1;
2881 *puDst = uResult;
2882 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2883}
2884
2885# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2886
2887
2888/*
2889 * DEC
2890 */
2891
2892IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2893{
2894 uint64_t uDst = *puDst;
2895 uint64_t uResult = uDst - 1;
2896 *puDst = uResult;
2897 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2898}
2899
2900# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2901
2902IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2903{
2904 uint32_t uDst = *puDst;
2905 uint32_t uResult = uDst - 1;
2906 *puDst = uResult;
2907 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2908}
2909
2910
2911IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2912{
2913 uint16_t uDst = *puDst;
2914 uint16_t uResult = uDst - 1;
2915 *puDst = uResult;
2916 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2917}
2918
2919
2920IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2921{
2922 uint8_t uDst = *puDst;
2923 uint8_t uResult = uDst - 1;
2924 *puDst = uResult;
2925 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2926}
2927
2928# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2929
2930
2931/*
2932 * NOT
2933 */
2934
2935IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2936{
2937 uint64_t uDst = *puDst;
2938 uint64_t uResult = ~uDst;
2939 *puDst = uResult;
2940 /* EFLAGS are not modified. */
2941 RT_NOREF_PV(pfEFlags);
2942}
2943
2944# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2945
2946IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2947{
2948 uint32_t uDst = *puDst;
2949 uint32_t uResult = ~uDst;
2950 *puDst = uResult;
2951 /* EFLAGS are not modified. */
2952 RT_NOREF_PV(pfEFlags);
2953}
2954
2955IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2956{
2957 uint16_t uDst = *puDst;
2958 uint16_t uResult = ~uDst;
2959 *puDst = uResult;
2960 /* EFLAGS are not modified. */
2961 RT_NOREF_PV(pfEFlags);
2962}
2963
2964IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2965{
2966 uint8_t uDst = *puDst;
2967 uint8_t uResult = ~uDst;
2968 *puDst = uResult;
2969 /* EFLAGS are not modified. */
2970 RT_NOREF_PV(pfEFlags);
2971}
2972
2973# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2974
2975
2976/*
2977 * NEG
2978 */
2979
2980/**
2981 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
2982 *
2983 * @returns Status bits.
2984 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2985 * @param a_uResult Unsigned result value.
2986 * @param a_uDst The original destination value (for AF calc).
2987 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2988 */
2989#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2990 do { \
2991 uint32_t fEflTmp = *(a_pfEFlags); \
2992 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2993 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2994 fEflTmp |= IEM_EFL_CALC_PARITY(a_uResult); \
2995 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2996 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2997 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2998 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2999 *(a_pfEFlags) = fEflTmp; \
3000 } while (0)
3001
3002IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
3003{
3004 uint64_t uDst = *puDst;
3005 uint64_t uResult = (uint64_t)0 - uDst;
3006 *puDst = uResult;
3007 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
3008}
3009
3010# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3011
3012IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
3013{
3014 uint32_t uDst = *puDst;
3015 uint32_t uResult = (uint32_t)0 - uDst;
3016 *puDst = uResult;
3017 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
3018}
3019
3020
3021IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
3022{
3023 uint16_t uDst = *puDst;
3024 uint16_t uResult = (uint16_t)0 - uDst;
3025 *puDst = uResult;
3026 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
3027}
3028
3029
3030IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
3031{
3032 uint8_t uDst = *puDst;
3033 uint8_t uResult = (uint8_t)0 - uDst;
3034 *puDst = uResult;
3035 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
3036}
3037
3038# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3039
3040/*
3041 * Locked variants.
3042 */
3043
/** Emits iemAImpl_<a_Mnemonic>_u<a_cBitsWidth>_locked, the locked flavour of a
 *  unary operand operation.
 *
 *  The generated function applies the non-locked worker to a private copy of
 *  the operand and publishes the result with a compare-and-exchange, retrying
 *  until the exchange succeeds.  EFLAGS are written back only once, from the
 *  attempt that succeeded. */
# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        for (;;) \
        { \
            /* Work on copies so a failed attempt leaves no trace. */ \
            uint ## a_cBitsWidth ## _t uNew    = uExpected; \
            uint32_t                   fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, &fEflNew); \
            \
            /* uExpected is passed as the output too, so the next attempt starts from what it returns. */ \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
            { \
                *pfEFlags = fEflNew; \
                return; \
            } \
        } \
    }
3060
3061EMIT_LOCKED_UNARY_OP(inc, 64)
3062EMIT_LOCKED_UNARY_OP(dec, 64)
3063EMIT_LOCKED_UNARY_OP(not, 64)
3064EMIT_LOCKED_UNARY_OP(neg, 64)
3065# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3066EMIT_LOCKED_UNARY_OP(inc, 32)
3067EMIT_LOCKED_UNARY_OP(dec, 32)
3068EMIT_LOCKED_UNARY_OP(not, 32)
3069EMIT_LOCKED_UNARY_OP(neg, 32)
3070
3071EMIT_LOCKED_UNARY_OP(inc, 16)
3072EMIT_LOCKED_UNARY_OP(dec, 16)
3073EMIT_LOCKED_UNARY_OP(not, 16)
3074EMIT_LOCKED_UNARY_OP(neg, 16)
3075
3076EMIT_LOCKED_UNARY_OP(inc, 8)
3077EMIT_LOCKED_UNARY_OP(dec, 8)
3078EMIT_LOCKED_UNARY_OP(not, 8)
3079EMIT_LOCKED_UNARY_OP(neg, 8)
3080# endif
3081
3082#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
3083
3084
3085/*********************************************************************************************************************************
3086* Shifting and Rotating *
3087*********************************************************************************************************************************/
3088
3089/*
3090 * ROL
3091 */
/*
 * Emits iemAImpl_rol_u<a_cBitsWidth><a_Suffix>: rotates *puDst left by cShift
 * using a_fnHlp and returns the updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise; a
 * masked count of zero changes neither the operand nor EFLAGS.  For 8 and
 * 16-bit operands the rotation itself uses the count reduced to the operand
 * width, while the flags are still recalculated.
 *
 * Only CF and OF are modified.  CF is bit 0 of the result.  OF is undefined for
 * counts above one; it is calculated from the last sub-shift when
 * a_fIntelFlags is zero (AMD 3990X) and from the first one otherwise (Intel
 * 10980XE).
 */
#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (!cShift) \
        return fEFlags; \
    if (a_cBitsWidth < 32) \
        cShift &= a_cBitsWidth - 1; \
    \
    a_uType const uBefore = *puDst; \
    a_uType const uAfter  = a_fnHlp(uBefore, cShift); \
    *puDst = uAfter; \
    \
    /* Calc EFLAGS. */ \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    uint32_t const fCarry = (uAfter & X86_EFL_CF); \
    fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
    fEFlags |= fCarry; \
    if (a_fIntelFlags) /* Intel 10980XE: According to the first sub-shift: */ \
        fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uBefore ^ (uBefore << 1)); \
    else               /* AMD 3990X: According to the last sub-shift: */ \
        fEFlags |= ((uAfter >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
    return fEFlags; \
}
3117
3118#ifndef RT_ARCH_ARM64
3119
3120# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3121EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3122# endif
3123EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3124EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3125
3126# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3127EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3128# endif
3129EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3130EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3131
3132DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3133{
3134 return (uValue << cShift) | (uValue >> (16 - cShift));
3135}
3136# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3137EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3138# endif
3139EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3140EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3141
3142DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3143{
3144 return (uValue << cShift) | (uValue >> (8 - cShift));
3145}
3146# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3147EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3148# endif
3149EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3150EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3151
3152#endif /* !RT_ARCH_ARM64 */
3153
3154/*
3155 * ROR
3156 */
/*
 * Emits iemAImpl_ror_u<a_cBitsWidth><a_Suffix>: rotates *puDst right by cShift
 * using a_fnHlp and returns the updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise; a
 * masked count of zero changes neither the operand nor EFLAGS.  For 8 and
 * 16-bit operands the rotation itself uses the count reduced to the operand
 * width, while the flags are still recalculated.
 *
 * Only CF and OF are modified.  CF is the most significant bit of the result.
 * OF is calculated from the last sub-shift when a_fIntelFlags is zero (AMD
 * 3990X) and from the first one otherwise (Intel 10980XE).
 */
#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (!cShift) \
        return fEFlags; \
    if (a_cBitsWidth < 32) \
        cShift &= a_cBitsWidth - 1; \
    \
    a_uType const uBefore = *puDst; \
    a_uType const uAfter  = a_fnHlp(uBefore, cShift); \
    *puDst = uAfter; \
    \
    /* Calc EFLAGS: */ \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    uint32_t const fCarry = (uAfter >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
    fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
    fEFlags |= fCarry; \
    if (a_fIntelFlags) /* Intel 10980XE: According to the first sub-shift: */ \
        fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uBefore ^ (uBefore << (a_cBitsWidth - 1))); \
    else               /* AMD 3990X: According to the last sub-shift: */ \
        fEFlags |= (((uAfter >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
    return fEFlags; \
}
3181
3182#ifndef RT_ARCH_ARM64
3183
3184# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3186# endif
3187EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3188EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3189
3190# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3192# endif
3193EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3194EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3195
3196DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3197{
3198 return (uValue >> cShift) | (uValue << (16 - cShift));
3199}
3200# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3201EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3202# endif
3203EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3204EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3205
3206DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3207{
3208 return (uValue >> cShift) | (uValue << (8 - cShift));
3209}
3210# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3211EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3212# endif
3213EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3214EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3215
3216#endif /* !RT_ARCH_ARM64 */
3217
3218/*
3219 * RCL
3220 */
/*
 * Emits iemAImpl_rcl_u<a_cBitsWidth><a_Suffix>: rotates *puDst left through CF
 * by cShift and returns the updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise.  For
 * 8 and 16-bit operands it is additionally reduced modulo width + 1.  With
 * a_fIntelFlags set this happens before the zero check, so a count that
 * reduces to zero changes nothing at all.  Without it (AMD) the reduction
 * happens after the zero check, so such a count leaves the operand and CF as
 * they were but still recalculates OF.
 *
 * Only CF and OF are modified.
 */
#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; /* may leave cShift at zero */ \
        a_uType const uDst    = *puDst; \
        /* Shift as 64-bit unsigned so a 16-bit operand shifted by 16 cannot \
           overflow the signed int it would otherwise be promoted to. */ \
        a_uType       uResult = (a_uType)((uint64_t)uDst << cShift); \
        if (cShift > 1) \
            uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t const fInCarry = fEFlags & X86_EFL_CF; \
        /* The incoming carry lands in bit cShift - 1.  Skip this when the \
           AMD reduction above left a zero count: the operand is unchanged \
           then and a shift by -1 would be undefined. */ \
        if (cShift) \
            uResult |= (a_uType)fInCarry << (cShift - 1); \
        \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
        fEFlags |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEFlags |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
        else                /* Intel 10980XE: According to the first sub-shift: */ \
            fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
    } \
    return fEFlags; \
}
3254
3255#ifndef RT_ARCH_ARM64
3256
3257# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3258EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3259# endif
3260EMIT_RCL(64, uint64_t, _intel, 1)
3261EMIT_RCL(64, uint64_t, _amd, 0)
3262
3263# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3264EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3265# endif
3266EMIT_RCL(32, uint32_t, _intel, 1)
3267EMIT_RCL(32, uint32_t, _amd, 0)
3268
3269# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3270EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3271# endif
3272EMIT_RCL(16, uint16_t, _intel, 1)
3273EMIT_RCL(16, uint16_t, _amd, 0)
3274
3275# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3276EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3277# endif
3278EMIT_RCL(8, uint8_t, _intel, 1)
3279EMIT_RCL(8, uint8_t, _amd, 0)
3280
3281#endif /* !RT_ARCH_ARM64 */
3282
3283
3284/*
3285 * RCR
3286 */
/*
 * Emits iemAImpl_rcr_u<a_cBitsWidth><a_Suffix>: rotates *puDst right through
 * CF by cShift and returns the updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise.  For
 * 8 and 16-bit operands it is additionally reduced modulo width + 1, before
 * the zero check when a_fIntelFlags is set and after it otherwise.
 *
 * Only CF and OF are modified.  The OF bit is undefined if cShift > 1; it is
 * implemented the same way as for 1 bit shifts.
 */
#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (!cShift) \
        return fEFlags; \
    if (a_cBitsWidth < 32 && !a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    uint32_t const fInCarry = fEFlags & X86_EFL_CF; \
    a_uType const  uDst     = *puDst; \
    \
    /* Low part, wrapped-around part (only for counts above one), incoming carry. */ \
    a_uType uResult = uDst >> cShift; \
    if (cShift > 1) \
        uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
    uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
    *puDst = uResult; \
    \
    /* Calc EFLAGS. */ \
    uint32_t fOutCarry = fInCarry; \
    if (a_cBitsWidth >= 32 || a_fIntelFlags || cShift) \
        fOutCarry = (uDst >> (cShift - 1)) & X86_EFL_CF; \
    fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
    fEFlags |= fOutCarry; \
    if (a_fIntelFlags) /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
        fEFlags |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
    else               /* AMD 3990X: XOR two most significant bits of the result: */ \
        fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
    return fEFlags; \
}
3320
3321#ifndef RT_ARCH_ARM64
3322
3323#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3324EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3325#endif
3326EMIT_RCR(64, uint64_t, _intel, 1)
3327EMIT_RCR(64, uint64_t, _amd, 0)
3328
3329# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3330EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3331# endif
3332EMIT_RCR(32, uint32_t, _intel, 1)
3333EMIT_RCR(32, uint32_t, _amd, 0)
3334
3335# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3336EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3337# endif
3338EMIT_RCR(16, uint16_t, _intel, 1)
3339EMIT_RCR(16, uint16_t, _amd, 0)
3340
3341# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3342EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3343# endif
3344EMIT_RCR(8, uint8_t, _intel, 1)
3345EMIT_RCR(8, uint8_t, _amd, 0)
3346
3347#endif /* !RT_ARCH_ARM64 */
3348
3349
3350/*
3351 * SHL
3352 */
/*
 * Emits iemAImpl_shl_u<a_cBitsWidth><a_Suffix>: shifts *puDst left by cShift
 * and returns the updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise, so 8
 * and 16-bit operands can see counts larger than their width.  A masked count
 * of zero changes neither the operand nor EFLAGS.
 *
 * All status flags are recalculated: CF is the last bit shifted out (zero
 * when the count exceeds the operand width), PF, ZF and SF follow the result,
 * OF follows the last sub-shift when a_fIntelFlags is zero (AMD 3990X) and
 * the first one otherwise (Intel 10980XE), and AF is set when a_fIntelFlags
 * is zero and cleared otherwise.
 */
#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst    = *puDst; \
        /* Shift as 64-bit unsigned: an 8 or 16-bit operand is otherwise \
           promoted to signed int, which counts of up to 31 can overflow. */ \
        a_uType const uResult = (a_uType)((uint64_t)uDst << cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        fEFlags &= ~X86_EFL_STATUS_BITS; \
        /* CF is the last bit shifted out.  Counts above the operand width \
           (8 and 16-bit operands only) would make the shift count below \
           negative, which is undefined; CF is zero then, as every operand \
           bit has long been shifted out. */ \
        uint32_t const fCarry = cShift <= a_cBitsWidth \
                              ? (uint32_t)(uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF \
                              : 0; \
        fEFlags |= fCarry; \
        if (!a_fIntelFlags) \
            fEFlags |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
        else \
            fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
        fEFlags |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEFlags |= X86_EFL_CALC_ZF(uResult); \
        fEFlags |= IEM_EFL_CALC_PARITY(uResult); \
        if (!a_fIntelFlags) \
            fEFlags |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
    } \
    return fEFlags; \
}
3380
3381#if !defined(RT_ARCH_ARM64)
3382
3383# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3384EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3385# endif
3386EMIT_SHL(64, uint64_t, _intel, 1)
3387EMIT_SHL(64, uint64_t, _amd, 0)
3388
3389# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3390EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3391# endif
3392EMIT_SHL(32, uint32_t, _intel, 1)
3393EMIT_SHL(32, uint32_t, _amd, 0)
3394
3395# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3396EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3397# endif
3398EMIT_SHL(16, uint16_t, _intel, 1)
3399EMIT_SHL(16, uint16_t, _amd, 0)
3400
3401# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3402EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3403# endif
3404EMIT_SHL(8, uint8_t, _intel, 1)
3405EMIT_SHL(8, uint8_t, _amd, 0)
3406
3407#endif /* !RT_ARCH_ARM64 */
3408
3409
3410/*
3411 * SHR
3412 */
/*
 * Emits iemAImpl_shr_u<a_cBitsWidth><a_Suffix>: shifts *puDst right
 * (logically) by cShift and returns the updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise; a
 * masked count of zero changes neither the operand nor EFLAGS.
 *
 * All status flags are recalculated: CF is the last bit shifted out, PF, ZF
 * and SF follow the result, OF is the original most significant bit (for any
 * count when a_fIntelFlags is set, only for a count of one otherwise), and AF
 * is set when a_fIntelFlags is zero and cleared otherwise.
 */
#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (!cShift) \
        return fEFlags; \
    \
    a_uType const uBefore = *puDst; \
    a_uType const uAfter  = uBefore >> cShift; \
    *puDst = uAfter; \
    \
    /* Calc EFLAGS. */ \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    fEFlags &= ~X86_EFL_STATUS_BITS; \
    fEFlags |= (uBefore >> (cShift - 1)) & X86_EFL_CF; \
    if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
        fEFlags |= (uBefore >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
    fEFlags |= X86_EFL_CALC_ZF(uAfter); \
    fEFlags |= X86_EFL_CALC_SF(uAfter, a_cBitsWidth); \
    fEFlags |= IEM_EFL_CALC_PARITY(uAfter); \
    if (!a_fIntelFlags) \
        fEFlags |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
    return fEFlags; \
}
3437
3438#if !defined(RT_ARCH_ARM64)
3439
3440# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3441EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3442# endif
3443EMIT_SHR(64, uint64_t, _intel, 1)
3444EMIT_SHR(64, uint64_t, _amd, 0)
3445
3446# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3447EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3448# endif
3449EMIT_SHR(32, uint32_t, _intel, 1)
3450EMIT_SHR(32, uint32_t, _amd, 0)
3451
3452# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3453EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3454# endif
3455EMIT_SHR(16, uint16_t, _intel, 1)
3456EMIT_SHR(16, uint16_t, _amd, 0)
3457
3458# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3459EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3460# endif
3461EMIT_SHR(8, uint8_t, _intel, 1)
3462EMIT_SHR(8, uint8_t, _amd, 0)
3463
3464#endif /* !RT_ARCH_ARM64 */
3465
3466
3467/*
3468 * SAR
3469 */
/*
 * Emits iemAImpl_sar_u<a_cBitsWidth><a_Suffix>: shifts *puDst right
 * (arithmetically, via the signed type a_iType) by cShift and returns the
 * updated EFLAGS.
 *
 * The count is masked with 63 for 64-bit operands and with 31 otherwise; a
 * masked count of zero changes neither the operand nor EFLAGS.
 *
 * All status flags are recalculated: CF is the last bit shifted out, PF, ZF
 * and SF follow the result, OF is always cleared, and AF is set when
 * a_fIntelFlags is zero and cleared otherwise.
 */
#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (!cShift) \
        return fEFlags; \
    \
    a_iType const iBefore = (a_iType)*puDst; \
    a_uType const uAfter  = iBefore >> cShift; \
    *puDst = uAfter; \
    \
    /* Calc EFLAGS. \
       Note! The OF flag is always zero because the result never differs from the input. */ \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    fEFlags &= ~X86_EFL_STATUS_BITS; \
    fEFlags |= (iBefore >> (cShift - 1)) & X86_EFL_CF; \
    fEFlags |= X86_EFL_CALC_ZF(uAfter); \
    fEFlags |= X86_EFL_CALC_SF(uAfter, a_cBitsWidth); \
    fEFlags |= IEM_EFL_CALC_PARITY(uAfter); \
    if (!a_fIntelFlags) \
        fEFlags |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
    return fEFlags; \
}
3493
3494#if !defined(RT_ARCH_ARM64)
3495
3496# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3497EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3498# endif
3499EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3500EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3501
3502# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3503EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3504# endif
3505EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3506EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3507
3508# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3509EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3510# endif
3511EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3512EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3513
3514# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3515EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3516# endif
3517EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3518EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3519
3520#endif /* !RT_ARCH_ARM64 */
3521
3522
3523/*
3524 * SHLD
3525 *
3526 * - CF is the last bit shifted out of puDst.
3527 * - AF is always cleared by Intel 10980XE.
3528 * - AF is always set by AMD 3990X.
3529 * - OF is set according to the first shift on Intel 10980XE, it seems.
3530 * - OF is set according to the last sub-shift on AMD 3990X.
3531 * - ZF, SF and PF are calculated according to the result by both vendors.
3532 *
 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
 * pick either the source register or the destination register for input bits
 * when going beyond 16.  According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3538 */
/*
 * Emits iemAImpl_shld_u<a_cBitsWidth><a_Suffix>: shifts *puDst left by cShift,
 * filling the vacated low bits from the top of uSrc, and updates *pfEFlags.
 *
 * The count is masked with a_cBitsWidth - 1; a masked count of zero changes
 * neither the operand nor EFLAGS.
 */
#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
                                                                           uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (!cShift) \
        return; \
    \
    a_uType const uBefore = *puDst; \
    a_uType const uAfter  = (a_uType)(uBefore << cShift) | (a_uType)(uSrc >> (a_cBitsWidth - cShift)); \
    *puDst = uAfter; \
    \
    /* CALC EFLAGS: */ \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
    fEfl |= (uBefore >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
    fEfl |= X86_EFL_CALC_ZF(uAfter); \
    fEfl |= X86_EFL_CALC_SF(uAfter, a_cBitsWidth); \
    fEfl |= IEM_EFL_CALC_PARITY(uAfter); \
    if (!a_fIntelFlags) \
    {   /* AMD 3990X: Set according to last shift. AF always set. */ \
        fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uBefore << (cShift - 1)) ^ uAfter); \
        fEfl |= X86_EFL_AF; \
    } \
    else \
        /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
        fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uBefore ^ (uBefore << 1)); \
    *pfEFlags = fEfl; \
}
3569
3570#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3571EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3572#endif
3573EMIT_SHLD(64, uint64_t, _intel, 1)
3574EMIT_SHLD(64, uint64_t, _amd, 0)
3575
3576#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3577EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3578#endif
3579EMIT_SHLD(32, uint32_t, _intel, 1)
3580EMIT_SHLD(32, uint32_t, _amd, 0)
3581
3582#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3583IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3584{ \
3585 cShift &= 31; \
3586 if (cShift) \
3587 { \
3588 uint16_t const uDst = *puDst; \
3589 uint64_t const uTmp = a_fIntelFlags \
3590 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3591 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3592 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3593 *puDst = uResult; \
3594 \
3595 /* CALC EFLAGS: */ \
3596 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3597 AssertCompile(X86_EFL_CF_BIT == 0); \
3598 if (a_fIntelFlags) \
3599 { \
3600 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3601 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3602 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3603 } \
3604 else \
3605 { \
3606 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3607 if (cShift < 16) \
3608 { \
3609 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3610 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3611 } \
3612 else \
3613 { \
3614 if (cShift == 16) \
3615 fEfl |= uDst & X86_EFL_CF; \
3616 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3617 } \
3618 fEfl |= X86_EFL_AF; \
3619 } \
3620 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3621 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3622 fEfl |= X86_EFL_CALC_ZF(uResult); \
3623 *pfEFlags = fEfl; \
3624 } \
3625}
3626
3627#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3628EMIT_SHLD_16(RT_NOTHING, 1)
3629#endif
3630EMIT_SHLD_16(_intel, 1)
3631EMIT_SHLD_16(_amd, 0)
3632
3633
3634/*
3635 * SHRD
3636 *
3637 * EFLAGS behaviour seems to be the same as with SHLD:
3638 * - CF is the last bit shifted out of puDst.
3639 * - AF is always cleared by Intel 10980XE.
3640 * - AF is always set by AMD 3990X.
3641 * - OF is set according to the first shift on Intel 10980XE, it seems.
3642 * - OF is set according to the last sub-shift on AMD 3990X.
3643 * - ZF, SF and PF are calculated according to the result by both vendors.
3644 *
3645 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3646 * pick either the source register or the destination register for input bits
3647 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3648 * Intel has changed behaviour here several times. We implement what current
3649 * Skylake-based CPUs do for now; this can be extended later as needed.
3650 */
3651#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3652IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3653{ \
3654 cShift &= a_cBitsWidth - 1; \
3655 if (cShift) \
3656 { \
3657 a_uType const uDst = *puDst; \
3658 a_uType uResult = uDst >> cShift; \
3659 uResult |= uSrc << (a_cBitsWidth - cShift); \
3660 *puDst = uResult; \
3661 \
3662 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3663 AssertCompile(X86_EFL_CF_BIT == 0); \
3664 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3665 if (a_fIntelFlags) \
3666 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3667 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3668 else \
3669 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3670 if (cShift > 1) /* Set according to last shift. */ \
3671 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3672 else \
3673 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3674 fEfl |= X86_EFL_AF; \
3675 } \
3676 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3677 fEfl |= X86_EFL_CALC_ZF(uResult); \
3678 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3679 *pfEFlags = fEfl; \
3680 } \
3681}
3682
3683#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3684EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3685#endif
3686EMIT_SHRD(64, uint64_t, _intel, 1)
3687EMIT_SHRD(64, uint64_t, _amd, 0)
3688
3689#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3690EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3691#endif
3692EMIT_SHRD(32, uint32_t, _intel, 1)
3693EMIT_SHRD(32, uint32_t, _amd, 0)
3694
3695#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3696IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3697{ \
3698 cShift &= 31; \
3699 if (cShift) \
3700 { \
3701 uint16_t const uDst = *puDst; \
3702 uint64_t const uTmp = a_fIntelFlags \
3703 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3704 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3705 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3706 *puDst = uResult; \
3707 \
3708 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3709 AssertCompile(X86_EFL_CF_BIT == 0); \
3710 if (a_fIntelFlags) \
3711 { \
3712 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3713 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3714 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3715 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3716 } \
3717 else \
3718 { \
3719 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3720 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3721 /* AMD 3990X: Set according to last shift. AF always set. */ \
3722 if (cShift > 1) /* Set according to last shift. */ \
3723 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3724 else \
3725 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3726 fEfl |= X86_EFL_AF; \
3727 } \
3728 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3729 fEfl |= X86_EFL_CALC_ZF(uResult); \
3730 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3731 *pfEFlags = fEfl; \
3732 } \
3733}
3734
3735#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3736EMIT_SHRD_16(RT_NOTHING, 1)
3737#endif
3738EMIT_SHRD_16(_intel, 1)
3739EMIT_SHRD_16(_amd, 0)
3740
3741
3742/*
3743 * RORX (BMI2)
3744 */
3745#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3746IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3747{ \
3748 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3749}
3750
3751#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3752EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3753#endif
3754#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3755EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3756#endif
3757
3758
3759/*
3760 * SHLX (BMI2)
3761 */
3762#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3763IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3764{ \
3765 cShift &= a_cBitsWidth - 1; \
3766 *puDst = uSrc << cShift; \
3767}
3768
3769#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3770EMIT_SHLX(64, uint64_t, RT_NOTHING)
3771EMIT_SHLX(64, uint64_t, _fallback)
3772#endif
3773#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3774EMIT_SHLX(32, uint32_t, RT_NOTHING)
3775EMIT_SHLX(32, uint32_t, _fallback)
3776#endif
3777
3778
3779/*
3780 * SHRX (BMI2)
3781 */
3782#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3783IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3784{ \
3785 cShift &= a_cBitsWidth - 1; \
3786 *puDst = uSrc >> cShift; \
3787}
3788
3789#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3790EMIT_SHRX(64, uint64_t, RT_NOTHING)
3791EMIT_SHRX(64, uint64_t, _fallback)
3792#endif
3793#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3794EMIT_SHRX(32, uint32_t, RT_NOTHING)
3795EMIT_SHRX(32, uint32_t, _fallback)
3796#endif
3797
3798
3799/*
3800 * SARX (BMI2)
3801 */
3802#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3803IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3804{ \
3805 cShift &= a_cBitsWidth - 1; \
3806 *puDst = (a_iType)uSrc >> cShift; \
3807}
3808
3809#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3810EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3811EMIT_SARX(64, uint64_t, int64_t, _fallback)
3812#endif
3813#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3814EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3815EMIT_SARX(32, uint32_t, int32_t, _fallback)
3816#endif
3817
3818
3819/*
3820 * PDEP (BMI2)
3821 */
3822#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3823IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3824{ \
3825 a_uType uResult = 0; \
3826 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3827 if (fMask & ((a_uType)1 << iMaskBit)) \
3828 { \
3829 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3830 iBit++; \
3831 } \
3832 *puDst = uResult; \
3833}
3834
3835#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3836EMIT_PDEP(64, uint64_t, RT_NOTHING)
3837#endif
3838EMIT_PDEP(64, uint64_t, _fallback)
3839#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3840EMIT_PDEP(32, uint32_t, RT_NOTHING)
3841#endif
3842EMIT_PDEP(32, uint32_t, _fallback)
3843
3844/*
3845 * PEXT (BMI2)
3846 */
3847#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3848IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3849{ \
3850 a_uType uResult = 0; \
3851 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3852 if (fMask & ((a_uType)1 << iMaskBit)) \
3853 { \
3854 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3855 iBit++; \
3856 } \
3857 *puDst = uResult; \
3858}
3859
3860#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3861EMIT_PEXT(64, uint64_t, RT_NOTHING)
3862#endif
3863EMIT_PEXT(64, uint64_t, _fallback)
3864#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3865EMIT_PEXT(32, uint32_t, RT_NOTHING)
3866#endif
3867EMIT_PEXT(32, uint32_t, _fallback)
3868
3869
3870#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3871
3872# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3873/*
3874 * BSWAP
3875 */
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3878{
3879 *puDst = ASMByteSwapU64(*puDst);
3880}
3881
3882
3883IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3884{
3885 *puDst = ASMByteSwapU32(*puDst);
3886}
3887
3888
3889/* Note! undocument, so 32-bit arg */
3890IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3891{
3892#if 0
3893 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3894#else
3895 /* This is the behaviour AMD 3990x (64-bit mode): */
3896 *(uint16_t *)puDst = 0;
3897#endif
3898}
3899
3900# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3901
3902
3903
3904# if defined(IEM_WITHOUT_ASSEMBLY)
3905
3906/*
3907 * LFENCE, SFENCE & MFENCE.
3908 */
3909
/**
 * LFENCE worker for builds without assembly; forwards to ASMReadFence().
 */
3910IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3911{
3912 ASMReadFence();
3913}
3914
3915
/**
 * SFENCE worker for builds without assembly; forwards to ASMWriteFence().
 */
3916IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3917{
3918 ASMWriteFence();
3919}
3920
3921
/**
 * MFENCE worker for builds without assembly; forwards to ASMMemoryFence().
 */
3922IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3923{
3924 ASMMemoryFence();
3925}
3926
3927
3928# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker; forwards to ASMMemoryFence() just like
 * iemAImpl_mfence.  Not built for ARM64 hosts.
 */
3929IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3930{
3931 ASMMemoryFence();
3932}
3933# endif
3934
3935# endif
3936
3937#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3938
3939
3940IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_arpl,(uint32_t fEFlags, uint16_t *pu16Dst, uint16_t u16Src))
3941{
3942 uint16_t u16Dst = *pu16Dst;
3943 if ((u16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3944 {
3945 u16Dst &= X86_SEL_MASK_OFF_RPL;
3946 u16Dst |= u16Src & X86_SEL_RPL;
3947 *pu16Dst = u16Dst;
3948
3949 fEFlags |= X86_EFL_ZF;
3950 }
3951 else
3952 fEFlags &= ~X86_EFL_ZF;
3953 return fEFlags;
3954}
3955
3956
3957#if defined(IEM_WITHOUT_ASSEMBLY)
3958
3959/*********************************************************************************************************************************
3960* x87 FPU Loads *
3961*********************************************************************************************************************************/
3962
3963IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3964{
3965 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3966 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3967 {
3968 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3969 pFpuRes->r80Result.sj64.fInteger = 1;
3970 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3971 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3972 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3973 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3974 }
3975 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3976 {
3977 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3978 pFpuRes->r80Result.s.uExponent = 0;
3979 pFpuRes->r80Result.s.uMantissa = 0;
3980 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3981 }
3982 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3983 {
3984 /* Subnormal values gets normalized. */
3985 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3986 pFpuRes->r80Result.sj64.fInteger = 1;
3987 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3988 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3989 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3990 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3991 pFpuRes->FSW |= X86_FSW_DE;
3992 if (!(pFpuState->FCW & X86_FCW_DM))
3993 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3994 }
3995 else if (RTFLOAT32U_IS_INF(pr32Val))
3996 {
3997 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3998 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3999 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
4000 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
4001 }
4002 else
4003 {
4004 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
4005 Assert(RTFLOAT32U_IS_NAN(pr32Val));
4006 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
4007 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
4008 pFpuRes->r80Result.sj64.fInteger = 1;
4009 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
4010 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
4011 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
4012 {
4013 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
4014 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4015 pFpuRes->FSW |= X86_FSW_IE;
4016
4017 if (!(pFpuState->FCW & X86_FCW_IM))
4018 {
4019 /* The value is not pushed. */
4020 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
4021 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
4022 pFpuRes->r80Result.au64[0] = 0;
4023 pFpuRes->r80Result.au16[4] = 0;
4024 }
4025 }
4026 else
4027 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4028 }
4029}
4030
4031
4032IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
4033{
4034 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4035 if (RTFLOAT64U_IS_NORMAL(pr64Val))
4036 {
4037 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4038 pFpuRes->r80Result.sj64.fInteger = 1;
4039 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4040 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
4041 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
4042 }
4043 else if (RTFLOAT64U_IS_ZERO(pr64Val))
4044 {
4045 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
4046 pFpuRes->r80Result.s.uExponent = 0;
4047 pFpuRes->r80Result.s.uMantissa = 0;
4048 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
4049 }
4050 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
4051 {
4052 /* Subnormal values gets normalized. */
4053 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4054 pFpuRes->r80Result.sj64.fInteger = 1;
4055 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
4056 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
4057 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
4058 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
4059 pFpuRes->FSW |= X86_FSW_DE;
4060 if (!(pFpuState->FCW & X86_FCW_DM))
4061 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
4062 }
4063 else if (RTFLOAT64U_IS_INF(pr64Val))
4064 {
4065 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
4066 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
4067 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
4068 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
4069 }
4070 else
4071 {
4072 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
4073 Assert(RTFLOAT64U_IS_NAN(pr64Val));
4074 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4075 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
4076 pFpuRes->r80Result.sj64.fInteger = 1;
4077 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4078 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
4079 {
4080 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
4081 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4082 pFpuRes->FSW |= X86_FSW_IE;
4083
4084 if (!(pFpuState->FCW & X86_FCW_IM))
4085 {
4086 /* The value is not pushed. */
4087 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
4088 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
4089 pFpuRes->r80Result.au64[0] = 0;
4090 pFpuRes->r80Result.au16[4] = 0;
4091 }
4092 }
4093 else
4094 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4095 }
4096}
4097
4098
4099IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
4100{
4101 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
4102 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
4103 /* Raises no exceptions. */
4104 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4105}
4106
4107
4108IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4109{
4110 pFpuRes->r80Result.sj64.fSign = 0;
4111 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4112 pFpuRes->r80Result.sj64.fInteger = 1;
4113 pFpuRes->r80Result.sj64.uFraction = 0;
4114
4115 /*
4116 * FPU status word:
4117 * - TOP is irrelevant, but we must match x86 assembly version.
4118 * - C1 is always cleared as we don't have any stack overflows.
4119 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4120 */
4121 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4122}
4123
4124
4125IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4126{
4127 pFpuRes->r80Result.sj64.fSign = 0;
4128 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4129 pFpuRes->r80Result.sj64.fInteger = 1;
4130 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4131 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4132 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4133 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4134}
4135
4136
4137IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4138{
4139 pFpuRes->r80Result.sj64.fSign = 0;
4140 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4141 pFpuRes->r80Result.sj64.fInteger = 1;
4142 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4143 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4144 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4145}
4146
4147
4148IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4149{
4150 pFpuRes->r80Result.sj64.fSign = 0;
4151 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4152 pFpuRes->r80Result.sj64.fInteger = 1;
4153 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4154 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4155 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4156 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4157}
4158
4159
4160IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4161{
4162 pFpuRes->r80Result.sj64.fSign = 0;
4163 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4164 pFpuRes->r80Result.sj64.fInteger = 1;
4165 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4166 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4167 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4168 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4169}
4170
4171
4172IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4173{
4174 pFpuRes->r80Result.sj64.fSign = 0;
4175 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4176 pFpuRes->r80Result.sj64.fInteger = 1;
4177 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4178 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4179 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4180 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4181}
4182
4183
4184IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4185{
4186 pFpuRes->r80Result.s.fSign = 0;
4187 pFpuRes->r80Result.s.uExponent = 0;
4188 pFpuRes->r80Result.s.uMantissa = 0;
4189 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4190}
4191
4192#define EMIT_FILD(a_cBits) \
4193IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4194 int ## a_cBits ## _t const *piVal)) \
4195{ \
4196 int ## a_cBits ## _t iVal = *piVal; \
4197 if (iVal == 0) \
4198 { \
4199 pFpuRes->r80Result.s.fSign = 0; \
4200 pFpuRes->r80Result.s.uExponent = 0; \
4201 pFpuRes->r80Result.s.uMantissa = 0; \
4202 } \
4203 else \
4204 { \
4205 if (iVal > 0) \
4206 pFpuRes->r80Result.s.fSign = 0; \
4207 else \
4208 { \
4209 pFpuRes->r80Result.s.fSign = 1; \
4210 iVal = -iVal; \
4211 } \
4212 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4213 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4214 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4215 } \
4216 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4217}
4218EMIT_FILD(16)
4219EMIT_FILD(32)
4220EMIT_FILD(64)
4221
4222
4223IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4224{
4225 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4226 if ( pd80Val->s.abPairs[0] == 0
4227 && pd80Val->s.abPairs[1] == 0
4228 && pd80Val->s.abPairs[2] == 0
4229 && pd80Val->s.abPairs[3] == 0
4230 && pd80Val->s.abPairs[4] == 0
4231 && pd80Val->s.abPairs[5] == 0
4232 && pd80Val->s.abPairs[6] == 0
4233 && pd80Val->s.abPairs[7] == 0
4234 && pd80Val->s.abPairs[8] == 0)
4235 {
4236 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4237 pFpuRes->r80Result.s.uExponent = 0;
4238 pFpuRes->r80Result.s.uMantissa = 0;
4239 }
4240 else
4241 {
4242 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4243
4244 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4245 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4246 cPairs--;
4247
4248 uint64_t uVal = 0;
4249 uint64_t uFactor = 1;
4250 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4251 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4252 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4253
4254 unsigned const cBits = ASMBitLastSetU64(uVal);
4255 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4256 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4257 }
4258}
4259
4260
4261/*********************************************************************************************************************************
4262* x87 FPU Stores *
4263*********************************************************************************************************************************/
4264
4265/**
4266 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4267 *
4268 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4269 *
4270 * @returns Updated FPU status word value.
4271 * @param fSignIn Incoming sign indicator.
4272 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4273 * @param iExponentIn Unbiased exponent.
4274 * @param fFcw The FPU control word.
4275 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4276 * @param pr32Dst Where to return the output value, if one should be
4277 * returned.
4278 *
4279 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4280 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4281 */
4282static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4283 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4284{
4285 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4286 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4287 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4288 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4289 ? fRoundingOffMask
4290 : 0;
4291 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4292
4293 /*
4294 * Deal with potential overflows/underflows first, optimizing for none.
4295 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4296 */
4297 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4298 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4299 { /* likely? */ }
4300 /*
4301 * Underflow if the exponent zero or negative. This is attempted mapped
4302 * to a subnormal number when possible, with some additional trickery ofc.
4303 */
4304 else if (iExponentOut <= 0)
4305 {
4306 bool const fIsTiny = iExponentOut < 0
4307 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4308 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4309 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4310 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4311
4312 if (iExponentOut <= 0)
4313 {
4314 uMantissaIn = iExponentOut <= -63
4315 ? uMantissaIn != 0
4316 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4317 fRoundedOff = uMantissaIn & fRoundingOffMask;
4318 if (fRoundedOff && fIsTiny)
4319 fFsw |= X86_FSW_UE;
4320 iExponentOut = 0;
4321 }
4322 }
4323 /*
4324 * Overflow if at or above max exponent value or if we will reach max
4325 * when rounding. Will return +/-zero or +/-max value depending on
4326 * whether we're rounding or not.
4327 */
4328 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4329 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4330 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4331 {
4332 fFsw |= X86_FSW_OE;
4333 if (!(fFcw & X86_FCW_OM))
4334 return fFsw | X86_FSW_ES | X86_FSW_B;
4335 fFsw |= X86_FSW_PE;
4336 if (uRoundingAdd)
4337 fFsw |= X86_FSW_C1;
4338 if (!(fFcw & X86_FCW_PM))
4339 fFsw |= X86_FSW_ES | X86_FSW_B;
4340
4341 pr32Dst->s.fSign = fSignIn;
4342 if (uRoundingAdd)
4343 { /* Zero */
4344 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4345 pr32Dst->s.uFraction = 0;
4346 }
4347 else
4348 { /* Max */
4349 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4350 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4351 }
4352 return fFsw;
4353 }
4354
4355 /*
4356 * Normal or subnormal number.
4357 */
4358 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4359 uint64_t uMantissaOut = uMantissaIn;
4360 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4361 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4362 || fRoundedOff != uRoundingAdd)
4363 {
4364 uMantissaOut = uMantissaIn + uRoundingAdd;
4365 if (uMantissaOut >= uMantissaIn)
4366 { /* likely */ }
4367 else
4368 {
4369 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4370 iExponentOut++;
4371 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4372 fFsw |= X86_FSW_C1;
4373 }
4374 }
4375 else
4376 uMantissaOut = uMantissaIn;
4377
4378 /* Truncate the mantissa and set the return value. */
4379 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4380
4381 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4382 pr32Dst->s.uExponent = iExponentOut;
4383 pr32Dst->s.fSign = fSignIn;
4384
4385 /* Set status flags realted to rounding. */
4386 if (fRoundedOff)
4387 {
4388 fFsw |= X86_FSW_PE;
4389 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4390 fFsw |= X86_FSW_C1;
4391 if (!(fFcw & X86_FCW_PM))
4392 fFsw |= X86_FSW_ES | X86_FSW_B;
4393 }
4394
4395 return fFsw;
4396}
4397
4398
4399/**
4400 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4401 */
4402IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4403 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4404{
4405 uint16_t const fFcw = pFpuState->FCW;
4406 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4407 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4408 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4409 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4410 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4411 {
4412 pr32Dst->s.fSign = pr80Src->s.fSign;
4413 pr32Dst->s.uExponent = 0;
4414 pr32Dst->s.uFraction = 0;
4415 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4416 }
4417 else if (RTFLOAT80U_IS_INF(pr80Src))
4418 {
4419 pr32Dst->s.fSign = pr80Src->s.fSign;
4420 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4421 pr32Dst->s.uFraction = 0;
4422 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4423 }
4424 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4425 {
4426 /* Mapped to +/-QNaN */
4427 pr32Dst->s.fSign = pr80Src->s.fSign;
4428 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4429 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4430 }
4431 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4432 {
4433 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4434 if (fFcw & X86_FCW_IM)
4435 {
4436 pr32Dst->s.fSign = 1;
4437 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4438 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4439 fFsw |= X86_FSW_IE;
4440 }
4441 else
4442 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4443 }
4444 else if (RTFLOAT80U_IS_NAN(pr80Src))
4445 {
4446 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4447 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4448 {
4449 pr32Dst->s.fSign = pr80Src->s.fSign;
4450 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4451 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4452 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4453 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4454 fFsw |= X86_FSW_IE;
4455 }
4456 else
4457 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4458 }
4459 else
4460 {
4461 /* Denormal values causes both an underflow and precision exception. */
4462 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4463 if (fFcw & X86_FCW_UM)
4464 {
4465 pr32Dst->s.fSign = pr80Src->s.fSign;
4466 pr32Dst->s.uExponent = 0;
4467 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4468 {
4469 pr32Dst->s.uFraction = 1;
4470 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4471 if (!(fFcw & X86_FCW_PM))
4472 fFsw |= X86_FSW_ES | X86_FSW_B;
4473 }
4474 else
4475 {
4476 pr32Dst->s.uFraction = 0;
4477 fFsw |= X86_FSW_UE | X86_FSW_PE;
4478 if (!(fFcw & X86_FCW_PM))
4479 fFsw |= X86_FSW_ES | X86_FSW_B;
4480 }
4481 }
4482 else
4483 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4484 }
4485 *pu16FSW = fFsw;
4486}
4487
4488
4489/**
4490 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4491 *
4492 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4493 *
4494 * @returns Updated FPU status word value.
4495 * @param fSignIn Incoming sign indicator.
4496 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4497 * @param iExponentIn Unbiased exponent.
4498 * @param fFcw The FPU control word.
4499 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4500 * @param pr64Dst Where to return the output value, if one should be
4501 * returned.
4502 *
4503 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4504 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4505 */
4506static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4507 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4508{
4509 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4510 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4511 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4512 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4513 ? fRoundingOffMask
4514 : 0;
4515 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4516
4517 /*
4518 * Deal with potential overflows/underflows first, optimizing for none.
4519 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4520 */
4521 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4522 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4523 { /* likely? */ }
4524 /*
4525 * Underflow if the exponent zero or negative. This is attempted mapped
4526 * to a subnormal number when possible, with some additional trickery ofc.
4527 */
4528 else if (iExponentOut <= 0)
4529 {
4530 bool const fIsTiny = iExponentOut < 0
4531 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4532 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4533 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4534 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4535
4536 if (iExponentOut <= 0)
4537 {
4538 uMantissaIn = iExponentOut <= -63
4539 ? uMantissaIn != 0
4540 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4541 fRoundedOff = uMantissaIn & fRoundingOffMask;
4542 if (fRoundedOff && fIsTiny)
4543 fFsw |= X86_FSW_UE;
4544 iExponentOut = 0;
4545 }
4546 }
4547 /*
4548 * Overflow if at or above max exponent value or if we will reach max
4549 * when rounding. Will return +/-zero or +/-max value depending on
4550 * whether we're rounding or not.
4551 */
4552 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4553 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4554 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4555 {
4556 fFsw |= X86_FSW_OE;
4557 if (!(fFcw & X86_FCW_OM))
4558 return fFsw | X86_FSW_ES | X86_FSW_B;
4559 fFsw |= X86_FSW_PE;
4560 if (uRoundingAdd)
4561 fFsw |= X86_FSW_C1;
4562 if (!(fFcw & X86_FCW_PM))
4563 fFsw |= X86_FSW_ES | X86_FSW_B;
4564
4565 pr64Dst->s64.fSign = fSignIn;
4566 if (uRoundingAdd)
4567 { /* Zero */
4568 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4569 pr64Dst->s64.uFraction = 0;
4570 }
4571 else
4572 { /* Max */
4573 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4574 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4575 }
4576 return fFsw;
4577 }
4578
4579 /*
4580 * Normal or subnormal number.
4581 */
4582 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4583 uint64_t uMantissaOut = uMantissaIn;
4584 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4585 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4586 || fRoundedOff != uRoundingAdd)
4587 {
4588 uMantissaOut = uMantissaIn + uRoundingAdd;
4589 if (uMantissaOut >= uMantissaIn)
4590 { /* likely */ }
4591 else
4592 {
4593 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4594 iExponentOut++;
4595 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4596 fFsw |= X86_FSW_C1;
4597 }
4598 }
4599 else
4600 uMantissaOut = uMantissaIn;
4601
4602 /* Truncate the mantissa and set the return value. */
4603 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4604
4605 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4606 pr64Dst->s64.uExponent = iExponentOut;
4607 pr64Dst->s64.fSign = fSignIn;
4608
4609 /* Set status flags realted to rounding. */
4610 if (fRoundedOff)
4611 {
4612 fFsw |= X86_FSW_PE;
4613 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4614 fFsw |= X86_FSW_C1;
4615 if (!(fFcw & X86_FCW_PM))
4616 fFsw |= X86_FSW_ES | X86_FSW_B;
4617 }
4618
4619 return fFsw;
4620}
4621
4622
4623/**
4624 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4625 */
4626IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4627 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4628{
4629 uint16_t const fFcw = pFpuState->FCW;
4630 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4631 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4632 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4633 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4634 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4635 {
4636 pr64Dst->s64.fSign = pr80Src->s.fSign;
4637 pr64Dst->s64.uExponent = 0;
4638 pr64Dst->s64.uFraction = 0;
4639 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4640 }
4641 else if (RTFLOAT80U_IS_INF(pr80Src))
4642 {
4643 pr64Dst->s64.fSign = pr80Src->s.fSign;
4644 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4645 pr64Dst->s64.uFraction = 0;
4646 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4647 }
4648 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4649 {
4650 /* Mapped to +/-QNaN */
4651 pr64Dst->s64.fSign = pr80Src->s.fSign;
4652 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4653 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4654 }
4655 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4656 {
4657 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4658 if (fFcw & X86_FCW_IM)
4659 {
4660 pr64Dst->s64.fSign = 1;
4661 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4662 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4663 fFsw |= X86_FSW_IE;
4664 }
4665 else
4666 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4667 }
4668 else if (RTFLOAT80U_IS_NAN(pr80Src))
4669 {
4670 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4671 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4672 {
4673 pr64Dst->s64.fSign = pr80Src->s.fSign;
4674 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4675 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4676 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4677 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4678 fFsw |= X86_FSW_IE;
4679 }
4680 else
4681 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4682 }
4683 else
4684 {
4685 /* Denormal values causes both an underflow and precision exception. */
4686 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4687 if (fFcw & X86_FCW_UM)
4688 {
4689 pr64Dst->s64.fSign = pr80Src->s.fSign;
4690 pr64Dst->s64.uExponent = 0;
4691 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4692 {
4693 pr64Dst->s64.uFraction = 1;
4694 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4695 if (!(fFcw & X86_FCW_PM))
4696 fFsw |= X86_FSW_ES | X86_FSW_B;
4697 }
4698 else
4699 {
4700 pr64Dst->s64.uFraction = 0;
4701 fFsw |= X86_FSW_UE | X86_FSW_PE;
4702 if (!(fFcw & X86_FCW_PM))
4703 fFsw |= X86_FSW_ES | X86_FSW_B;
4704 }
4705 }
4706 else
4707 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4708 }
4709 *pu16FSW = fFsw;
4710}
4711
4712
4713IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4714 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4715{
4716 /*
4717 * FPU status word:
4718 * - TOP is irrelevant, but we must match x86 assembly version (0).
4719 * - C1 is always cleared as we don't have any stack overflows.
4720 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4721 */
4722 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4723 *pr80Dst = *pr80Src;
4724}
4725
4726
/*
 * FIST - store an 80-bit floating point value as a signed integer, rounding
 * according to FCW.RC.
 *
 * Mantissa:
 *   63         56        48        40        32        24        16        8         0
 *   v          v         v         v         v         v         v         v         v
 *   1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 *        \    \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
 *   Exp: 0    4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 *
 * Macro parameters:
 *      a_cBits             - Width of the destination integer in bits (also part
 *                            of the generated function name).
 *      a_iType             - The destination integer type.
 *      a_iTypeMin          - The minimum value of a_iType.
 *      a_iTypeIndefinite   - The value stored for a masked invalid operation.
 *
 * The generated function does not write *piDst when an unmasked invalid
 * operation exception is raised.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                           a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t        uMantissa = pr80Val->s.uMantissa; \
        int32_t         iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* Exponents 0 thru a_cBits - 2: the integer part fits, unless rounding carries into the sign bit. */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            /* Number of fractional mantissa bits to drop, and the rounding addend for the active mode. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            /* uRounding is 1 if the dropped bits plus the addend carry into the integer part, otherwise 0. */ \
            uMantissa >>= cShiftOff; \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                           pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    /* Positive overflow is an invalid operation.  Note that a_iTypeMin rather */ \
                    /* than a_iTypeIndefinite is what gets stored here when masked. */ \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny numbers with a magnitude below one (negative exponent).  The \
         * result is 0 or +/-1 depending on sign, rounding mode and whether \
         * the magnitude is at least one half (exponent -1). \
         */ \
        else if (iExponent < 0) \
        { \
            if (!fSignIn) \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case: a negative value with exponent a_cBits - 1 that \
         * is stored as the integer minimum.  PE is raised if any of the low \
         * mantissa bits tested below are set. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (   a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* Instantiate the 64-bit, 32-bit and 16-bit variants. */
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4913
4914#endif /*IEM_WITHOUT_ASSEMBLY */
4915
4916
/*
 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST,
 * as the fractional part is always chopped off regardless of FCW.RC.
 *
 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
 * thus the @a a_cBitsIn.
 *
 * Macro parameters:
 *      a_cBits             - Width of the destination integer in bits (also part
 *                            of the generated function name).
 *      a_cBitsIn           - Width used for the in-range exponent check.  All
 *                            instantiations below pass the same value as a_cBits.
 *      a_iType             - The destination integer type.
 *      a_iTypeMin          - The minimum value of a_iType.
 *      a_iTypeMax          - Not referenced by the macro body.
 *      a_iTypeIndefinite   - The value stored for a masked invalid operation.
 *      a_Suffix            - Function name suffix (RT_NOTHING, _intel or _amd).
 *      a_fIntelVersion     - Not referenced by the macro body.
 *
 * The generated function does not write *piDst when an unmasked invalid
 * operation exception is raised.
 */
#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                                              a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t        uMantissa = pr80Val->s.uMantissa; \
        int32_t         iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* In range: chop off the fractional bits and apply the sign. */ \
        if ((uint32_t)iExponent <= a_cBitsIn - 2) \
        { \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const fRoundedOff      = uMantissa & fRoundingOffMask; \
            uMantissa >>= cShiftOff; \
            /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
            if (!fSignIn) \
                *piDst = (a_iType)uMantissa; \
            else \
                *piDst = -(a_iType)uMantissa; \
            \
            /* Anything chopped off makes the result inexact. */ \
            if (fRoundedOff) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Tiny numbers with a magnitude below one (negative exponent) are \
         * truncated to zero. \
         */ \
        else if (iExponent < 0) \
        { \
            *piDst = 0; \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case: a negative value with exponent a_cBits - 1 that \
         * is stored as the integer minimum.  PE is raised if any of the low \
         * mantissa bits tested below are set. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (a_cBits < 64 \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63)) ) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Figure this weirdness. \
         * Note: the leading 0 in the condition disables this branch entirely. \
         */ \
        else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
        { \
            *piDst = 0; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        *piDst = 0; \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* The unsuffixed variants are only emitted for IEM_WITHOUT_ASSEMBLY builds... */
#if defined(IEM_WITHOUT_ASSEMBLY)
EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
#endif
/* ... while the vendor suffixed 16-bit variants are always emitted. */
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
5046
5047
5048#if defined(IEM_WITHOUT_ASSEMBLY)
5049
/**
 * Stores an 80-bit floating point value as an 80-bit packed BCD integer.
 *
 * The value is rounded to an integer according to FCW.RC and then split into
 * pairs of decimal digits.
 *
 * @param   pFpuState   The FPU state.  FCW supplies rounding control and the
 *                      exception masks; C0, C2 and C3 of FSW are carried over
 *                      into the returned status word.
 * @param   pu16FSW     Where to return the resulting FPU status word.
 * @param   pd80Dst     Where to store the packed BCD result.  Not written when
 *                      an unmasked invalid operation exception is raised.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* Canned results, the [2] tables being indexed by the sign of the source. */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t        uMantissa = pr80Src->s.uMantissa;
        int32_t         iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* In range before rounding: at exponent 59 the mantissa limit used here is
           the largest one representing a value below 10**18. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            /* Number of fractional mantissa bits to drop, and the rounding addend for the active mode. */
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            /* uRounding is 1 if the dropped bits plus the addend carry into the integer part, otherwise 0. */
            uMantissa >>= cShiftOff;
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Emit two decimal digits per pair, least significant pair first. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny numbers with a magnitude below one (negative exponent).  The
         * result is zero or one, with the sign of the source, depending on
         * rounding mode and whether the magnitude is at least one half
         * (exponent -1).
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Zero is stored as the packed BCD zero selected by the sign of the source
     * (unlike FIST, the two zeros are not folded into one here).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
5199
5200
5201/*********************************************************************************************************************************
5202* FPU Helpers *
5203*********************************************************************************************************************************/
5204AssertCompileSize(RTFLOAT128U, 16);
5205AssertCompileSize(RTFLOAT80U, 10);
5206AssertCompileSize(RTFLOAT64U, 8);
5207AssertCompileSize(RTFLOAT32U, 4);
5208
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * @param   a_pr80Val           Pointer variable holding the value to check.  It
 *                              is assigned to, so it must be a modifiable
 *                              lvalue and not an arbitrary expression.
 * @param   a_r80ValNormalized  Name of the local variable to declare.  It is
 *                              left uninitialized when no normalization is
 *                              needed.
 *
 * @note    This must be applied before calling SoftFloat with a value that could
 *          be a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *          correctly.
 * @note    The expansion starts with a declaration and ends in a dangling
 *          'else do {} while (0)', so it must be used as a statement of its own
 *          followed by a semicolon.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
5232
5233#ifdef IEM_WITH_FLOAT128_FOR_FPU
5234
5235DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5236{
5237 int fNew;
5238 switch (fFcw & X86_FCW_RC_MASK)
5239 {
5240 default:
5241 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5242 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5243 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5244 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5245 }
5246 int fOld = fegetround();
5247 fesetround(fNew);
5248 return fOld;
5249}
5250
5251
5252DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5253{
5254 fesetround(fOld);
5255}
5256
5257DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5258{
5259 RT_NOREF(fFcw);
5260 RTFLOAT128U Tmp;
5261 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5262 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5263 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5264 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5265 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5266 {
5267 Assert(Tmp.s.uExponent == 0);
5268 Tmp.s2.uSignAndExponent++;
5269 }
5270 return *(_Float128 *)&Tmp;
5271}
5272
5273
5274DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5275{
5276 RT_NOREF(fFcw);
5277 RTFLOAT128U Tmp;
5278 *(_Float128 *)&Tmp = rd128ValSrc;
5279 ASMCompilerBarrier();
5280 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5281 {
5282 pr80Dst->s.fSign = Tmp.s64.fSign;
5283 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5284 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5285 | Tmp.s64.uFractionLo >> (64 - 15);
5286
5287 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5288 unsigned const cShiftOff = 64 - 15;
5289 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5290 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5291 if (uRoundedOff)
5292 {
5293 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5294 ? RT_BIT_64(cShiftOff - 1)
5295 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5296 ? fRoundingOffMask
5297 : 0;
5298 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5299 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5300 || uRoundedOff != uRoundingAdd)
5301 {
5302 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5303 {
5304 uFraction += 1;
5305 if (!(uFraction & RT_BIT_64(63)))
5306 { /* likely */ }
5307 else
5308 {
5309 uFraction >>= 1;
5310 pr80Dst->s.uExponent++;
5311 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5312 return fFsw;
5313 }
5314 fFsw |= X86_FSW_C1;
5315 }
5316 }
5317 fFsw |= X86_FSW_PE;
5318 if (!(fFcw & X86_FCW_PM))
5319 fFsw |= X86_FSW_ES | X86_FSW_B;
5320 }
5321 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5322 }
5323 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5324 {
5325 pr80Dst->s.fSign = Tmp.s64.fSign;
5326 pr80Dst->s.uExponent = 0;
5327 pr80Dst->s.uMantissa = 0;
5328 }
5329 else if (RTFLOAT128U_IS_INF(&Tmp))
5330 {
5331 pr80Dst->s.fSign = Tmp.s64.fSign;
5332 pr80Dst->s.uExponent = 0;
5333 pr80Dst->s.uMantissa = 0;
5334 }
5335 return fFsw;
5336}
5337
5338
5339#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5340
/**
 * Brace initializer for a SoftFloat state structure, derived from an x87 FCW.
 *
 * Five values are produced, in order: tininess detection after rounding, the
 * SoftFloat rounding mode matching FCW.RC, zero, the FCW exception mask bits,
 * and 32, 64 or 80 depending on FCW.PC.
 *
 * @param   a_fFcw      The FPU control word.  Evaluated several times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        /* 1: tininess detection */ \
        softfloat_tininess_afterRounding, \
        /* 2: rounding mode selected by FCW.RC */ \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        /* 3: starts out as zero */ \
        0, \
        /* 4: exception mask bits taken from FCW */ \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        /* 5: value selected by FCW.PC */ \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 \
        :                                                 (uint8_t)80 \
    }
5354
/**
 * Merges the exception state of a SoftFloat operation into an FSW value.
 *
 * The result is @a a_fFsw with the SoftFloat exception flags ORed in, the
 * SoftFloat C1 flag shifted left by two bits, and ES + B set when any raised
 * exception is not masked in @a a_fFcw.
 *
 * @param   a_fFsw          The FSW value to start from.
 * @param   a_pSoftState    The SoftFloat state after the operation (evaluated
 *                          several times).
 * @param   a_fFcw          The FPU control word holding the exception masks.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK & ~(a_fFcw)) != 0 \
         ? X86_FSW_ES | X86_FSW_B \
         : 0) )
5362
5363
5364DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5365{
5366 RT_NOREF(fFcw);
5367 Assert(cBits > 64);
5368# if 0 /* rounding does not seem to help */
5369 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5370 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5371 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5372 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5373 {
5374 uint64_t uOld = r128.v[0];
5375 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5376 if (r128.v[0] < uOld)
5377 r128.v[1] += 1;
5378 }
5379# else
5380 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5381# endif
5382 return r128;
5383}
5384
5385
5386DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5387{
5388 RT_NOREF(fFcw);
5389 Assert(cBits > 64);
5390# if 0 /* rounding does not seem to help, not even on constants */
5391 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5392 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5393 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5394 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5395 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5396 {
5397 uint64_t uOld = r128.v[0];
5398 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5399 if (r128.v[0] < uOld)
5400 r128.v[1] += 1;
5401 }
5402 return r128;
5403# else
5404 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5405 return r128;
5406# endif
5407}
5408
5409
# if 0 /* unused */
/** Repackages an IPRT 128-bit float as a SoftFloat float128_t by copying both 64-bit words. */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5417
5418
5419/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5420DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5421{
5422 extFloat80_t Tmp;
5423 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5424 Tmp.signif = pr80Val->s2.uMantissa;
5425 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5426 return extF80_to_f128(Tmp, &Ignored);
5427}
5428
5429
5430/**
5431 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5432 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5433 *
5434 * This is only a structure format conversion, nothing else.
5435 */
5436DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5437{
5438 extFloat80_t Tmp;
5439 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5440 Tmp.signif = pr80Val->s2.uMantissa;
5441 return Tmp;
5442}
5443
5444
5445/**
5446 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5447 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5448 *
5449 * This is only a structure format conversion, nothing else.
5450 */
5451DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5452{
5453 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5454 pr80Dst->s2.uMantissa = r80XSrc.signif;
5455 return pr80Dst;
5456}
5457
5458
5459DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5460{
5461 RT_NOREF(fFcw);
5462 RTFLOAT128U Tmp;
5463 *(float128_t *)&Tmp = r128Src;
5464 ASMCompilerBarrier();
5465
5466 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5467 {
5468 pr80Dst->s.fSign = Tmp.s64.fSign;
5469 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5470 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5471 | Tmp.s64.uFractionLo >> (64 - 15);
5472
5473 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5474 unsigned const cShiftOff = 64 - 15;
5475 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5476 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5477 if (uRoundedOff)
5478 {
5479 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5480 ? RT_BIT_64(cShiftOff - 1)
5481 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5482 ? fRoundingOffMask
5483 : 0;
5484 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5485 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5486 || uRoundedOff != uRoundingAdd)
5487 {
5488 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5489 {
5490 uFraction += 1;
5491 if (!(uFraction & RT_BIT_64(63)))
5492 { /* likely */ }
5493 else
5494 {
5495 uFraction >>= 1;
5496 pr80Dst->s.uExponent++;
5497 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5498 return fFsw;
5499 }
5500 fFsw |= X86_FSW_C1;
5501 }
5502 }
5503 fFsw |= X86_FSW_PE;
5504 if (!(fFcw & X86_FCW_PM))
5505 fFsw |= X86_FSW_ES | X86_FSW_B;
5506 }
5507
5508 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5509 }
5510 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5511 {
5512 pr80Dst->s.fSign = Tmp.s64.fSign;
5513 pr80Dst->s.uExponent = 0;
5514 pr80Dst->s.uMantissa = 0;
5515 }
5516 else if (RTFLOAT128U_IS_INF(&Tmp))
5517 {
5518 pr80Dst->s.fSign = Tmp.s64.fSign;
5519 pr80Dst->s.uExponent = 0x7fff;
5520 pr80Dst->s.uMantissa = 0;
5521 }
5522 return fFsw;
5523}
5524
5525
5526/**
5527 * Helper for transfering exception and C1 to FSW and setting the result value
5528 * accordingly.
5529 *
5530 * @returns Updated FSW.
5531 * @param pSoftState The SoftFloat state following the operation.
5532 * @param r80XResult The result of the SoftFloat operation.
5533 * @param pr80Result Where to store the result for IEM.
5534 * @param fFcw The FPU control word.
5535 * @param fFsw The FSW before the operation, with necessary bits
5536 * cleared and such.
5537 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5538 * raised.
5539 */
5540DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5541 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5542 PCRTFLOAT80U pr80XcptResult)
5543{
5544 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5545 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5546 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5547 fFsw |= X86_FSW_ES | X86_FSW_B;
5548
5549 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5550 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5551 else
5552 {
5553 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5554 *pr80Result = *pr80XcptResult;
5555 }
5556 return fFsw;
5557}
5558
5559
5560/**
5561 * Helper doing polynomial evaluation using Horner's method.
5562 *
5563 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5564 */
5565float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5566 unsigned cPrecision, softfloat_state_t *pSoftState)
5567{
5568 Assert(cHornerConsts > 1);
5569 size_t i = cHornerConsts - 1;
5570 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5571 while (i-- > 0)
5572 {
5573 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5574 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5575 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5576 }
5577 return r128Result;
5578}
5579
5580#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5581
5582
5583/**
5584 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5585 * mantissa, exponent and sign.
5586 *
5587 * @returns Updated FSW.
5588 * @param pr80Dst Where to return the composed value.
5589 * @param fSign The sign.
5590 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5591 * ignored and should be zero. This will probably be
5592 * modified during normalization and rounding.
5593 * @param iExponent Unbiased exponent.
5594 * @param fFcw The FPU control word.
5595 * @param fFsw The FPU status word.
5596 */
5597static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5598 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5599{
5600 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5601
5602 iExponent += RTFLOAT80U_EXP_BIAS;
5603
5604 /* Do normalization if necessary and possible. */
5605 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5606 {
5607 int cShift = 192 - RTUInt256BitCount(puMantissa);
5608 if (iExponent > cShift)
5609 iExponent -= cShift;
5610 else
5611 {
5612 if (fFcw & X86_FCW_UM)
5613 {
5614 if (iExponent > 0)
5615 cShift = --iExponent;
5616 else
5617 cShift = 0;
5618 }
5619 iExponent -= cShift;
5620 }
5621 RTUInt256AssignShiftLeft(puMantissa, cShift);
5622 }
5623
5624 /* Do rounding. */
5625 uint64_t uMantissa = puMantissa->QWords.qw2;
5626 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5627 {
5628 bool fAdd;
5629 switch (fFcw & X86_FCW_RC_MASK)
5630 {
5631 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5632 case X86_FCW_RC_NEAREST:
5633 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5634 {
5635 if ( (uMantissa & 1)
5636 || puMantissa->QWords.qw0 != 0
5637 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5638 {
5639 fAdd = true;
5640 break;
5641 }
5642 uMantissa &= ~(uint64_t)1;
5643 }
5644 fAdd = false;
5645 break;
5646 case X86_FCW_RC_ZERO:
5647 fAdd = false;
5648 break;
5649 case X86_FCW_RC_UP:
5650 fAdd = !fSign;
5651 break;
5652 case X86_FCW_RC_DOWN:
5653 fAdd = fSign;
5654 break;
5655 }
5656 if (fAdd)
5657 {
5658 uint64_t const uTmp = uMantissa;
5659 uMantissa = uTmp + 1;
5660 if (uMantissa < uTmp)
5661 {
5662 uMantissa >>= 1;
5663 uMantissa |= RT_BIT_64(63);
5664 iExponent++;
5665 }
5666 fFsw |= X86_FSW_C1;
5667 }
5668 fFsw |= X86_FSW_PE;
5669 if (!(fFcw & X86_FCW_PM))
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672
5673 /* Check for underflow (denormals). */
5674 if (iExponent <= 0)
5675 {
5676 if (fFcw & X86_FCW_UM)
5677 {
5678 if (uMantissa & RT_BIT_64(63))
5679 uMantissa >>= 1;
5680 iExponent = 0;
5681 }
5682 else
5683 {
5684 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5685 fFsw |= X86_FSW_ES | X86_FSW_B;
5686 }
5687 fFsw |= X86_FSW_UE;
5688 }
5689 /* Check for overflow */
5690 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5691 {
5692 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5693 }
5694
5695 /* Compose the result. */
5696 pr80Dst->s.uMantissa = uMantissa;
5697 pr80Dst->s.uExponent = iExponent;
5698 pr80Dst->s.fSign = fSign;
5699 return fFsw;
5700}
5701
5702
5703/**
5704 * See also iemAImpl_fld_r80_from_r32
5705 */
5706static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5707{
5708 uint16_t fFsw = 0;
5709 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5710 {
5711 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5712 pr80Dst->sj64.fInteger = 1;
5713 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5714 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5715 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5716 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5717 }
5718 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5719 {
5720 pr80Dst->s.fSign = pr32Val->s.fSign;
5721 pr80Dst->s.uExponent = 0;
5722 pr80Dst->s.uMantissa = 0;
5723 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5724 }
5725 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5726 {
5727 /* Subnormal -> normalized + X86_FSW_DE return. */
5728 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5729 pr80Dst->sj64.fInteger = 1;
5730 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5731 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5732 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5733 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5734 fFsw = X86_FSW_DE;
5735 }
5736 else if (RTFLOAT32U_IS_INF(pr32Val))
5737 {
5738 pr80Dst->s.fSign = pr32Val->s.fSign;
5739 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5740 pr80Dst->s.uMantissa = RT_BIT_64(63);
5741 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5742 }
5743 else
5744 {
5745 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5746 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5747 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5748 pr80Dst->sj64.fInteger = 1;
5749 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5750 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5751 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5752 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5753 }
5754 return fFsw;
5755}
5756
5757
5758/**
5759 * See also iemAImpl_fld_r80_from_r64
5760 */
5761static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5762{
5763 uint16_t fFsw = 0;
5764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5765 {
5766 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5767 pr80Dst->sj64.fInteger = 1;
5768 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5769 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5770 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5771 }
5772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5773 {
5774 pr80Dst->s.fSign = pr64Val->s.fSign;
5775 pr80Dst->s.uExponent = 0;
5776 pr80Dst->s.uMantissa = 0;
5777 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5778 }
5779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5780 {
5781 /* Subnormal values gets normalized. */
5782 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5783 pr80Dst->sj64.fInteger = 1;
5784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5785 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5787 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5788 fFsw = X86_FSW_DE;
5789 }
5790 else if (RTFLOAT64U_IS_INF(pr64Val))
5791 {
5792 pr80Dst->s.fSign = pr64Val->s.fSign;
5793 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5794 pr80Dst->s.uMantissa = RT_BIT_64(63);
5795 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5796 }
5797 else
5798 {
5799 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5800 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5801 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5802 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5803 pr80Dst->sj64.fInteger = 1;
5804 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5805 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5806 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5807 }
5808 return fFsw;
5809}
5810
5811
5812/**
5813 * See also EMIT_FILD.
5814 */
5815#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5816static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5817{ \
5818 if (iVal == 0) \
5819 { \
5820 pr80Dst->s.fSign = 0; \
5821 pr80Dst->s.uExponent = 0; \
5822 pr80Dst->s.uMantissa = 0; \
5823 } \
5824 else \
5825 { \
5826 if (iVal > 0) \
5827 pr80Dst->s.fSign = 0; \
5828 else \
5829 { \
5830 pr80Dst->s.fSign = 1; \
5831 iVal = -iVal; \
5832 } \
5833 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5834 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5835 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5836 } \
5837 return pr80Dst; \
5838}
5839EMIT_CONVERT_IXX_TO_R80(16)
5840EMIT_CONVERT_IXX_TO_R80(32)
5841//EMIT_CONVERT_IXX_TO_R80(64)
5842
/**
 * Emits an r80-by-r64 instruction implementation (iemAImpl_fmul_r80_by_r64
 * and such) on top of the corresponding r80-by-r80 implementation.
 *
 * The 64-bit operand is widened first.  If it was a subnormal, the denormal
 * exception is dropped when the first operand is invalid or a NaN, or when
 * @a a_DenormalException is true.  Otherwise, when \#DE is unmasked, the first
 * operand is returned together with DE + ES + B.  In all other cases the
 * r80-by-r80 worker runs and the FSW TOP field is set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Expression which, when true, suppresses the
 *                                  denormal exception.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswCvt = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFswCvt || fFswCvt == X86_FSW_DE); \
    if (fFswCvt != 0) \
    { \
        bool const fDropDe =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                             || RTFLOAT80U_IS_NAN(pr80Val1) \
                             || (a_DenormalException); \
        if (fDropDe) \
            fFswCvt = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked denormal operand: hand back the first operand untouched. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFswCvt; \
}
5865
/**
 * Emits an r80-by-r32 instruction implementation (iemAImpl_fmul_r80_by_r32
 * and such) on top of the corresponding r80-by-r80 implementation.
 *
 * The 32-bit operand is widened first.  If it was a subnormal, the denormal
 * exception is dropped when the first operand is invalid or a NaN, or when
 * @a a_DenormalException is true.  Otherwise, when \#DE is unmasked, the first
 * operand is returned together with DE + ES + B.  In all other cases the
 * r80-by-r80 worker runs and the FSW TOP field is set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Expression which, when true, suppresses the
 *                                  denormal exception.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFswCvt = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFswCvt || fFswCvt == X86_FSW_DE); \
    if (fFswCvt != 0) \
    { \
        bool const fDropDe =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                             || RTFLOAT80U_IS_NAN(pr80Val1) \
                             || (a_DenormalException); \
        if (fDropDe) \
            fFswCvt = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked denormal operand: hand back the first operand untouched. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFswCvt; \
}
5888
/**
 * Emits an r80-by-i32 instruction implementation (iemAImpl_fimul_r80_by_i32
 * and such): the integer is converted to 80-bit floating point, the
 * r80-by-r80 implementation is called, and the FSW TOP field is set to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5897
/**
 * Emits an r80-by-i16 instruction implementation (iemAImpl_fimul_r80_by_i16
 * and such): the integer is converted to 80-bit floating point, the
 * r80-by-r80 implementation is called, and the FSW TOP field is set to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5906
5907
5908
5909/*********************************************************************************************************************************
5910* x87 FPU Division Operations *
5911*********************************************************************************************************************************/
5912
5913/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5914static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5915 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5916{
5917 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5918 {
5919 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5920 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5921 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5922 }
5923 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5924 { /* Div by zero. */
5925 if (fFcw & X86_FCW_ZM)
5926 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5927 else
5928 {
5929 *pr80Result = *pr80Val1Org;
5930 fFsw |= X86_FSW_ES | X86_FSW_B;
5931 }
5932 fFsw |= X86_FSW_ZE;
5933 }
5934 else
5935 { /* Invalid operand */
5936 if (fFcw & X86_FCW_IM)
5937 *pr80Result = g_r80Indefinite;
5938 else
5939 {
5940 *pr80Result = *pr80Val1Org;
5941 fFsw |= X86_FSW_ES | X86_FSW_B;
5942 }
5943 fFsw |= X86_FSW_IE;
5944 }
5945 return fFsw;
5946}
5947
5948
5949IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5950 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5951{
5952 uint16_t const fFcw = pFpuState->FCW;
5953 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5954
5955 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5956 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5957 {
5958 if (fFcw & X86_FCW_IM)
5959 pFpuRes->r80Result = g_r80Indefinite;
5960 else
5961 {
5962 pFpuRes->r80Result = *pr80Val1;
5963 fFsw |= X86_FSW_ES | X86_FSW_B;
5964 }
5965 fFsw |= X86_FSW_IE;
5966 }
5967 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5968 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5969 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5970 {
5971 if (fFcw & X86_FCW_DM)
5972 {
5973 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5974 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5975 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5976 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5977 }
5978 else
5979 {
5980 pFpuRes->r80Result = *pr80Val1;
5981 fFsw |= X86_FSW_ES | X86_FSW_B;
5982 }
5983 fFsw |= X86_FSW_DE;
5984 }
5985 /* SoftFloat can handle the rest: */
5986 else
5987 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5988
5989 pFpuRes->FSW = fFsw;
5990}
5991
5992
5993EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5994EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5995EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5996EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5997
5998
5999IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6000 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6001{
6002 uint16_t const fFcw = pFpuState->FCW;
6003 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6004
6005 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6006 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6007 {
6008 if (fFcw & X86_FCW_IM)
6009 pFpuRes->r80Result = g_r80Indefinite;
6010 else
6011 {
6012 pFpuRes->r80Result = *pr80Val1;
6013 fFsw |= X86_FSW_ES | X86_FSW_B;
6014 }
6015 fFsw |= X86_FSW_IE;
6016 }
6017 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
6018 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6019 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
6020 {
6021 if (fFcw & X86_FCW_DM)
6022 {
6023 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6024 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6025 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6026 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6027 }
6028 else
6029 {
6030 pFpuRes->r80Result = *pr80Val1;
6031 fFsw |= X86_FSW_ES | X86_FSW_B;
6032 }
6033 fFsw |= X86_FSW_DE;
6034 }
6035 /* SoftFloat can handle the rest: */
6036 else
6037 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6038
6039 pFpuRes->FSW = fFsw;
6040}
6041
6042
6043EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
6044EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
6045EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
6046EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
6047
6048
6049/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
6050static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6051 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
6052{
6053 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
6054 {
6055 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6056 uint16_t fCxFlags = 0;
6057 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
6058 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
6059 &fCxFlags, &SoftState);
6060 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
6061 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6062 if ( !(fFsw & X86_FSW_IE)
6063 && !RTFLOAT80U_IS_NAN(pr80Result)
6064 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
6065 {
6066 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
6067 fFsw |= fCxFlags & X86_FSW_C_MASK;
6068 }
6069 return fFsw;
6070 }
6071
6072 /* Invalid operand */
6073 if (fFcw & X86_FCW_IM)
6074 *pr80Result = g_r80Indefinite;
6075 else
6076 {
6077 *pr80Result = *pr80Val1Org;
6078 fFsw |= X86_FSW_ES | X86_FSW_B;
6079 }
6080 return fFsw | X86_FSW_IE;
6081}
6082
6083
6084static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6085 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
6086{
6087 uint16_t const fFcw = pFpuState->FCW;
6088 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6089
6090 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
6091 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
6092 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
6093 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
6094 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
6095 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
6096 {
6097 if (fFcw & X86_FCW_IM)
6098 pFpuRes->r80Result = g_r80Indefinite;
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_IE;
6105 }
6106 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
6107 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
6108 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
6109 {
6110 if (fFcw & X86_FCW_DM)
6111 {
6112 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6113 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6114 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6115 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6116 pr80Val1Org, fLegacyInstr);
6117 }
6118 else
6119 {
6120 pFpuRes->r80Result = *pr80Val1;
6121 fFsw |= X86_FSW_ES | X86_FSW_B;
6122 }
6123 fFsw |= X86_FSW_DE;
6124 }
6125 /* SoftFloat can handle the rest: */
6126 else
6127 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6128 pr80Val1, fLegacyInstr);
6129
6130 pFpuRes->FSW = fFsw;
6131}
6132
6133
6134IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6135 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6136{
6137 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6138}
6139
6140
6141IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6142 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6143{
6144 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6145}
6146
6147
6148/*********************************************************************************************************************************
6149* x87 FPU Multiplication Operations *
6150*********************************************************************************************************************************/
6151
6152/** Worker for iemAImpl_fmul_r80_by_r80. */
6153static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6154 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6155{
6156 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6157 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6158 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6159}
6160
6161
6162IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6163 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6164{
6165 uint16_t const fFcw = pFpuState->FCW;
6166 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6167
6168 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6169 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6170 {
6171 if (fFcw & X86_FCW_IM)
6172 pFpuRes->r80Result = g_r80Indefinite;
6173 else
6174 {
6175 pFpuRes->r80Result = *pr80Val1;
6176 fFsw |= X86_FSW_ES | X86_FSW_B;
6177 }
6178 fFsw |= X86_FSW_IE;
6179 }
6180 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6181 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6182 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6183 {
6184 if (fFcw & X86_FCW_DM)
6185 {
6186 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6187 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6188 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6189 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6190 }
6191 else
6192 {
6193 pFpuRes->r80Result = *pr80Val1;
6194 fFsw |= X86_FSW_ES | X86_FSW_B;
6195 }
6196 fFsw |= X86_FSW_DE;
6197 }
6198 /* SoftFloat can handle the rest: */
6199 else
6200 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6201
6202 pFpuRes->FSW = fFsw;
6203}
6204
6205
6206EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6207EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6208EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6209EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6210
6211
6212/*********************************************************************************************************************************
6213* x87 FPU Addition *
6214*********************************************************************************************************************************/
6215
6216/** Worker for iemAImpl_fadd_r80_by_r80. */
6217static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6218 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6219{
6220 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6221 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6222 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6223}
6224
6225
6226IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6227 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6228{
6229 uint16_t const fFcw = pFpuState->FCW;
6230 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6231
6232 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6233 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6234 {
6235 if (fFcw & X86_FCW_IM)
6236 pFpuRes->r80Result = g_r80Indefinite;
6237 else
6238 {
6239 pFpuRes->r80Result = *pr80Val1;
6240 fFsw |= X86_FSW_ES | X86_FSW_B;
6241 }
6242 fFsw |= X86_FSW_IE;
6243 }
6244 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6245 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6246 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6247 {
6248 if (fFcw & X86_FCW_DM)
6249 {
6250 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6251 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6252 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6253 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6254 }
6255 else
6256 {
6257 pFpuRes->r80Result = *pr80Val1;
6258 fFsw |= X86_FSW_ES | X86_FSW_B;
6259 }
6260 fFsw |= X86_FSW_DE;
6261 }
6262 /* SoftFloat can handle the rest: */
6263 else
6264 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6265
6266 pFpuRes->FSW = fFsw;
6267}
6268
6269
6270EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6271EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6272EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6273EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6274
6275
6276/*********************************************************************************************************************************
6277* x87 FPU Subtraction *
6278*********************************************************************************************************************************/
6279
6280/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6281static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6282 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6283{
6284 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6285 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6286 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6287}
6288
6289
6290IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6291 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6292{
6293 uint16_t const fFcw = pFpuState->FCW;
6294 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6295
6296 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6297 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6298 {
6299 if (fFcw & X86_FCW_IM)
6300 pFpuRes->r80Result = g_r80Indefinite;
6301 else
6302 {
6303 pFpuRes->r80Result = *pr80Val1;
6304 fFsw |= X86_FSW_ES | X86_FSW_B;
6305 }
6306 fFsw |= X86_FSW_IE;
6307 }
6308 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6309 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6310 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6311 {
6312 if (fFcw & X86_FCW_DM)
6313 {
6314 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6315 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6316 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6317 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6318 }
6319 else
6320 {
6321 pFpuRes->r80Result = *pr80Val1;
6322 fFsw |= X86_FSW_ES | X86_FSW_B;
6323 }
6324 fFsw |= X86_FSW_DE;
6325 }
6326 /* SoftFloat can handle the rest: */
6327 else
6328 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6329
6330 pFpuRes->FSW = fFsw;
6331}
6332
6333
6334EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6335EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6336EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6337EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6338
6339
6340/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6341IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6342 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6343{
6344 uint16_t const fFcw = pFpuState->FCW;
6345 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6346
6347 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6348 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6349 {
6350 if (fFcw & X86_FCW_IM)
6351 pFpuRes->r80Result = g_r80Indefinite;
6352 else
6353 {
6354 pFpuRes->r80Result = *pr80Val1;
6355 fFsw |= X86_FSW_ES | X86_FSW_B;
6356 }
6357 fFsw |= X86_FSW_IE;
6358 }
6359 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6360 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6361 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6362 {
6363 if (fFcw & X86_FCW_DM)
6364 {
6365 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6366 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6367 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6368 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6369 }
6370 else
6371 {
6372 pFpuRes->r80Result = *pr80Val1;
6373 fFsw |= X86_FSW_ES | X86_FSW_B;
6374 }
6375 fFsw |= X86_FSW_DE;
6376 }
6377 /* SoftFloat can handle the rest: */
6378 else
6379 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6380
6381 pFpuRes->FSW = fFsw;
6382}
6383
6384
6385EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6386EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6387EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6388EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6389
6390
6391/*********************************************************************************************************************************
6392* x87 FPU Trigonometric Operations *
6393*********************************************************************************************************************************/
6394static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6395{
6396 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6397 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6398 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6399 extFloat80_t v;
6400 (void)fFcw;
6401
6402 v = extF80_atan2(y, x, &SoftState);
6403
6404 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6405 return fFsw;
6406}
6407
6408IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6409 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6410{
6411 uint16_t const fFcw = pFpuState->FCW;
6412 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6413
6414 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6415 {
6416 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6417
6418 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6419 if (!(fFcw & X86_FCW_PM))
6420 fFsw |= X86_FSW_ES | X86_FSW_B;
6421 }
6422 else
6423 {
6424 fFsw |= X86_FSW_IE;
6425 if (!(fFcw & X86_FCW_IM))
6426 {
6427 pFpuRes->r80Result = *pr80Val2;
6428 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6429 }
6430 else
6431 {
6432 pFpuRes->r80Result = g_r80Indefinite;
6433 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6434 }
6435 }
6436
6437 pFpuRes->FSW = fFsw;
6438}
6439#endif /* IEM_WITHOUT_ASSEMBLY */
6440
6441IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6442 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6443{
6444 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6445}
6446
6447IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6448 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6449{
6450 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6451}
6452
6453
6454#if defined(IEM_WITHOUT_ASSEMBLY)
6455static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6456{
6457 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6458 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6459 extFloat80_t v;
6460 (void)fFcw;
6461
6462 v = extF80_tan(x, &SoftState);
6463
6464 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6465 return fFsw;
6466}
6467
6468IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6469{
6470 uint16_t const fFcw = pFpuState->FCW;
6471 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6472
6473 if (RTFLOAT80U_IS_ZERO(pr80Val))
6474 {
6475 pFpuResTwo->r80Result1 = *pr80Val;
6476 pFpuResTwo->r80Result2 = g_ar80One[0];
6477 }
6478 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6479 {
6480 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6481 {
6482 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6483 pFpuResTwo->r80Result1 = *pr80Val;
6484 }
6485 else
6486 {
6487 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6488 {
6489 pFpuResTwo->r80Result1 = *pr80Val;
6490 }
6491 else
6492 {
6493 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6494 }
6495
6496 pFpuResTwo->r80Result2 = g_ar80One[0];
6497
6498 fFsw |= X86_FSW_PE;
6499 if (!(fFcw & X86_FCW_PM))
6500 fFsw |= X86_FSW_ES | X86_FSW_B;
6501 }
6502 }
6503 else
6504 {
6505 fFsw |= X86_FSW_IE;
6506 if (!(fFcw & X86_FCW_IM))
6507 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6508 }
6509
6510 pFpuResTwo->FSW = fFsw;
6511}
6512#endif /* IEM_WITHOUT_ASSEMBLY */
6513
6514IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6515{
6516 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6517}
6518
6519IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6520{
6521 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6522}
6523
6524#ifdef IEM_WITHOUT_ASSEMBLY
6525
6526static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6527{
6528 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6529 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6530 extFloat80_t v;
6531 (void)fFcw;
6532
6533 v = extF80_sin(x, &SoftState);
6534
6535 iemFpuSoftF80ToIprt(pr80Result, v);
6536
6537 return fFsw;
6538}
6539
6540IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6541{
6542 uint16_t const fFcw = pFpuState->FCW;
6543 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6544
6545 if (RTFLOAT80U_IS_ZERO(pr80Val))
6546 {
6547 pFpuRes->r80Result = *pr80Val;
6548 }
6549 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6550 {
6551 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6552 {
6553 fFsw |= X86_FSW_C2;
6554 pFpuRes->r80Result = *pr80Val;
6555 }
6556 else
6557 {
6558 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6559 {
6560 pFpuRes->r80Result = *pr80Val;
6561 }
6562 else
6563 {
6564 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6565 }
6566 fFsw |= X86_FSW_PE;
6567 if (!(fFcw & X86_FCW_PM))
6568 fFsw |= X86_FSW_ES | X86_FSW_B;
6569 }
6570 }
6571 else if (RTFLOAT80U_IS_INF(pr80Val))
6572 {
6573 fFsw |= X86_FSW_IE;
6574 if (!(fFcw & X86_FCW_IM))
6575 {
6576 fFsw |= X86_FSW_ES | X86_FSW_B;
6577 pFpuRes->r80Result = *pr80Val;
6578 }
6579 else
6580 {
6581 pFpuRes->r80Result = g_r80Indefinite;
6582 }
6583 }
6584 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6585 {
6586 fFsw |= X86_FSW_DE;
6587
6588 if (fFcw & X86_FCW_DM)
6589 {
6590 if (fFcw & X86_FCW_UM)
6591 {
6592 pFpuRes->r80Result = *pr80Val;
6593 }
6594 else
6595 {
6596 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6597 uint64_t uMantissa = pr80Val->s.uMantissa;
6598 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6599
6600 uExponent = 64 - uExponent;
6601 uMantissa <<= uExponent;
6602 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6603
6604 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6605 pFpuRes->r80Result.s.uMantissa = uMantissa;
6606 pFpuRes->r80Result.s.uExponent = uExponent;
6607 }
6608
6609 fFsw |= X86_FSW_UE | X86_FSW_PE;
6610
6611 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6612 {
6613 /* All the exceptions are masked. */
6614 }
6615 else
6616 {
6617 fFsw |= X86_FSW_ES | X86_FSW_B;
6618 }
6619 }
6620 else
6621 {
6622 pFpuRes->r80Result = *pr80Val;
6623
6624 fFsw |= X86_FSW_ES | X86_FSW_B;
6625 }
6626 }
6627 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6628 {
6629 pFpuRes->r80Result = *pr80Val;
6630 fFsw |= X86_FSW_DE;
6631
6632 if (fFcw & X86_FCW_DM)
6633 {
6634 if (fFcw & X86_FCW_PM)
6635 {
6636 fFsw |= X86_FSW_PE;
6637 }
6638 else
6639 {
6640 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6641 }
6642
6643 pFpuRes->r80Result.sj64.uExponent = 1;
6644 }
6645 else
6646 {
6647 fFsw |= X86_FSW_ES | X86_FSW_B;
6648 }
6649 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6650 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6651 {
6652 pFpuRes->r80Result = *pr80Val;
6653 } else {
6654 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6655 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6656 && (fFcw & X86_FCW_IM))
6657 pFpuRes->r80Result = g_r80Indefinite;
6658 else
6659 {
6660 pFpuRes->r80Result = *pr80Val;
6661 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6662 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6663 }
6664
6665 fFsw |= X86_FSW_IE;
6666 if (!(fFcw & X86_FCW_IM))
6667 fFsw |= X86_FSW_ES | X86_FSW_B;
6668 }
6669
6670 pFpuRes->FSW = fFsw;
6671}
6672#endif /* IEM_WITHOUT_ASSEMBLY */
6673
6674IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6675{
6676 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6677}
6678
6679IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6680{
6681 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6682}
6683
6684#ifdef IEM_WITHOUT_ASSEMBLY
6685
6686static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6687{
6688 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6689 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6690 extFloat80_t v;
6691 (void)fFcw;
6692
6693 v = extF80_cos(x, &SoftState);
6694
6695 iemFpuSoftF80ToIprt(pr80Result, v);
6696
6697 return fFsw;
6698}
6699
6700IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6701{
6702 uint16_t const fFcw = pFpuState->FCW;
6703 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6704
6705 if (RTFLOAT80U_IS_ZERO(pr80Val))
6706 {
6707 pFpuRes->r80Result = g_ar80One[0];
6708 }
6709 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6710 {
6711 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6712 {
6713 fFsw |= X86_FSW_C2;
6714 pFpuRes->r80Result = *pr80Val;
6715 }
6716 else
6717 {
6718 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6719 {
6720 pFpuRes->r80Result = g_ar80One[0];
6721
6722 }
6723 else
6724 {
6725 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6726 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6727 }
6728 fFsw |= X86_FSW_PE;
6729 if (!(fFcw & X86_FCW_PM))
6730 fFsw |= X86_FSW_ES | X86_FSW_B;
6731 }
6732 }
6733 else if (RTFLOAT80U_IS_INF(pr80Val))
6734 {
6735 fFsw |= X86_FSW_IE;
6736 if (!(fFcw & X86_FCW_IM))
6737 {
6738 fFsw |= X86_FSW_ES | X86_FSW_B;
6739 pFpuRes->r80Result = *pr80Val;
6740 }
6741 else
6742 {
6743 pFpuRes->r80Result = g_r80Indefinite;
6744 }
6745 }
6746 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6747 {
6748 fFsw |= X86_FSW_DE;
6749
6750 if (fFcw & X86_FCW_DM)
6751 {
6752 pFpuRes->r80Result = g_ar80One[0];
6753
6754 if (fFcw & X86_FCW_PM)
6755 {
6756 fFsw |= X86_FSW_PE;
6757 }
6758 else
6759 {
6760 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6761 }
6762 }
6763 else
6764 {
6765 pFpuRes->r80Result = *pr80Val;
6766 fFsw |= X86_FSW_ES | X86_FSW_B;
6767 }
6768 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6769 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6770 {
6771 pFpuRes->r80Result = *pr80Val;
6772 } else {
6773 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6774 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6775 && (fFcw & X86_FCW_IM))
6776 pFpuRes->r80Result = g_r80Indefinite;
6777 else
6778 {
6779 pFpuRes->r80Result = *pr80Val;
6780 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6781 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6782 }
6783
6784 fFsw |= X86_FSW_IE;
6785 if (!(fFcw & X86_FCW_IM))
6786 fFsw |= X86_FSW_ES | X86_FSW_B;
6787 }
6788
6789 pFpuRes->FSW = fFsw;
6790}
6791#endif /* IEM_WITHOUT_ASSEMBLY */
6792
6793IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6794{
6795 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6796}
6797
6798IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6799{
6800 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6801}
6802
6803#ifdef IEM_WITHOUT_ASSEMBLY
6804
6805static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6806{
6807 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6808 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6809 extFloat80_t r80Sin, r80Cos;
6810 (void)fFcw;
6811
6812 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6813
6814 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6815 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6816
6817 return fFsw;
6818}
6819
6820IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6821{
6822 uint16_t const fFcw = pFpuState->FCW;
6823 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6824
6825 if (RTFLOAT80U_IS_ZERO(pr80Val))
6826 {
6827 pFpuResTwo->r80Result1 = *pr80Val;
6828 pFpuResTwo->r80Result2 = g_ar80One[0];
6829 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6830 }
6831 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6832 {
6833 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6834 {
6835 fFsw |= X86_FSW_C2;
6836
6837 if (fFcw & X86_FCW_IM)
6838 {
6839 pFpuResTwo->r80Result1 = g_r80Indefinite;
6840 }
6841 else
6842 {
6843 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6844 }
6845
6846 pFpuResTwo->r80Result2 = *pr80Val;
6847 }
6848 else
6849 {
6850 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6851
6852 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6853 {
6854 pFpuResTwo->r80Result1 = *pr80Val;
6855 pFpuResTwo->r80Result2 = g_ar80One[0];
6856 }
6857 else
6858 {
6859 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6860 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6861 }
6862 fFsw |= X86_FSW_PE;
6863 if (!(fFcw & X86_FCW_PM))
6864 fFsw |= X86_FSW_ES | X86_FSW_B;
6865 }
6866 }
6867 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6868 {
6869 fFsw |= X86_FSW_DE;
6870
6871 if (fFcw & X86_FCW_DM)
6872 {
6873 pFpuResTwo->r80Result1 = *pr80Val;
6874 pFpuResTwo->r80Result2 = g_ar80One[0];
6875 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6876
6877 if (fFcw & X86_FCW_PM)
6878 {
6879 fFsw |= X86_FSW_PE;
6880 }
6881 else
6882 {
6883 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6884 }
6885
6886 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6887 }
6888 else
6889 {
6890 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6891 pFpuResTwo->r80Result2 = *pr80Val;
6892 fFsw |= X86_FSW_ES | X86_FSW_B;
6893 }
6894 }
6895 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6896 {
6897 fFsw |= X86_FSW_DE;
6898
6899 if (fFcw & X86_FCW_DM)
6900 {
6901 pFpuResTwo->r80Result2 = g_ar80One[0];
6902
6903 if (fFcw & X86_FCW_UM)
6904 {
6905 pFpuResTwo->r80Result1 = *pr80Val;
6906 }
6907 else
6908 {
6909 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6910 uint64_t uMantissa = pr80Val->s.uMantissa;
6911 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6912
6913 uExponent = 64 - uExponent;
6914 uMantissa <<= uExponent;
6915 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6916
6917 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6918 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6919 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6920 }
6921
6922 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6923 fFsw |= X86_FSW_UE | X86_FSW_PE;
6924
6925 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6926 {
6927 /* All the exceptions are masked. */
6928 }
6929 else
6930 {
6931 fFsw |= X86_FSW_ES | X86_FSW_B;
6932 }
6933 }
6934 else
6935 {
6936 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6937 pFpuResTwo->r80Result2 = *pr80Val;
6938 fFsw |= X86_FSW_ES | X86_FSW_B;
6939 }
6940 }
6941 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6942 {
6943 pFpuResTwo->r80Result1 = *pr80Val;
6944 pFpuResTwo->r80Result2 = *pr80Val;
6945 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6946 }
6947 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6948 {
6949 if (fFcw & X86_FCW_IM)
6950 {
6951 pFpuResTwo->r80Result1 = g_r80Indefinite;
6952 pFpuResTwo->r80Result2 = g_r80Indefinite;
6953 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6954 }
6955 else
6956 {
6957 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6958 pFpuResTwo->r80Result2 = *pr80Val;
6959 }
6960
6961 fFsw |= X86_FSW_IE;
6962 if (!(fFcw & X86_FCW_IM))
6963 fFsw |= X86_FSW_ES | X86_FSW_B;
6964 }
6965 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6966 {
6967 pFpuResTwo->r80Result1 = *pr80Val;
6968 pFpuResTwo->r80Result2 = *pr80Val;
6969
6970 if (fFcw & X86_FCW_IM)
6971 {
6972 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6973 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6974 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6975 }
6976 else
6977 {
6978 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6979 pFpuResTwo->r80Result2 = *pr80Val;
6980 }
6981
6982 fFsw |= X86_FSW_IE;
6983 if (!(fFcw & X86_FCW_IM))
6984 fFsw |= X86_FSW_ES | X86_FSW_B;
6985 }
6986 else if (RTFLOAT80U_IS_INF(pr80Val))
6987 {
6988 if (fFcw & X86_FCW_IM)
6989 {
6990 pFpuResTwo->r80Result1 = g_r80Indefinite;
6991 pFpuResTwo->r80Result2 = g_r80Indefinite;
6992 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6993 }
6994 else
6995 {
6996 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6997 pFpuResTwo->r80Result2 = *pr80Val;
6998 }
6999
7000 fFsw |= X86_FSW_IE;
7001 if (!(fFcw & X86_FCW_IM))
7002 fFsw |= X86_FSW_ES | X86_FSW_B;
7003 }
7004
7005 pFpuResTwo->FSW = fFsw;
7006}
7007#endif /* IEM_WITHOUT_ASSEMBLY */
7008
7009IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7010{
7011 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
7012}
7013
7014IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7015{
7016 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
7017}
7018
7019#ifdef IEM_WITHOUT_ASSEMBLY
7020
7021
7022/*********************************************************************************************************************************
7023* x87 FPU Compare and Testing Operations *
7024*********************************************************************************************************************************/
7025
7026IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
7027{
7028 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
7029
7030 if (RTFLOAT80U_IS_ZERO(pr80Val))
7031 fFsw |= X86_FSW_C3;
7032 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
7033 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
7034 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7035 {
7036 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
7037 if (!(pFpuState->FCW & X86_FCW_DM))
7038 fFsw |= X86_FSW_ES | X86_FSW_B;
7039 }
7040 else
7041 {
7042 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7043 if (!(pFpuState->FCW & X86_FCW_IM))
7044 fFsw |= X86_FSW_ES | X86_FSW_B;
7045 }
7046
7047 *pu16Fsw = fFsw;
7048}
7049
7050
7051IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
7052{
7053 RT_NOREF(pFpuState);
7054 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
7055
7056 /* C1 = sign bit (always, even if empty Intel says). */
7057 if (pr80Val->s.fSign)
7058 fFsw |= X86_FSW_C1;
7059
7060 /* Classify the value in C0, C2, C3. */
7061 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
7062 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
7063 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
7064 fFsw |= X86_FSW_C2;
7065 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7066 fFsw |= X86_FSW_C3;
7067 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
7068 fFsw |= X86_FSW_C0;
7069 else if (RTFLOAT80U_IS_INF(pr80Val))
7070 fFsw |= X86_FSW_C0 | X86_FSW_C2;
7071 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7072 fFsw |= X86_FSW_C2 | X86_FSW_C3;
7073 /* whatever else: 0 */
7074
7075 *pu16Fsw = fFsw;
7076}
7077
7078
7079/**
7080 * Worker for fcom, fucom, and friends.
7081 */
7082static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7083 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
7084{
7085 /*
7086 * Unpack the values.
7087 */
7088 bool const fSign1 = pr80Val1->s.fSign;
7089 int32_t iExponent1 = pr80Val1->s.uExponent;
7090 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
7091
7092 bool const fSign2 = pr80Val2->s.fSign;
7093 int32_t iExponent2 = pr80Val2->s.uExponent;
7094 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
7095
7096 /*
7097 * Check for invalid inputs.
7098 */
7099 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
7100 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
7101 {
7102 if (!(fFcw & X86_FCW_IM))
7103 fFsw |= X86_FSW_ES | X86_FSW_B;
7104 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7105 }
7106
7107 /*
7108 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
7109 */
7110 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7111 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7112 {
7113 if ( fIeOnAllNaNs
7114 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7115 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7116 {
7117 fFsw |= X86_FSW_IE;
7118 if (!(fFcw & X86_FCW_IM))
7119 fFsw |= X86_FSW_ES | X86_FSW_B;
7120 }
7121 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
7122 }
7123
7124 /*
7125 * Normalize the values.
7126 */
7127 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7128 {
7129 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7130 iExponent1 = 1;
7131 else
7132 {
7133 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7134 uMantissa1 <<= iExponent1;
7135 iExponent1 = 1 - iExponent1;
7136 }
7137 fFsw |= X86_FSW_DE;
7138 if (!(fFcw & X86_FCW_DM))
7139 fFsw |= X86_FSW_ES | X86_FSW_B;
7140 }
7141
7142 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7143 {
7144 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7145 iExponent2 = 1;
7146 else
7147 {
7148 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7149 uMantissa2 <<= iExponent2;
7150 iExponent2 = 1 - iExponent2;
7151 }
7152 fFsw |= X86_FSW_DE;
7153 if (!(fFcw & X86_FCW_DM))
7154 fFsw |= X86_FSW_ES | X86_FSW_B;
7155 }
7156
7157 /*
7158 * Test if equal (val1 == val2):
7159 */
7160 if ( uMantissa1 == uMantissa2
7161 && iExponent1 == iExponent2
7162 && ( fSign1 == fSign2
7163 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7164 fFsw |= X86_FSW_C3;
7165 /*
7166 * Test if less than (val1 < val2):
7167 */
7168 else if (fSign1 && !fSign2)
7169 fFsw |= X86_FSW_C0;
7170 else if (fSign1 == fSign2)
7171 {
7172 /* Zeros are problematic, however at the most one can be zero here. */
7173 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7174 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7175 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7176 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7177
7178 if ( fSign1
7179 ^ ( iExponent1 < iExponent2
7180 || ( iExponent1 == iExponent2
7181 && uMantissa1 < uMantissa2 ) ) )
7182 fFsw |= X86_FSW_C0;
7183 }
7184 /* else: No flags set if greater. */
7185
7186 return fFsw;
7187}
7188
7189
7190IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7191 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7192{
7193 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7194}
7195
7196
7197
7198
7199IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7200 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7201{
7202 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
7203}
7204
7205
7206IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7207 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7208{
7209 RTFLOAT80U r80Val2;
7210 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7211 Assert(!fFsw || fFsw == X86_FSW_DE);
7212 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7213 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7214 {
7215 if (!(pFpuState->FCW & X86_FCW_DM))
7216 fFsw |= X86_FSW_ES | X86_FSW_B;
7217 *pfFsw |= fFsw;
7218 }
7219}
7220
7221
7222IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7223 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7224{
7225 RTFLOAT80U r80Val2;
7226 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7227 Assert(!fFsw || fFsw == X86_FSW_DE);
7228 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7229 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7230 {
7231 if (!(pFpuState->FCW & X86_FCW_DM))
7232 fFsw |= X86_FSW_ES | X86_FSW_B;
7233 *pfFsw |= fFsw;
7234 }
7235}
7236
7237
7238IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7239 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7240{
7241 RTFLOAT80U r80Val2;
7242 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7243 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7244}
7245
7246
7247IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7248 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7249{
7250 RTFLOAT80U r80Val2;
7251 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7252 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7253}
7254
7255
7256/**
7257 * Worker for fcomi & fucomi.
7258 */
7259static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7260 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7261{
7262 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7263 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7264 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7265 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7266
7267 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7268 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7269 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7270}
7271
7272
7273IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7274 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7275{
7276 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7277}
7278
7279
7280IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7281 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7282{
7283 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7284}
7285
7286
7287/*********************************************************************************************************************************
7288* x87 FPU Other Operations *
7289*********************************************************************************************************************************/
7290
7291/**
7292 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7293 */
7294static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7295{
7296 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7297 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7298 true /*exact / generate #PE */, &SoftState));
7299 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7300}
7301
7302
7303IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7304{
7305 uint16_t const fFcw = pFpuState->FCW;
7306 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7307
7308 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7309 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7310 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7311 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7312 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7313 || RTFLOAT80U_IS_INF(pr80Val))
7314 pFpuRes->r80Result = *pr80Val;
7315 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7316 {
7317 fFsw |= X86_FSW_DE;
7318 if (fFcw & X86_FCW_DM)
7319 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7320 else
7321 {
7322 pFpuRes->r80Result = *pr80Val;
7323 fFsw |= X86_FSW_ES | X86_FSW_B;
7324 }
7325 }
7326 else
7327 {
7328 if (fFcw & X86_FCW_IM)
7329 {
7330 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7331 pFpuRes->r80Result = g_r80Indefinite;
7332 else
7333 {
7334 pFpuRes->r80Result = *pr80Val;
7335 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7336 }
7337 }
7338 else
7339 {
7340 pFpuRes->r80Result = *pr80Val;
7341 fFsw |= X86_FSW_ES | X86_FSW_B;
7342 }
7343 fFsw |= X86_FSW_IE;
7344 }
7345 pFpuRes->FSW = fFsw;
7346}
7347
7348
7349IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7350 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7351{
7352 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7353 it does everything we need it to do. */
7354 uint16_t const fFcw = pFpuState->FCW;
7355 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7356 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7357 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7358 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7359}
7360
7361
7362/**
7363 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7364 */
7365static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7366{
7367 Assert(!pr80Val->s.fSign);
7368 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7369 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7370 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7371}
7372
7373
7374IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7375{
7376 uint16_t const fFcw = pFpuState->FCW;
7377 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7378
7379 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7380 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7381 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7382 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7383 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7384 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7385 pFpuRes->r80Result = *pr80Val;
7386 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7387 {
7388 fFsw |= X86_FSW_DE;
7389 if (fFcw & X86_FCW_DM)
7390 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7391 else
7392 {
7393 pFpuRes->r80Result = *pr80Val;
7394 fFsw |= X86_FSW_ES | X86_FSW_B;
7395 }
7396 }
7397 else
7398 {
7399 if (fFcw & X86_FCW_IM)
7400 {
7401 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7402 pFpuRes->r80Result = g_r80Indefinite;
7403 else
7404 {
7405 pFpuRes->r80Result = *pr80Val;
7406 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7407 }
7408 }
7409 else
7410 {
7411 pFpuRes->r80Result = *pr80Val;
7412 fFsw |= X86_FSW_ES | X86_FSW_B;
7413 }
7414 fFsw |= X86_FSW_IE;
7415 }
7416 pFpuRes->FSW = fFsw;
7417}
7418
7419
7420/**
7421 * @code{.unparsed}
7422 * x x * ln2
7423 * f(x) = 2 - 1 = e - 1
7424 *
7425 * @endcode
7426 *
7427 * We can approximate e^x by a Taylor/Maclaurin series (see
7428 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7429 * @code{.unparsed}
7430 * n 0 1 2 3 4
7431 * inf x x x x x x
7432 * SUM ----- = --- + --- + --- + --- + --- + ...
7433 * n=0 n! 0! 1! 2! 3! 4!
7434 *
7435 * 2 3 4
7436 * x x x
7437 * = 1 + x + --- + --- + --- + ...
7438 * 2! 3! 4!
7439 * @endcode
7440 *
7441 * Given z = x * ln2, we get:
7442 * @code{.unparsed}
7443 * 2 3 4 n
7444 * z z z z z
7445 * e - 1 = z + --- + --- + --- + ... + ---
7446 * 2! 3! 4! n!
7447 * @endcode
7448 *
7449 * Wanting to use Horner's method, we move one z outside and get:
7450 * @code{.unparsed}
7451 * 2 3 (n-1)
7452 * z z z z
7453 * = z ( 1 + --- + --- + --- + ... + ------- )
7454 * 2! 3! 4! n!
7455 * @endcode
7456 *
7457 * The constants we need for using Horner's methods are 1 and 1 / n!.
7458 *
7459 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7460 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7461 * and can approximate it to be 1.0. For a visual demonstration of this
7462 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7463 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7464 *
7465 *
7466 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7467 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7468 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7469 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7470 * blocks). (The one bit difference is probably an implicit one missing from
7471 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7472 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7473 * exponent.
7474 *
7475 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7476 * successfully reproduced the exact results from an Intel 10980XE, there is
7477 * always a portition of rounding differences. Not going to spend too much time
7478 * on getting this 100% the same, at least not now.
7479 *
7480 * P.S. If someone are really curious about 8087 and its contstants:
7481 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7482 *
7483 *
7484 * @param pr80Val The exponent value (x), less than 1.0, greater than
7485 * -1.0 and not zero. This can be a normal, denormal
7486 * or pseudo-denormal value.
7487 * @param pr80Result Where to return the result.
7488 * @param fFcw FPU control word.
7489 * @param fFsw FPU status word.
7490 */
7491static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7492{
7493 /* As mentioned above, we can skip the expensive polynomial calculation
7494 as it will be close enough to 1.0 that it makes no difference.
7495
7496 The cutoff point for intel 10980XE is exponents >= -69. Intel
7497 also seems to be using a 67-bit or 68-bit constant value, and we get
7498 a smattering of rounding differences if we go for higher precision. */
7499 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7500 {
7501 RTUINT256U u256;
7502 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7503 u256.QWords.qw0 |= 1; /* force #PE */
7504 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7505 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7506 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7507 : 1 - RTFLOAT80U_EXP_BIAS,
7508 fFcw, fFsw);
7509 }
7510 else
7511 {
7512#ifdef IEM_WITH_FLOAT128_FOR_FPU
7513 /* This approach is not good enough for small values, we end up with zero. */
7514 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7515 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7516 _Float128 rd128Result = powf128(2.0L, rd128Val);
7517 rd128Result -= 1.0L;
7518 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7519 iemFpuF128RestoreRounding(fOldRounding);
7520
7521# else
7522 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7523 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7524
7525 /* As mentioned above, enforce 68-bit internal mantissa width to better
7526 match the Intel 10980XE results. */
7527 unsigned const cPrecision = 68;
7528
7529 /* first calculate z = x * ln2 */
7530 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7531 cPrecision);
7532
7533 /* Then do the polynomial evaluation. */
7534 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7535 cPrecision, &SoftState);
7536 r = f128_mul(z, r, &SoftState);
7537
7538 /* Output the result. */
7539 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7540# endif
7541 }
7542 return fFsw;
7543}
7544
7545
7546IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7547{
7548 uint16_t const fFcw = pFpuState->FCW;
7549 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7550
7551 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7552 {
7553 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7554 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7555 else
7556 {
7557 /* Special case:
7558 2^+1.0 - 1.0 = 1.0
7559 2^-1.0 - 1.0 = -0.5 */
7560 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7561 && pr80Val->s.uMantissa == RT_BIT_64(63))
7562 {
7563 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7564 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7565 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7566 }
7567 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7568 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7569 else
7570 pFpuRes->r80Result = *pr80Val;
7571 fFsw |= X86_FSW_PE;
7572 if (!(fFcw & X86_FCW_PM))
7573 fFsw |= X86_FSW_ES | X86_FSW_B;
7574 }
7575 }
7576 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7577 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7578 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7579 pFpuRes->r80Result = *pr80Val;
7580 else if (RTFLOAT80U_IS_INF(pr80Val))
7581 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7582 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7583 {
7584 fFsw |= X86_FSW_DE;
7585 if (fFcw & X86_FCW_DM)
7586 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7587 else
7588 {
7589 pFpuRes->r80Result = *pr80Val;
7590 fFsw |= X86_FSW_ES | X86_FSW_B;
7591 }
7592 }
7593 else
7594 {
7595 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7596 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7597 && (fFcw & X86_FCW_IM))
7598 pFpuRes->r80Result = g_r80Indefinite;
7599 else
7600 {
7601 pFpuRes->r80Result = *pr80Val;
7602 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7603 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7604 }
7605 fFsw |= X86_FSW_IE;
7606 if (!(fFcw & X86_FCW_IM))
7607 fFsw |= X86_FSW_ES | X86_FSW_B;
7608 }
7609 pFpuRes->FSW = fFsw;
7610}
7611
7612#endif /* IEM_WITHOUT_ASSEMBLY */
7613
7614IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7615{
7616 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7617}
7618
7619IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7620{
7621 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7622}
7623
7624#ifdef IEM_WITHOUT_ASSEMBLY
7625
7626IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7627{
7628 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7629 pFpuRes->r80Result = *pr80Val;
7630 pFpuRes->r80Result.s.fSign = 0;
7631}
7632
7633
7634IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7635{
7636 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7637 pFpuRes->r80Result = *pr80Val;
7638 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7639}
7640
7641
7642IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7643{
7644 uint16_t const fFcw = pFpuState->FCW;
7645 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7646
7647 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7648 {
7649 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7650 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7651
7652 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7653 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7654 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7655 }
7656 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7657 {
7658 fFsw |= X86_FSW_ZE;
7659 if (fFcw & X86_FCW_ZM)
7660 {
7661 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7662 pFpuResTwo->r80Result2 = *pr80Val;
7663 }
7664 else
7665 {
7666 pFpuResTwo->r80Result2 = *pr80Val;
7667 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7668 }
7669 }
7670 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7671 {
7672 fFsw |= X86_FSW_DE;
7673 if (fFcw & X86_FCW_DM)
7674 {
7675 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7676 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7677 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7678 int32_t iExponent = -16382;
7679 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7680 {
7681 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7682 iExponent--;
7683 }
7684
7685 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7686 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7687 }
7688 else
7689 {
7690 pFpuResTwo->r80Result2 = *pr80Val;
7691 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7692 }
7693 }
7694 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7695 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7696 {
7697 pFpuResTwo->r80Result1 = *pr80Val;
7698 pFpuResTwo->r80Result2 = *pr80Val;
7699 }
7700 else if (RTFLOAT80U_IS_INF(pr80Val))
7701 {
7702 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7703 pFpuResTwo->r80Result2 = *pr80Val;
7704 }
7705 else
7706 {
7707 if (fFcw & X86_FCW_IM)
7708 {
7709 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7710 pFpuResTwo->r80Result1 = g_r80Indefinite;
7711 else
7712 {
7713 pFpuResTwo->r80Result1 = *pr80Val;
7714 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7715 }
7716 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7717 }
7718 else
7719 {
7720 pFpuResTwo->r80Result2 = *pr80Val;
7721 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7722 }
7723 fFsw |= X86_FSW_IE;
7724 }
7725 pFpuResTwo->FSW = fFsw;
7726}
7727#endif /* IEM_WITHOUT_ASSEMBLY */
7728
7729#if defined(IEM_WITHOUT_ASSEMBLY)
7730
7731static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7732{
7733 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7734 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7735 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7736 extFloat80_t v;
7737 (void)fFcw;
7738
7739 v = extF80_ylog2x(y, x, &SoftState);
7740 iemFpuSoftF80ToIprt(pr80Result, v);
7741
7742 return fFsw;
7743}
7744
7745IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7746 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7747{
7748 uint16_t const fFcw = pFpuState->FCW;
7749 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7750
7751 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7752 {
7753 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7754
7755 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7756 if (!(fFcw & X86_FCW_PM))
7757 fFsw |= X86_FSW_ES | X86_FSW_B;
7758 }
7759 else
7760 {
7761 fFsw |= X86_FSW_IE;
7762
7763 if (!(fFcw & X86_FCW_IM))
7764 {
7765 pFpuRes->r80Result = *pr80Val2;
7766 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7767 }
7768 else
7769 {
7770 pFpuRes->r80Result = g_r80Indefinite;
7771 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7772 }
7773 }
7774
7775 pFpuRes->FSW = fFsw;
7776}
7777#endif /* IEM_WITHOUT_ASSEMBLY */
7778
7779IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7780 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7781{
7782 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7783}
7784
7785IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7786 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7787{
7788 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7789}
7790
7791#if defined(IEM_WITHOUT_ASSEMBLY)
7792
7793static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7794{
7795 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7796 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7797 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7798 extFloat80_t v;
7799 (void)fFcw;
7800
7801 v = extF80_ylog2xp1(y, x, &SoftState);
7802 iemFpuSoftF80ToIprt(pr80Result, v);
7803
7804 return fFsw;
7805}
7806
7807IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7808 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7809{
7810 uint16_t const fFcw = pFpuState->FCW;
7811 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7812
7813 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7814 {
7815 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7816
7817 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7818 if (!(fFcw & X86_FCW_PM))
7819 fFsw |= X86_FSW_ES | X86_FSW_B;
7820 }
7821 else
7822 {
7823 fFsw |= X86_FSW_IE;
7824
7825 if (!(fFcw & X86_FCW_IM))
7826 {
7827 pFpuRes->r80Result = *pr80Val2;
7828 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7829 }
7830 else
7831 {
7832 pFpuRes->r80Result = g_r80Indefinite;
7833 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7834 }
7835 }
7836
7837 pFpuRes->FSW = fFsw;
7838}
7839
7840#endif /* IEM_WITHOUT_ASSEMBLY */
7841
7842IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7843 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7844{
7845 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7846}
7847
7848IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7849 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7850{
7851 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7852}
7853
7854
7855/*********************************************************************************************************************************
7856* MMX, SSE & AVX *
7857*********************************************************************************************************************************/
7858
7859/*
7860 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7861 */
7862#ifdef IEM_WITHOUT_ASSEMBLY
7863
7864IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(uint64_t *puDst, uint64_t const *puSrc))
7865{
7866 *puDst &= *puSrc;
7867}
7868
7869
7870IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7871{
7872 puDst->au64[0] &= puSrc->au64[0];
7873 puDst->au64[1] &= puSrc->au64[1];
7874}
7875
7876#endif
7877
7878IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7879{
7880 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7881 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7882}
7883
7884
7885IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7886{
7887 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7888 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7889 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7890 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7891}
7892
7893
7894/*
7895 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7896 */
7897#ifdef IEM_WITHOUT_ASSEMBLY
7898
7899IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(uint64_t *puDst, uint64_t const *puSrc))
7900{
7901 *puDst = ~*puDst & *puSrc;
7902}
7903
7904
7905IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7906{
7907 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7908 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7909}
7910
7911#endif
7912
7913IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7914{
7915 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7916 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7917}
7918
7919
7920IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7921{
7922 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7923 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7924 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7925 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7926}
7927
7928
7929/*
7930 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7931 */
7932#ifdef IEM_WITHOUT_ASSEMBLY
7933
7934IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(uint64_t *puDst, uint64_t const *puSrc))
7935{
7936 *puDst |= *puSrc;
7937}
7938
7939
7940IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7941{
7942 puDst->au64[0] |= puSrc->au64[0];
7943 puDst->au64[1] |= puSrc->au64[1];
7944}
7945
7946#endif
7947
7948IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7949{
7950 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7951 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7952}
7953
7954
7955IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7956{
7957 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7958 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7959 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7960 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7961}
7962
7963
7964/*
7965 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7966 */
7967#ifdef IEM_WITHOUT_ASSEMBLY
7968
7969IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(uint64_t *puDst, uint64_t const *puSrc))
7970{
7971 *puDst ^= *puSrc;
7972}
7973
7974
7975IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7976{
7977 puDst->au64[0] ^= puSrc->au64[0];
7978 puDst->au64[1] ^= puSrc->au64[1];
7979}
7980
7981#endif
7982
7983IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7984{
7985 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7986 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7987}
7988
7989
7990IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7991{
7992 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7993 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7994 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7995 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7996}
7997
7998
7999/*
8000 * PCMPEQB / VPCMPEQB
8001 */
8002#ifdef IEM_WITHOUT_ASSEMBLY
8003
8004IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8005{
8006 RTUINT64U uSrc1 = { *puDst };
8007 RTUINT64U uSrc2 = { *puSrc };
8008 RTUINT64U uDst;
8009 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
8010 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
8011 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
8012 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8013 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8014 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8015 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8016 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8017 *puDst = uDst.u;
8018}
8019
8020
8021IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8022{
8023 RTUINT128U uSrc1 = *puDst;
8024 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8025 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8026 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8027 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8028 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8029 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8030 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8031 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8032 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8033 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8034 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8035 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8036 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8037 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8038 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8039 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8040}
8041
8042#endif
8043
8044IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8045{
8046 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8047 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8048 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8049 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8050 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8051 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8052 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8053 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8054 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8055 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8056 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8057 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8058 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8059 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8060 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8061 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8062}
8063
8064IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8065{
8066 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8067 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8068 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8069 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8070 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8071 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8072 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8073 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8074 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8075 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8076 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8077 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8078 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8079 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8080 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8081 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8082 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8083 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8084 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8085 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8086 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8087 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8088 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8089 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8090 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8091 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8092 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8093 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8094 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8095 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8096 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8097 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8098}
8099
8100
8101/*
8102 * PCMPEQW / VPCMPEQW
8103 */
8104#ifdef IEM_WITHOUT_ASSEMBLY
8105
8106IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8107{
8108 RTUINT64U uSrc1 = { *puDst };
8109 RTUINT64U uSrc2 = { *puSrc };
8110 RTUINT64U uDst;
8111 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8112 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8113 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8114 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8115 *puDst = uDst.u;
8116}
8117
8118
8119IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8120{
8121 RTUINT128U uSrc1 = *puDst;
8122 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8123 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8124 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8125 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8126 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8127 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8128 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8129 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8130}
8131
8132#endif
8133
8134IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8135{
8136 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8137 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8138 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8139 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8140 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8141 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8142 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8143 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8144}
8145
8146IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8147{
8148 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8149 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8150 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8151 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8152 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8153 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8154 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8155 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8156 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8157 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8158 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8159 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8160 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8161 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8162 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8163 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8164}
8165
8166
8167/*
8168 * PCMPEQD / VPCMPEQD.
8169 */
8170#ifdef IEM_WITHOUT_ASSEMBLY
8171
8172IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8173{
8174 RTUINT64U uSrc1 = { *puDst };
8175 RTUINT64U uSrc2 = { *puSrc };
8176 RTUINT64U uDst;
8177 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8178 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8179 *puDst = uDst.u;
8180}
8181
8182
8183IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8184{
8185 RTUINT128U uSrc1 = *puDst;
8186 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8187 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8188 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8189 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8190}
8191
8192#endif /* IEM_WITHOUT_ASSEMBLY */
8193
8194IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8195{
8196 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8197 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8198 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8199 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8200}
8201
8202IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8203{
8204 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8205 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8206 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8207 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8208 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8209 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8210 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8211 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8212}
8213
8214
8215/*
8216 * PCMPEQQ / VPCMPEQQ.
8217 */
8218IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8219{
8220 RTUINT128U uSrc1 = *puDst;
8221 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8222 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8223}
8224
8225IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8226{
8227 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8228 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8229}
8230
8231IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8232{
8233 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8234 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8235 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8236 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8237}
8238
8239
8240/*
8241 * PCMPGTB / VPCMPGTB
8242 */
8243#ifdef IEM_WITHOUT_ASSEMBLY
8244
8245IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8246{
8247 RTUINT64U uSrc1 = { *puDst };
8248 RTUINT64U uSrc2 = { *puSrc };
8249 RTUINT64U uDst;
8250 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8251 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8252 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8253 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8254 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8255 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8256 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8257 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8258 *puDst = uDst.u;
8259}
8260
8261
8262IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8263{
8264 RTUINT128U uSrc1 = *puDst;
8265 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8266 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8267 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8268 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8269 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8270 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8271 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8272 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8273 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8274 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8275 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8276 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8277 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8278 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8279 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8280 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8281}
8282
8283#endif
8284
8285IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8286{
8287 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8288 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8289 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8290 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8291 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8292 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8293 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8294 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8295 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8296 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8297 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8298 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8299 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8300 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8301 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8302 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8303}
8304
8305IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8306{
8307 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8308 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8309 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8310 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8311 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8312 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8313 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8314 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8315 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8316 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8317 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8318 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8319 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8320 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8321 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8322 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8323 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8324 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8325 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8326 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8327 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8328 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8329 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8330 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8331 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8332 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8333 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8334 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8335 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8336 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8337 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8338 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8339}
8340
8341
8342/*
8343 * PCMPGTW / VPCMPGTW
8344 */
8345#ifdef IEM_WITHOUT_ASSEMBLY
8346
8347IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8348{
8349 RTUINT64U uSrc1 = { *puDst };
8350 RTUINT64U uSrc2 = { *puSrc };
8351 RTUINT64U uDst;
8352 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8353 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8354 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8355 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8356 *puDst = uDst.u;
8357}
8358
8359
8360IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8361{
8362 RTUINT128U uSrc1 = *puDst;
8363 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8364 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8365 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8366 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8367 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8368 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8369 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8370 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8371}
8372
8373#endif
8374
8375IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8376{
8377 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8378 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8379 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8380 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8381 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8382 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8383 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8384 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8385}
8386
8387IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8388{
8389 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8390 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8391 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8392 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8393 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8394 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8395 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8396 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8397 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8398 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8399 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8400 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8401 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8402 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8403 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8404 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8405}
8406
8407
8408/*
8409 * PCMPGTD / VPCMPGTD.
8410 */
8411#ifdef IEM_WITHOUT_ASSEMBLY
8412
8413IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8414{
8415 RTUINT64U uSrc1 = { *puDst };
8416 RTUINT64U uSrc2 = { *puSrc };
8417 RTUINT64U uDst;
8418 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8419 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8420 *puDst = uDst.u;
8421}
8422
8423
8424IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8425{
8426 RTUINT128U uSrc1 = *puDst;
8427 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8428 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8429 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8430 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8431}
8432
8433#endif /* IEM_WITHOUT_ASSEMBLY */
8434
8435IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8436{
8437 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8438 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8439 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8440 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8441}
8442
8443IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8444{
8445 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8446 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8447 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8448 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8449 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8450 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8451 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8452 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8453}
8454
8455
8456/*
8457 * PCMPGTQ / VPCMPGTQ.
8458 */
8459IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8460{
8461 RTUINT128U uSrc1 = *puDst;
8462 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8463 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8464}
8465
8466IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8467{
8468 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8469 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8470}
8471
8472IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8473{
8474 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8475 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8476 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8477 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8478}
8479
8480
8481/*
8482 * PADDB / VPADDB
8483 */
8484#ifdef IEM_WITHOUT_ASSEMBLY
8485
8486IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8487{
8488 RTUINT64U uSrc1 = { *puDst };
8489 RTUINT64U uSrc2 = { *puSrc };
8490 RTUINT64U uDst;
8491 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8492 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8493 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8494 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8495 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8496 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8497 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8498 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8499 *puDst = uDst.u;
8500}
8501
8502
8503IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8504{
8505 RTUINT128U uSrc1 = *puDst;
8506 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8507 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8508 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8509 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8510 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8511 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8512 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8513 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8514 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8515 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8516 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8517 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8518 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8519 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8520 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8521 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8522}
8523
8524#endif
8525
8526
8527IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8528{
8529 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8530 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8531 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8532 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8533 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8534 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8535 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8536 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8537 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8538 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8539 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8540 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8541 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8542 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8543 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8544 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8545}
8546
8547IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8548{
8549 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8550 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8551 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8552 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8553 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8554 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8555 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8556 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8557 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8558 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8559 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8560 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8561 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8562 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8563 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8564 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8565 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8566 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8567 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8568 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8569 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8570 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8571 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8572 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8573 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8574 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8575 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8576 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8577 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8578 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8579 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8580 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8581}
8582
8583
8584/*
8585 * PADDSB / VPADDSB
8586 */
/**
 * Narrows a signed word-sized value to a byte with signed saturation.
 *
 * Values that fit into the signed byte range are simply truncated.  Anything
 * else yields 0x7f (INT8_MAX) plus the sign of the input (bit 15), i.e. 0x7f
 * for positive overflow and 0x80 (INT8_MIN) for negative overflow.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff /* biased value outside 0..0xff means out of range */ \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8591
8592#ifdef IEM_WITHOUT_ASSEMBLY
8593
8594IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8595{
8596 RTUINT64U uSrc1 = { *puDst };
8597 RTUINT64U uSrc2 = { *puSrc };
8598 RTUINT64U uDst;
8599 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8600 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8601 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8602 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8603 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8604 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8605 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8606 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8607 *puDst = uDst.u;
8608}
8609
8610
8611IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8612{
8613 RTUINT128U uSrc1 = *puDst;
8614 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8615 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8616 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8617 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8618 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8619 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8620 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8621 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8622 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8623 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8624 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8625 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8626 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8627 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8628 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8629 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8630}
8631
8632#endif
8633
8634IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8635 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8636{
8637 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8638 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8639 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8640 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8641 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8642 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8643 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8644 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8645 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8646 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8647 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8648 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8649 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8650 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8651 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8652 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8653}
8654
8655IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8656 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8657{
8658 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8659 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8660 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8661 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8662 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8663 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8664 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8665 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8666 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8667 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8668 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8669 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8670 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8671 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8672 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8673 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8674 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8675 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8676 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8677 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8678 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8679 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8680 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8681 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8682 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8683 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8684 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8685 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8686 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8687 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8688 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8689 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8690}
8691
8692
8693/*
8694 * PADDUSB / VPADDUSB
8695 */
/** Narrows a 16-bit unsigned value to 8 bits: anything above 0xff (UINT8_MAX)
 *  becomes 0xff, all other values pass through unchanged.
 *  @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8700
8701#ifdef IEM_WITHOUT_ASSEMBLY
8702
8703IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8704{
8705 RTUINT64U uSrc1 = { *puDst };
8706 RTUINT64U uSrc2 = { *puSrc };
8707 RTUINT64U uDst;
8708 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8709 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8710 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8711 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8712 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8713 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8714 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8715 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8716 *puDst = uDst.u;
8717}
8718
8719
8720IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8721{
8722 RTUINT128U uSrc1 = *puDst;
8723 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8724 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8725 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8726 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8727 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8728 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8729 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8730 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8731 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8732 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8733 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8734 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8735 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8736 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8737 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8738 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8739}
8740
8741#endif
8742
8743IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8744 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8745{
8746 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8747 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8748 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8749 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8750 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8751 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8752 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8753 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8754 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8755 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8756 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8757 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8758 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8759 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8760 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8761 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8762}
8763
8764IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8765 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8766{
8767 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8768 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8769 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8770 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8771 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8772 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8773 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8774 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8775 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8776 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8777 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8778 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8779 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8780 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8781 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8782 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8783 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8784 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8785 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8786 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8787 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8788 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8789 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8790 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8791 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8792 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8793 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8794 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8795 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8796 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8797 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8798 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8799}
8800
8801
8802/*
8803 * PADDW / VPADDW
8804 */
8805#ifdef IEM_WITHOUT_ASSEMBLY
8806
8807IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8808{
8809 RTUINT64U uSrc1 = { *puDst };
8810 RTUINT64U uSrc2 = { *puSrc };
8811 RTUINT64U uDst;
8812 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8813 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8814 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8815 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8816 *puDst = uDst.u;
8817}
8818
8819
8820IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8821{
8822 RTUINT128U uSrc1 = *puDst;
8823 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8824 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8825 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8826 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8827 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8828 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8829 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8830 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8831}
8832
8833#endif
8834
8835
8836IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8837{
8838 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8839 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8840 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8841 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8842 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8843 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8844 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8845 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8846}
8847
8848IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8849{
8850 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8851 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8852 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8853 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8854 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8855 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8856 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8857 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8858 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8859 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8860 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8861 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8862 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8863 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8864 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8865 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8866}
8867
8868
8869/*
8870 * PADDSW / VPADDSW
8871 */
/** Narrows a signed 32-bit value to a 16-bit two's complement bit pattern.
 *  Values within -0x8000..0x7fff pass through; out-of-range values become
 *  0x7fff (INT16_MAX), plus one - i.e. 0x8000 (INT16_MIN) - when bit 31 (the
 *  sign) of the input is set.
 *  @note The argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8876
8877#ifdef IEM_WITHOUT_ASSEMBLY
8878
8879IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8880{
8881 RTUINT64U uSrc1 = { *puDst };
8882 RTUINT64U uSrc2 = { *puSrc };
8883 RTUINT64U uDst;
8884 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8885 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8886 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8887 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8888 *puDst = uDst.u;
8889}
8890
8891
8892IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8893{
8894 RTUINT128U uSrc1 = *puDst;
8895 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8896 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8897 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8898 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8899 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8900 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8901 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8902 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8903}
8904
8905#endif
8906
8907IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8908 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8909{
8910 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8911 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8912 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8913 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8914 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8915 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8916 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8917 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8918}
8919
8920IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8921 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8922{
8923 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8924 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8925 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8926 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8927 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8928 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8929 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8930 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8931 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8932 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8933 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8934 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8935 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8936 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8937 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8938 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8939}
8940
8941
8942/*
8943 * PADDUSW / VPADDUSW
8944 */
/** Narrows a 32-bit unsigned value to 16 bits: anything above 0xffff
 *  (UINT16_MAX) becomes 0xffff, all other values pass through unchanged.
 *  @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8949
8950#ifdef IEM_WITHOUT_ASSEMBLY
8951
8952IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8953{
8954 RTUINT64U uSrc1 = { *puDst };
8955 RTUINT64U uSrc2 = { *puSrc };
8956 RTUINT64U uDst;
8957 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8958 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8959 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8960 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8961 *puDst = uDst.u;
8962}
8963
8964
8965IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8966{
8967 RTUINT128U uSrc1 = *puDst;
8968 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8969 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8970 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8971 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8972 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8973 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8974 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8975 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8976}
8977
8978#endif
8979
8980IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
8981 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8982{
8983 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8984 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8985 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8986 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8987 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8988 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8989 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8990 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8991}
8992
8993IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
8994 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8995{
8996 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8997 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8998 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8999 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9000 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9001 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9002 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9003 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9004 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9005 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9006 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9007 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9008 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9009 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9010 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9011 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9012}
9013
9014
9015/*
9016 * PADDD / VPADDD.
9017 */
9018#ifdef IEM_WITHOUT_ASSEMBLY
9019
9020IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9021{
9022 RTUINT64U uSrc1 = { *puDst };
9023 RTUINT64U uSrc2 = { *puSrc };
9024 RTUINT64U uDst;
9025 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9026 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9027 *puDst = uDst.u;
9028}
9029
9030
9031IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9032{
9033 RTUINT128U uSrc1 = *puDst;
9034 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9035 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9036 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9037 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9038}
9039
9040#endif /* IEM_WITHOUT_ASSEMBLY */
9041
9042IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9043{
9044 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9045 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9046 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9047 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9048}
9049
9050IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9051{
9052 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9053 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9054 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9055 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9056 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9057 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9058 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9059 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9060}
9061
9062
9063/*
9064 * PADDQ / VPADDQ.
9065 */
9066#ifdef IEM_WITHOUT_ASSEMBLY
9067
9068IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9069{
9070 *puDst = *puDst + *puSrc;
9071}
9072
9073IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9074{
9075 RTUINT128U uSrc1 = *puDst;
9076 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9077 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9078}
9079
9080#endif
9081
9082IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9083{
9084 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9085 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9086}
9087
9088IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9089{
9090 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9091 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9092 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9093 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9094}
9095
9096
9097/*
9098 * PSUBB / VPSUBB
9099 */
9100#ifdef IEM_WITHOUT_ASSEMBLY
9101
9102IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9103{
9104 RTUINT64U uSrc1 = { *puDst };
9105 RTUINT64U uSrc2 = { *puSrc };
9106 RTUINT64U uDst;
9107 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9108 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9109 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9110 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9111 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9112 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9113 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9114 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9115 *puDst = uDst.u;
9116}
9117
9118
9119IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9120{
9121 RTUINT128U uSrc1 = *puDst;
9122 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9123 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9124 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9125 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9126 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9127 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9128 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9129 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9130 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9131 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9132 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9133 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9134 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9135 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9136 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9137 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9138}
9139
9140#endif
9141
9142IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9143{
9144 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9145 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9146 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9147 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9148 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9149 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9150 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9151 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9152 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9153 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9154 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9155 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9156 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9157 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9158 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9159 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9160}
9161
9162IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9163{
9164 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9165 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9166 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9167 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9168 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9169 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9170 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9171 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9172 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9173 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9174 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9175 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9176 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9177 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9178 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9179 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9180 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9181 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9182 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9183 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9184 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9185 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9186 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9187 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9188 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9189 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9190 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9191 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9192 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9193 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9194 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9195 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9196}
9197
9198
/*
 * PSUBSB / VPSUBSB
 */
9202#ifdef IEM_WITHOUT_ASSEMBLY
9203
9204IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9205{
9206 RTUINT64U uSrc1 = { *puDst };
9207 RTUINT64U uSrc2 = { *puSrc };
9208 RTUINT64U uDst;
9209 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9210 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9211 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9212 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9213 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9214 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9215 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9216 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9217 *puDst = uDst.u;
9218}
9219
9220
9221IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9222{
9223 RTUINT128U uSrc1 = *puDst;
9224 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9225 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9226 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9227 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9228 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9229 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9230 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9231 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9232 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9233 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9234 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9235 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9236 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9237 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9238 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9239 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9240}
9241
9242#endif
9243
9244IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9245 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9246{
9247 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9248 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9249 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9250 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9251 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9252 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9253 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9254 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9255 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9256 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9257 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9258 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9259 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9260 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9261 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9262 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9263}
9264
9265IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9266 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9267{
9268 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9269 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9270 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9271 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9272 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9273 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9274 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9275 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9276 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9277 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9278 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9279 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9280 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9281 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9282 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9283 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9284 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9285 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9286 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9287 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9288 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9289 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9290 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9291 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9292 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9293 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9294 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9295 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9296 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9297 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9298 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9299 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9300}
9301
9302
9303/*
 * PSUBUSB / VPSUBUSB
9305 */
/** Narrows the result of an unsigned byte subtraction to a byte: anything that
 * does not fit in 0..0xff when viewed as uint16_t (i.e. a borrow/negative
 * difference) becomes zero.
 * @note The argument is evaluated twice, so it must be free of side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9310
9311#ifdef IEM_WITHOUT_ASSEMBLY
9312
9313IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9314{
9315 RTUINT64U uSrc1 = { *puDst };
9316 RTUINT64U uSrc2 = { *puSrc };
9317 RTUINT64U uDst;
9318 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9319 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9320 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9321 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9322 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9323 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9324 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9325 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9326 *puDst = uDst.u;
9327}
9328
9329
9330IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9331{
9332 RTUINT128U uSrc1 = *puDst;
9333 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9334 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9335 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9336 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9337 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9338 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9339 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9340 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9341 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9342 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9343 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9344 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9345 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9346 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9347 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9348 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9349}
9350
9351#endif
9352
9353IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9354 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9355{
9356 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9357 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9358 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9359 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9360 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9361 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9362 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9363 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9364 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9365 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9366 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9367 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9368 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9369 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9370 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9371 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9372}
9373
9374IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9375 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9376{
9377 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9378 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9379 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9380 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9381 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9382 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9383 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9384 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9385 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9386 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9387 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9388 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9389 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9390 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9391 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9392 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9393 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9394 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9395 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9396 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9397 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9398 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9399 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9400 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9401 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9402 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9403 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9404 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9405 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9406 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9407 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9408 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9409}
9410
9411
9412/*
9413 * PSUBW / VPSUBW
9414 */
9415#ifdef IEM_WITHOUT_ASSEMBLY
9416
9417IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9418{
9419 RTUINT64U uSrc1 = { *puDst };
9420 RTUINT64U uSrc2 = { *puSrc };
9421 RTUINT64U uDst;
9422 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9423 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9424 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9425 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9426 *puDst = uDst.u;
9427}
9428
9429
9430IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9431{
9432 RTUINT128U uSrc1 = *puDst;
9433 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9434 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9435 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9436 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9437 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9438 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9439 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9440 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9441}
9442
9443#endif
9444
9445IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9446{
9447 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9448 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9449 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9450 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9451 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9452 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9453 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9454 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9455}
9456
9457IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9458{
9459 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9460 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9461 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9462 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9463 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9464 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9465 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9466 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9467 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9468 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9469 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9470 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9471 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9472 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9473 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9474 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9475}
9476
9477
9478/*
9479 * PSUBSW / VPSUBSW
9480 */
9481#ifdef IEM_WITHOUT_ASSEMBLY
9482
9483IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9484{
9485 RTUINT64U uSrc1 = { *puDst };
9486 RTUINT64U uSrc2 = { *puSrc };
9487 RTUINT64U uDst;
9488 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9489 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9490 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9491 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9492 *puDst = uDst.u;
9493}
9494
9495
9496IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9497{
9498 RTUINT128U uSrc1 = *puDst;
9499 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9500 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9501 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9502 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9503 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9504 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9505 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9506 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9507}
9508
9509#endif
9510
9511IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9512 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9513{
9514 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9515 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9516 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9517 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9518 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9519 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9520 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9521 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9522}
9523
9524IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9525 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9526{
9527 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9528 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9529 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9530 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9531 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9532 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9533 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9534 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9535 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9536 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9537 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9538 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9539 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9540 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9541 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9542 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9543}
9544
9545
9546/*
9547 * PSUBUSW / VPSUBUSW
9548 */
/** Narrows the result of an unsigned word subtraction to a word: anything that
 * does not fit in 0..0xffff when viewed as uint32_t (i.e. a borrow/negative
 * difference) becomes zero.
 * @note The argument is evaluated twice, so it must be free of side effects. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9553
9554#ifdef IEM_WITHOUT_ASSEMBLY
9555
9556IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9557{
9558 RTUINT64U uSrc1 = { *puDst };
9559 RTUINT64U uSrc2 = { *puSrc };
9560 RTUINT64U uDst;
9561 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9562 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9563 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9564 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9565 *puDst = uDst.u;
9566}
9567
9568
9569IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9570{
9571 RTUINT128U uSrc1 = *puDst;
9572 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9573 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9574 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9575 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9576 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9577 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9578 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9579 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9580}
9581
9582#endif
9583
9584IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9585 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9586{
9587 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9588 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9589 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9590 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9591 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9592 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9593 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9594 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9595}
9596
9597IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9598 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9599{
9600 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9601 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9602 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9603 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9604 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9605 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9606 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9607 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9608 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9609 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9610 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9611 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9612 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9613 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9614 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9615 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9616}
9617
9618
9619
9620/*
9621 * PSUBD / VPSUBD.
9622 */
9623#ifdef IEM_WITHOUT_ASSEMBLY
9624
9625IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9626{
9627 RTUINT64U uSrc1 = { *puDst };
9628 RTUINT64U uSrc2 = { *puSrc };
9629 RTUINT64U uDst;
9630 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9631 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9632 *puDst = uDst.u;
9633}
9634
9635
9636IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9637{
9638 RTUINT128U uSrc1 = *puDst;
9639 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9640 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9641 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9642 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9643}
9644
9645#endif /* IEM_WITHOUT_ASSEMBLY */
9646
9647IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9648{
9649 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9650 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9651 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9652 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9653}
9654
9655IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9656{
9657 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9658 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9659 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9660 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9661 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9662 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9663 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9664 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9665}
9666
9667
9668/*
9669 * PSUBQ / VPSUBQ.
9670 */
9671#ifdef IEM_WITHOUT_ASSEMBLY
9672
9673IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9674{
9675 *puDst = *puDst - *puSrc;
9676}
9677
9678IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9679{
9680 RTUINT128U uSrc1 = *puDst;
9681 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9682 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9683}
9684
9685#endif
9686
9687IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9688{
9689 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9690 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9691}
9692
9693IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9694{
9695 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9696 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9697 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9698 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9699}
9700
9701
9702
9703/*
9704 * PMULLW / VPMULLW / PMULLD / VPMULLD
9705 */
9706#ifdef IEM_WITHOUT_ASSEMBLY
9707
9708IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9709{
9710 RTUINT64U uSrc1 = { *puDst };
9711 RTUINT64U uSrc2 = { *puSrc };
9712 RTUINT64U uDst;
9713 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9714 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9715 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9716 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9717 *puDst = uDst.u;
9718}
9719
9720
9721IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9722{
9723 RTUINT128U uSrc1 = *puDst;
9724 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9725 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9726 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9727 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9728 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9729 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9730 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9731 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9732}
9733
9734#endif
9735
9736IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9737{
9738 RTUINT128U uSrc1 = *puDst;
9739
9740 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9741 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9742 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9743 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9744}
9745
9746
9747IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9748{
9749 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9750 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9751 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9752 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9753 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9754 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9755 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9756 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9757}
9758
9759
9760IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9761{
9762 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9763 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9764 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9765 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9766 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9767 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9768 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9769 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9770 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9771 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9772 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9773 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9774 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9775 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9776 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9777 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9778}
9779
9780
9781IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9782{
9783 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9784 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9785 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9786 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9787}
9788
9789
9790IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9791{
9792 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9793 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9794 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9795 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9796 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9797 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9798 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9799 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9800}
9801
9802
9803/*
9804 * PMULHW / VPMULHW
9805 */
9806#ifdef IEM_WITHOUT_ASSEMBLY
9807
9808IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9809{
9810 RTUINT64U uSrc1 = { *puDst };
9811 RTUINT64U uSrc2 = { *puSrc };
9812 RTUINT64U uDst;
9813 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9814 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9815 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9816 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9817 *puDst = uDst.u;
9818}
9819
9820
9821IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9822{
9823 RTUINT128U uSrc1 = *puDst;
9824 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9825 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9826 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9827 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9828 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9829 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9830 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9831 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9832}
9833
9834#endif
9835
9836IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9837{
9838 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9839 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9840 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9841 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9842 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9843 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9844 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9845 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9846}
9847
9848
9849IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9850{
9851 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9852 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9853 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9854 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9855 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9856 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9857 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9858 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9859 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9860 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9861 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9862 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9863 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9864 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9865 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9866 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9867}
9868
9869
9870/*
9871 * PMULHUW / VPMULHUW
9872 */
9873#ifdef IEM_WITHOUT_ASSEMBLY
9874
9875IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9876{
9877 RTUINT64U uSrc1 = { *puDst };
9878 RTUINT64U uSrc2 = { *puSrc };
9879 RTUINT64U uDst;
9880 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9881 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9882 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9883 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9884 *puDst = uDst.u;
9885}
9886
9887
9888IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9889{
9890 RTUINT128U uSrc1 = *puDst;
9891 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9892 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9893 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9894 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9895 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9896 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9897 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9898 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9899}
9900
9901#endif
9902
9903IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9904{
9905 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9906 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9907 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9908 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9909 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9910 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9911 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9912 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9913}
9914
9915
9916IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9917{
9918 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9919 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9920 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9921 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9922 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9923 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9924 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9925 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9926 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9927 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9928 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9929 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9930 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9931 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9932 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9933 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9934}
9935
9936
9937/*
9938 * PSRLW / VPSRLW
9939 */
9940#ifdef IEM_WITHOUT_ASSEMBLY
9941
9942IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9943{
9944 RTUINT64U uSrc1 = { *puDst };
9945 RTUINT64U uSrc2 = { *puSrc };
9946 RTUINT64U uDst;
9947
9948 if (uSrc2.au64[0] <= 15)
9949 {
9950 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9951 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9952 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9953 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9954 }
9955 else
9956 {
9957 uDst.au64[0] = 0;
9958 }
9959 *puDst = uDst.u;
9960}
9961
9962
9963IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9964{
9965 RTUINT64U uSrc1 = { *puDst };
9966 RTUINT64U uDst;
9967
9968 if (uShift <= 15)
9969 {
9970 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9971 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9972 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9973 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9974 }
9975 else
9976 {
9977 uDst.au64[0] = 0;
9978 }
9979 *puDst = uDst.u;
9980}
9981
9982
9983IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9984{
9985 RTUINT128U uSrc1 = *puDst;
9986
9987 if (puSrc->au64[0] <= 15)
9988 {
9989 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9990 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9991 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9992 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9993 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9994 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9995 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9996 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9997 }
9998 else
9999 {
10000 puDst->au64[0] = 0;
10001 puDst->au64[1] = 0;
10002 }
10003}
10004
10005IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10006{
10007 RTUINT128U uSrc1 = *puDst;
10008
10009 if (uShift <= 15)
10010 {
10011 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10012 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10013 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10014 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10015 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10016 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10017 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10018 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10019 }
10020 else
10021 {
10022 puDst->au64[0] = 0;
10023 puDst->au64[1] = 0;
10024 }
10025}
10026
10027#endif
10028
10029IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10030{
10031 RTUINT128U uSrc1 = *puSrc1;
10032
10033 if (uShift <= 15)
10034 {
10035 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10036 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10037 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10038 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10039 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10040 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10041 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10042 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10043 }
10044 else
10045 {
10046 puDst->au64[0] = 0;
10047 puDst->au64[1] = 0;
10048 }
10049}
10050
10051IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10052{
10053 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10054}
10055
10056IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10057{
10058 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
10059}
10060
10061IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10062{
10063 RTUINT256U uSrc1 = *puSrc1;
10064
10065 if (uShift <= 15)
10066 {
10067 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10068 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10069 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10070 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10071 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10072 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10073 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10074 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10075 puDst->au16[8] = uSrc1.au16[8] >> uShift;
10076 puDst->au16[9] = uSrc1.au16[9] >> uShift;
10077 puDst->au16[10] = uSrc1.au16[10] >> uShift;
10078 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10079 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10080 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10081 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10082 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10083 }
10084 else
10085 {
10086 puDst->au64[0] = 0;
10087 puDst->au64[1] = 0;
10088 puDst->au64[2] = 0;
10089 puDst->au64[3] = 0;
10090 }
10091}
10092
10093IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10094{
10095 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10096}
10097
10098IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10099{
10100 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10101}
10102
10103
10104/*
10105 * PSRAW / VPSRAW
10106 */
10107#ifdef IEM_WITHOUT_ASSEMBLY
10108
10109IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10110{
10111 RTUINT64U uSrc1 = { *puDst };
10112 RTUINT64U uSrc2 = { *puSrc };
10113 RTUINT64U uDst;
10114 uint8_t uShift;
10115
10116 uShift = RT_MIN(15, uSrc2.au64[0]);
10117
10118 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10119 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10120 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10121 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10122
10123 *puDst = uDst.u;
10124}
10125
10126
10127IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10128{
10129 RTUINT64U uSrc1 = { *puDst };
10130 RTUINT64U uDst;
10131
10132 uShift = RT_MIN(15, uShift);
10133
10134 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10135 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10136 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10137 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10138
10139 *puDst = uDst.u;
10140}
10141
10142
10143IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10144{
10145 RTUINT128U uSrc1 = *puDst;
10146 uint8_t uShift;
10147
10148 uShift = RT_MIN(15, puSrc->au64[0]);
10149
10150 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10151 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10152 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10153 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10154 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10155 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10156 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10157 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10158}
10159
10160IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10161{
10162 RTUINT128U uSrc1 = *puDst;
10163
10164 uShift = RT_MIN(15, uShift);
10165
10166 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10167 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10168 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10169 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10170 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10171 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10172 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10173 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10174}
10175
10176#endif
10177
10178IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10179{
10180 RTUINT128U uSrc1 = *puSrc1;
10181
10182 uShift = RT_MIN(15, uShift);
10183
10184 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10185 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10186 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10187 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10188 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10189 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10190 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10191 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10192}
10193
10194IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10195{
10196 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10197}
10198
10199IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10200{
10201 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10202}
10203
10204IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10205{
10206 RTUINT256U uSrc1 = *puSrc1;
10207
10208 uShift = RT_MIN(15, uShift);
10209
10210 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10211 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10212 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10213 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10214 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10215 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10216 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10217 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10218 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10219 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10220 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10221 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10222 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10223 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10224 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10225 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10226}
10227
10228IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10229{
10230 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10231}
10232
10233IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10234{
10235 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10236}
10237
10238
10239/*
10240 * PSLLW / VPSLLW
10241 */
10242#ifdef IEM_WITHOUT_ASSEMBLY
10243
10244IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10245{
10246 RTUINT64U uSrc1 = { *puDst };
10247 RTUINT64U uSrc2 = { *puSrc };
10248 RTUINT64U uDst;
10249
10250 if (uSrc2.au64[0] <= 15)
10251 {
10252 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10253 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10254 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10255 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10256 }
10257 else
10258 {
10259 uDst.au64[0] = 0;
10260 }
10261 *puDst = uDst.u;
10262}
10263
10264
10265IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10266{
10267 RTUINT64U uSrc1 = { *puDst };
10268 RTUINT64U uDst;
10269
10270 if (uShift <= 15)
10271 {
10272 uDst.au16[0] = uSrc1.au16[0] << uShift;
10273 uDst.au16[1] = uSrc1.au16[1] << uShift;
10274 uDst.au16[2] = uSrc1.au16[2] << uShift;
10275 uDst.au16[3] = uSrc1.au16[3] << uShift;
10276 }
10277 else
10278 {
10279 uDst.au64[0] = 0;
10280 }
10281 *puDst = uDst.u;
10282}
10283
10284
10285IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10286{
10287 RTUINT128U uSrc1 = *puDst;
10288
10289 if (puSrc->au64[0] <= 15)
10290 {
10291 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10292 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10293 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10294 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10295 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10296 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10297 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10298 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10299 }
10300 else
10301 {
10302 puDst->au64[0] = 0;
10303 puDst->au64[1] = 0;
10304 }
10305}
10306
10307IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10308{
10309 RTUINT128U uSrc1 = *puDst;
10310
10311 if (uShift <= 15)
10312 {
10313 puDst->au16[0] = uSrc1.au16[0] << uShift;
10314 puDst->au16[1] = uSrc1.au16[1] << uShift;
10315 puDst->au16[2] = uSrc1.au16[2] << uShift;
10316 puDst->au16[3] = uSrc1.au16[3] << uShift;
10317 puDst->au16[4] = uSrc1.au16[4] << uShift;
10318 puDst->au16[5] = uSrc1.au16[5] << uShift;
10319 puDst->au16[6] = uSrc1.au16[6] << uShift;
10320 puDst->au16[7] = uSrc1.au16[7] << uShift;
10321 }
10322 else
10323 {
10324 puDst->au64[0] = 0;
10325 puDst->au64[1] = 0;
10326 }
10327}
10328
10329#endif
10330
10331IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10332{
10333 RTUINT128U uSrc1 = *puSrc1;
10334
10335 if (uShift <= 15)
10336 {
10337 puDst->au16[0] = uSrc1.au16[0] << uShift;
10338 puDst->au16[1] = uSrc1.au16[1] << uShift;
10339 puDst->au16[2] = uSrc1.au16[2] << uShift;
10340 puDst->au16[3] = uSrc1.au16[3] << uShift;
10341 puDst->au16[4] = uSrc1.au16[4] << uShift;
10342 puDst->au16[5] = uSrc1.au16[5] << uShift;
10343 puDst->au16[6] = uSrc1.au16[6] << uShift;
10344 puDst->au16[7] = uSrc1.au16[7] << uShift;
10345 }
10346 else
10347 {
10348 puDst->au64[0] = 0;
10349 puDst->au64[1] = 0;
10350 }
10351}
10352
10353IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10354{
10355 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10356}
10357
10358IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10359{
10360 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10361}
10362
10363IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10364{
10365 RTUINT256U uSrc1 = *puSrc1;
10366
10367 if (uShift <= 15)
10368 {
10369 puDst->au16[0] = uSrc1.au16[0] << uShift;
10370 puDst->au16[1] = uSrc1.au16[1] << uShift;
10371 puDst->au16[2] = uSrc1.au16[2] << uShift;
10372 puDst->au16[3] = uSrc1.au16[3] << uShift;
10373 puDst->au16[4] = uSrc1.au16[4] << uShift;
10374 puDst->au16[5] = uSrc1.au16[5] << uShift;
10375 puDst->au16[6] = uSrc1.au16[6] << uShift;
10376 puDst->au16[7] = uSrc1.au16[7] << uShift;
10377 puDst->au16[8] = uSrc1.au16[8] << uShift;
10378 puDst->au16[9] = uSrc1.au16[9] << uShift;
10379 puDst->au16[10] = uSrc1.au16[10] << uShift;
10380 puDst->au16[11] = uSrc1.au16[11] << uShift;
10381 puDst->au16[12] = uSrc1.au16[12] << uShift;
10382 puDst->au16[13] = uSrc1.au16[13] << uShift;
10383 puDst->au16[14] = uSrc1.au16[14] << uShift;
10384 puDst->au16[15] = uSrc1.au16[15] << uShift;
10385 }
10386 else
10387 {
10388 puDst->au64[0] = 0;
10389 puDst->au64[1] = 0;
10390 puDst->au64[2] = 0;
10391 puDst->au64[3] = 0;
10392 }
10393}
10394
10395IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10396{
10397 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10398}
10399
10400IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10401{
10402 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10403}
10404
10405/*
10406 * PSRLD / VPSRLD
10407 */
10408#ifdef IEM_WITHOUT_ASSEMBLY
10409
10410IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10411{
10412 RTUINT64U uSrc1 = { *puDst };
10413 RTUINT64U uSrc2 = { *puSrc };
10414 RTUINT64U uDst;
10415
10416 if (uSrc2.au64[0] <= 31)
10417 {
10418 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10419 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10420 }
10421 else
10422 {
10423 uDst.au64[0] = 0;
10424 }
10425 *puDst = uDst.u;
10426}
10427
10428
10429IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10430{
10431 RTUINT64U uSrc1 = { *puDst };
10432 RTUINT64U uDst;
10433
10434 if (uShift <= 31)
10435 {
10436 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10437 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10438 }
10439 else
10440 {
10441 uDst.au64[0] = 0;
10442 }
10443 *puDst = uDst.u;
10444}
10445
10446
10447IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10448{
10449 RTUINT128U uSrc1 = *puDst;
10450
10451 if (puSrc->au64[0] <= 31)
10452 {
10453 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10454 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10455 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10456 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10457 }
10458 else
10459 {
10460 puDst->au64[0] = 0;
10461 puDst->au64[1] = 0;
10462 }
10463}
10464
10465IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10466{
10467 RTUINT128U uSrc1 = *puDst;
10468
10469 if (uShift <= 31)
10470 {
10471 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10472 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10473 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10474 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10475 }
10476 else
10477 {
10478 puDst->au64[0] = 0;
10479 puDst->au64[1] = 0;
10480 }
10481}
10482
10483#endif
10484
10485IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10486{
10487 RTUINT128U uSrc1 = *puSrc1;
10488
10489 if (uShift <= 31)
10490 {
10491 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10492 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10493 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10494 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10495 }
10496 else
10497 {
10498 puDst->au64[0] = 0;
10499 puDst->au64[1] = 0;
10500 }
10501}
10502
10503IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10504{
10505 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10506}
10507
10508IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10509{
10510 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10511}
10512
10513IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10514{
10515 RTUINT256U uSrc1 = *puSrc1;
10516
10517 if (uShift <= 31)
10518 {
10519 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10520 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10521 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10522 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10523 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10524 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10525 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10526 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10527 }
10528 else
10529 {
10530 puDst->au64[0] = 0;
10531 puDst->au64[1] = 0;
10532 puDst->au64[2] = 0;
10533 puDst->au64[3] = 0;
10534 }
10535}
10536
10537IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10538{
10539 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10540}
10541
10542IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10543{
10544 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10545}
10546
10547
10548/*
10549 * PSRAD / VPSRAD
10550 */
10551#ifdef IEM_WITHOUT_ASSEMBLY
10552
10553IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10554{
10555 RTUINT64U uSrc1 = { *puDst };
10556 RTUINT64U uSrc2 = { *puSrc };
10557 RTUINT64U uDst;
10558 uint8_t uShift;
10559
10560 uShift = RT_MIN(31, uSrc2.au64[0]);
10561
10562 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10563 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10564
10565 *puDst = uDst.u;
10566}
10567
10568
10569IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10570{
10571 RTUINT64U uSrc1 = { *puDst };
10572 RTUINT64U uDst;
10573
10574 uShift = RT_MIN(31, uShift);
10575
10576 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10577 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10578
10579 *puDst = uDst.u;
10580}
10581
10582
10583IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10584{
10585 RTUINT128U uSrc1 = *puDst;
10586 uint8_t uShift;
10587
10588 uShift = RT_MIN(31, puSrc->au64[0]);
10589
10590 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10591 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10592 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10593 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10594}
10595
10596IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10597{
10598 RTUINT128U uSrc1 = *puDst;
10599
10600 uShift = RT_MIN(31, uShift);
10601
10602 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10603 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10604 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10605 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10606}
10607
10608#endif
10609
10610IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10611{
10612 RTUINT128U uSrc1 = *puSrc1;
10613
10614 uShift = RT_MIN(31, uShift);
10615
10616 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10617 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10618 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10619 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10620}
10621
10622IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10623{
10624 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10625}
10626
10627IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10628{
10629 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10630}
10631
10632IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10633{
10634 RTUINT256U uSrc1 = *puSrc1;
10635
10636 uShift = RT_MIN(31, uShift);
10637
10638 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10639 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10640 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10641 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10642 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10643 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10644 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10645 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10646}
10647
10648IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10649{
10650 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10651}
10652
10653IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10654{
10655 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10656}
10657
10658
10659/*
10660 * PSLLD / VPSLLD
10661 */
10662#ifdef IEM_WITHOUT_ASSEMBLY
10663
10664IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10665{
10666 RTUINT64U uSrc1 = { *puDst };
10667 RTUINT64U uSrc2 = { *puSrc };
10668 RTUINT64U uDst;
10669
10670 if (uSrc2.au64[0] <= 31)
10671 {
10672 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10673 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10674 }
10675 else
10676 {
10677 uDst.au64[0] = 0;
10678 }
10679 *puDst = uDst.u;
10680}
10681
10682
10683IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10684{
10685 RTUINT64U uSrc1 = { *puDst };
10686 RTUINT64U uDst;
10687
10688 if (uShift <= 31)
10689 {
10690 uDst.au32[0] = uSrc1.au32[0] << uShift;
10691 uDst.au32[1] = uSrc1.au32[1] << uShift;
10692 }
10693 else
10694 {
10695 uDst.au64[0] = 0;
10696 }
10697 *puDst = uDst.u;
10698}
10699
10700
10701IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10702{
10703 RTUINT128U uSrc1 = *puDst;
10704
10705 if (puSrc->au64[0] <= 31)
10706 {
10707 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10708 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10709 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10710 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10711 }
10712 else
10713 {
10714 puDst->au64[0] = 0;
10715 puDst->au64[1] = 0;
10716 }
10717}
10718
10719IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10720{
10721 RTUINT128U uSrc1 = *puDst;
10722
10723 if (uShift <= 31)
10724 {
10725 puDst->au32[0] = uSrc1.au32[0] << uShift;
10726 puDst->au32[1] = uSrc1.au32[1] << uShift;
10727 puDst->au32[2] = uSrc1.au32[2] << uShift;
10728 puDst->au32[3] = uSrc1.au32[3] << uShift;
10729 }
10730 else
10731 {
10732 puDst->au64[0] = 0;
10733 puDst->au64[1] = 0;
10734 }
10735}
10736
10737#endif
10738
10739IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10740{
10741 RTUINT128U uSrc1 = *puSrc1;
10742
10743 if (uShift <= 31)
10744 {
10745 puDst->au32[0] = uSrc1.au32[0] << uShift;
10746 puDst->au32[1] = uSrc1.au32[1] << uShift;
10747 puDst->au32[2] = uSrc1.au32[2] << uShift;
10748 puDst->au32[3] = uSrc1.au32[3] << uShift;
10749 }
10750 else
10751 {
10752 puDst->au64[0] = 0;
10753 puDst->au64[1] = 0;
10754 }
10755}
10756
10757IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10758{
10759 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10760}
10761
10762IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10763{
10764 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10765}
10766
10767IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10768{
10769 RTUINT256U uSrc1 = *puSrc1;
10770
10771 if (uShift <= 31)
10772 {
10773 puDst->au32[0] = uSrc1.au32[0] << uShift;
10774 puDst->au32[1] = uSrc1.au32[1] << uShift;
10775 puDst->au32[2] = uSrc1.au32[2] << uShift;
10776 puDst->au32[3] = uSrc1.au32[3] << uShift;
10777 puDst->au32[4] = uSrc1.au32[4] << uShift;
10778 puDst->au32[5] = uSrc1.au32[5] << uShift;
10779 puDst->au32[6] = uSrc1.au32[6] << uShift;
10780 puDst->au32[7] = uSrc1.au32[7] << uShift;
10781 }
10782 else
10783 {
10784 puDst->au64[0] = 0;
10785 puDst->au64[1] = 0;
10786 puDst->au64[2] = 0;
10787 puDst->au64[3] = 0;
10788 }
10789}
10790
10791IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10792{
10793 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10794}
10795
10796IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10797{
10798 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10799}
10800
10801
10802/*
10803 * PSRLQ / VPSRLQ
10804 */
10805#ifdef IEM_WITHOUT_ASSEMBLY
10806
10807IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10808{
10809 RTUINT64U uSrc1 = { *puDst };
10810 RTUINT64U uSrc2 = { *puSrc };
10811 RTUINT64U uDst;
10812
10813 if (uSrc2.au64[0] <= 63)
10814 {
10815 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10816 }
10817 else
10818 {
10819 uDst.au64[0] = 0;
10820 }
10821 *puDst = uDst.u;
10822}
10823
10824
10825IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10826{
10827 RTUINT64U uSrc1 = { *puDst };
10828 RTUINT64U uDst;
10829
10830 if (uShift <= 63)
10831 {
10832 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10833 }
10834 else
10835 {
10836 uDst.au64[0] = 0;
10837 }
10838 *puDst = uDst.u;
10839}
10840
10841
10842IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10843{
10844 RTUINT128U uSrc1 = *puDst;
10845
10846 if (puSrc->au64[0] <= 63)
10847 {
10848 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10849 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10850 }
10851 else
10852 {
10853 puDst->au64[0] = 0;
10854 puDst->au64[1] = 0;
10855 }
10856}
10857
10858IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10859{
10860 RTUINT128U uSrc1 = *puDst;
10861
10862 if (uShift <= 63)
10863 {
10864 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10865 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10866 }
10867 else
10868 {
10869 puDst->au64[0] = 0;
10870 puDst->au64[1] = 0;
10871 }
10872}
10873
10874#endif
10875
10876IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10877{
10878 RTUINT128U uSrc1 = *puSrc1;
10879
10880 if (uShift <= 63)
10881 {
10882 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10883 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10884 }
10885 else
10886 {
10887 puDst->au64[0] = 0;
10888 puDst->au64[1] = 0;
10889 }
10890}
10891
10892IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10893{
10894 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
10895}
10896
10897IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10898{
10899 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10900}
10901
10902IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10903{
10904 RTUINT256U uSrc1 = *puSrc1;
10905
10906 if (uShift <= 63)
10907 {
10908 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10909 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10910 puDst->au64[2] = uSrc1.au64[2] >> uShift;
10911 puDst->au64[3] = uSrc1.au64[3] >> uShift;
10912 }
10913 else
10914 {
10915 puDst->au64[0] = 0;
10916 puDst->au64[1] = 0;
10917 puDst->au64[2] = 0;
10918 puDst->au64[3] = 0;
10919 }
10920}
10921
10922IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10923{
10924 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10925}
10926
10927IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10928{
10929 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
10930}
10931
10932
10933/*
10934 * PSLLQ / VPSLLQ
10935 */
10936#ifdef IEM_WITHOUT_ASSEMBLY
10937
10938IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10939{
10940 RTUINT64U uSrc1 = { *puDst };
10941 RTUINT64U uSrc2 = { *puSrc };
10942 RTUINT64U uDst;
10943
10944 if (uSrc2.au64[0] <= 63)
10945 {
10946 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10947 }
10948 else
10949 {
10950 uDst.au64[0] = 0;
10951 }
10952 *puDst = uDst.u;
10953}
10954
10955
10956IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10957{
10958 RTUINT64U uSrc1 = { *puDst };
10959 RTUINT64U uDst;
10960
10961 if (uShift <= 63)
10962 {
10963 uDst.au64[0] = uSrc1.au64[0] << uShift;
10964 }
10965 else
10966 {
10967 uDst.au64[0] = 0;
10968 }
10969 *puDst = uDst.u;
10970}
10971
10972
10973IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10974{
10975 RTUINT128U uSrc1 = *puDst;
10976
10977 if (puSrc->au64[0] <= 63)
10978 {
10979 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10980 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10981 }
10982 else
10983 {
10984 puDst->au64[0] = 0;
10985 puDst->au64[1] = 0;
10986 }
10987}
10988
10989IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10990{
10991 RTUINT128U uSrc1 = *puDst;
10992
10993 if (uShift <= 63)
10994 {
10995 puDst->au64[0] = uSrc1.au64[0] << uShift;
10996 puDst->au64[1] = uSrc1.au64[1] << uShift;
10997 }
10998 else
10999 {
11000 puDst->au64[0] = 0;
11001 puDst->au64[1] = 0;
11002 }
11003}
11004
11005#endif
11006
11007IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11008{
11009 RTUINT128U uSrc1 = *puSrc1;
11010
11011 if (uShift <= 63)
11012 {
11013 puDst->au64[0] = uSrc1.au64[0] << uShift;
11014 puDst->au64[1] = uSrc1.au64[1] << uShift;
11015 }
11016 else
11017 {
11018 puDst->au64[0] = 0;
11019 puDst->au64[1] = 0;
11020 }
11021}
11022
11023IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11024{
11025 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11026}
11027
11028IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11029{
11030 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
11031}
11032
11033IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11034{
11035 RTUINT256U uSrc1 = *puSrc1;
11036
11037 if (uShift <= 63)
11038 {
11039 puDst->au64[0] = uSrc1.au64[0] << uShift;
11040 puDst->au64[1] = uSrc1.au64[1] << uShift;
11041 puDst->au64[2] = uSrc1.au64[2] << uShift;
11042 puDst->au64[3] = uSrc1.au64[3] << uShift;
11043 }
11044 else
11045 {
11046 puDst->au64[0] = 0;
11047 puDst->au64[1] = 0;
11048 puDst->au64[2] = 0;
11049 puDst->au64[3] = 0;
11050 }
11051}
11052
11053IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11054{
11055 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11056}
11057
11058IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11059{
11060 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
11061}
11062
11063
11064/*
11065 * PSRLDQ / VPSRLDQ
11066 */
11067#ifdef IEM_WITHOUT_ASSEMBLY
11068
11069IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11070{
11071 if (uShift < 16)
11072 {
11073 RTUINT128U uSrc1 = *puDst;
11074 int i;
11075
11076 for (i = 0; i < 16 - uShift; ++i)
11077 puDst->au8[i] = uSrc1.au8[i + uShift];
11078 for (i = 16 - uShift; i < 16; ++i)
11079 puDst->au8[i] = 0;
11080 }
11081 else
11082 {
11083 puDst->au64[0] = 0;
11084 puDst->au64[1] = 0;
11085 }
11086}
11087
11088IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11089{
11090 if (uShift < 16)
11091 {
11092 RTUINT128U uSrc1 = *puSrc;
11093 int i;
11094
11095 for (i = 0; i < 16 - uShift; ++i)
11096 puDst->au8[i] = uSrc1.au8[i + uShift];
11097 for (i = 16 - uShift; i < 16; ++i)
11098 puDst->au8[i] = 0;
11099 }
11100 else
11101 {
11102 puDst->au64[0] = 0;
11103 puDst->au64[1] = 0;
11104 }
11105}
11106
11107IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11108{
11109 iemAImpl_vpsrldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11110 iemAImpl_vpsrldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11111}
11112#endif
11113
11114IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11115{
11116 if (uShift < 16)
11117 {
11118 RTUINT128U uSrc1 = *puSrc;
11119 int i;
11120
11121 for (i = 0; i < 16 - uShift; ++i)
11122 puDst->au8[i] = uSrc1.au8[i + uShift];
11123 for (i = 16 - uShift; i < 16; ++i)
11124 puDst->au8[i] = 0;
11125 }
11126 else
11127 {
11128 puDst->au64[0] = 0;
11129 puDst->au64[1] = 0;
11130 }
11131}
11132
11133IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11134{
11135 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11136 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11137}
11138
11139
11140/*
11141 * PSLLDQ / VPSLLDQ
11142 */
11143#ifdef IEM_WITHOUT_ASSEMBLY
11144
11145IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11146{
11147 if (uShift < 16)
11148 {
11149 RTUINT128U uSrc1 = *puDst;
11150 int i;
11151
11152 for (i = 0; i < uShift; ++i)
11153 puDst->au8[i] = 0;
11154 for (i = uShift; i < 16; ++i)
11155 puDst->au8[i] = uSrc1.au8[i - uShift];
11156 }
11157 else
11158 {
11159 puDst->au64[0] = 0;
11160 puDst->au64[1] = 0;
11161 }
11162}
11163
11164IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11165{
11166 if (uShift < 16)
11167 {
11168 RTUINT128U uSrc1 = *puSrc;
11169 int i;
11170
11171 for (i = 0; i < uShift; ++i)
11172 puDst->au8[i] = 0;
11173 for (i = uShift; i < 16; ++i)
11174 puDst->au8[i] = uSrc1.au8[i - uShift];
11175 }
11176 else
11177 {
11178 puDst->au64[0] = 0;
11179 puDst->au64[1] = 0;
11180 }
11181}
11182
11183IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11184{
11185 iemAImpl_vpslldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11186 iemAImpl_vpslldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11187}
11188
11189#endif
11190
11191IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11192{
11193 if (uShift < 16)
11194 {
11195 RTUINT128U uSrc1 = *puSrc;
11196 int i;
11197
11198 for (i = 0; i < uShift; ++i)
11199 puDst->au8[i] = 0;
11200 for (i = uShift; i < 16; ++i)
11201 puDst->au8[i] = uSrc1.au8[i - uShift];
11202 }
11203 else
11204 {
11205 puDst->au64[0] = 0;
11206 puDst->au64[1] = 0;
11207 }
11208}
11209
11210IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11211{
11212 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11213 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11214}
11215
11216
11217/*
11218 * VPSRLVD
11219 */
11220IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11221{
11222 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11223 {
11224 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11225 }
11226}
11227
11228IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11229{
11230 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11231 {
11232 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11233 }
11234}
11235
11236
11237/*
11238 * VPSRAVD
11239 */
11240IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11241{
11242 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11243 {
11244 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11245 }
11246}
11247
11248IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11249{
11250 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11251 {
11252 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11253 }
11254}
11255
11256
11257/*
11258 * VPSLLVD
11259 */
11260IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11261{
11262 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11263 {
11264 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11265 }
11266}
11267
11268IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11269{
11270 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11271 {
11272 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11273 }
11274}
11275
11276
11277/*
11278 * VPSRLVQ
11279 */
11280IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11281{
11282 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11283 {
11284 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11285 }
11286}
11287
11288IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11289{
11290 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11291 {
11292 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11293 }
11294}
11295
11296
11297/*
11298 * VPSLLVQ
11299 */
11300IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11301{
11302 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11303 {
11304 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11305 }
11306}
11307
11308IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11309{
11310 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11311 {
11312 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11313 }
11314}
11315
11316
11317/*
11318 * PMADDWD / VPMADDWD
11319 */
11320#ifdef IEM_WITHOUT_ASSEMBLY
11321
11322IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11323{
11324 RTUINT64U uSrc1 = { *puDst };
11325 RTUINT64U uSrc2 = { *puSrc };
11326 RTUINT64U uDst;
11327
11328 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11329 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11330 *puDst = uDst.u;
11331}
11332
11333
11334IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11335{
11336 RTUINT128U uSrc1 = *puDst;
11337
11338 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11339 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11340 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11341 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11342}
11343
11344#endif
11345
11346
11347IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
11348{
11349 RTUINT64U uSrc1 = { *puDst };
11350 RTUINT64U uSrc2 = { *puSrc };
11351 RTUINT64U uDst;
11352
11353 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11354 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11355 *puDst = uDst.u;
11356}
11357
11358
11359IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11360{
11361 RTUINT128U uSrc1 = *puDst;
11362
11363 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11364 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11365 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11366 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11367}
11368
11369
11370IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11371{
11372 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11373 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11374 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11375 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11376}
11377
11378
11379IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11380{
11381 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11382 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11383 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11384 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11385 puDst->ai32[4] = (int32_t)puSrc1->ai16[8] * puSrc2->ai16[8] + (int32_t)puSrc1->ai16[9] * puSrc2->ai16[9];
11386 puDst->ai32[5] = (int32_t)puSrc1->ai16[10] * puSrc2->ai16[10] + (int32_t)puSrc1->ai16[11] * puSrc2->ai16[11];
11387 puDst->ai32[6] = (int32_t)puSrc1->ai16[12] * puSrc2->ai16[12] + (int32_t)puSrc1->ai16[13] * puSrc2->ai16[13];
11388 puDst->ai32[7] = (int32_t)puSrc1->ai16[14] * puSrc2->ai16[14] + (int32_t)puSrc1->ai16[15] * puSrc2->ai16[15];
11389}
11390
11391
11392/*
11393 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11394 */
11395#ifdef IEM_WITHOUT_ASSEMBLY
11396
11397IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11398{
11399 RTUINT64U uSrc1 = { *puDst };
11400 RTUINT64U uSrc2 = { *puSrc };
11401 RTUINT64U uDst;
11402
11403 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11404 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11405 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11406 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11407 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11408 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11409 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11410 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11411 *puDst = uDst.u;
11412}
11413
11414
11415IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11416{
11417 RTUINT128U uSrc1 = *puDst;
11418
11419 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11420 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11421 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11422 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11423 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11424 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11425 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11426 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11427 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11428 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11429 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11430 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11431 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11432 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11433 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11434 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11435}
11436
11437#endif
11438
11439
11440IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11441{
11442 RTUINT128U uSrc1 = *puDst;
11443
11444 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11445 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11446 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11447 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11448 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11449 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11450 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11451 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11452}
11453
11454
11455IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11456{
11457 RTUINT128U uSrc1 = *puDst;
11458
11459 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11460 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11461 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11462 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11463}
11464
11465
11466IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11467{
11468 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11469 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11470 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11471 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11472 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11473 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11474 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11475 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11476 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11477 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11478 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11479 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11480 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11481 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11482 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11483 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11484}
11485
11486
11487IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11488{
11489 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11490 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11491 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11492 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11493 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11494 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11495 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11496 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11497 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11498 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11499 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11500 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11501 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11502 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11503 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11504 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11505 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11506 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11507 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11508 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11509 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11510 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11511 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11512 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11513 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11514 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11515 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11516 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11517 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11518 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11519 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11520 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11521}
11522
11523
11524IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11525{
11526 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11527 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11528 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11529 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11530 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11531 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11532 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11533 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11534}
11535
11536
11537IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11538{
11539 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11540 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11541 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11542 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11543 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11544 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11545 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11546 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11547 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11548 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11549 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11550 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11551 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11552 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11553 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11554 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11555}
11556
11557
11558IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11559{
11560 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11561 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11562 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11563 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11564}
11565
11566
11567IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11568{
11569 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11570 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11571 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11572 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11573 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11574 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11575 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11576 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11577}
11578
11579
11580/*
11581 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11582 */
11583#ifdef IEM_WITHOUT_ASSEMBLY
11584
11585IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11586{
11587 RTUINT64U uSrc1 = { *puDst };
11588 RTUINT64U uSrc2 = { *puSrc };
11589 RTUINT64U uDst;
11590
11591 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11592 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11593 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11594 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11595 *puDst = uDst.u;
11596}
11597
11598
11599IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11600{
11601 RTUINT128U uSrc1 = *puDst;
11602
11603 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11604 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11605 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11606 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11607 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11608 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11609 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11610 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11611}
11612
11613#endif
11614
11615IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11616{
11617 RTUINT128U uSrc1 = *puDst;
11618
11619 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11620 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11621 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11622 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11623 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11624 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11625 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11626 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11627 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11628 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11629 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11630 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11631 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11632 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11633 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11634 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11635}
11636
11637
11638IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11639{
11640 RTUINT128U uSrc1 = *puDst;
11641
11642 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11643 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11644 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11645 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11646}
11647
11648
11649IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11650{
11651 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11652 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11653 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11654 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11655 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11656 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11657 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11658 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11659 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11660 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11661 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11662 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11663 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11664 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11665 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11666 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11667}
11668
11669
11670IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11671{
11672 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11673 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11674 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11675 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11676 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11677 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11678 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11679 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11680 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11681 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11682 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11683 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11684 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11685 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11686 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11687 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11688 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11689 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11690 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11691 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11692 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11693 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11694 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11695 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11696 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11697 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11698 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11699 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11700 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11701 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11702 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11703 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11704}
11705
11706
11707IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11708{
11709 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11710 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11711 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11712 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11713 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11714 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11715 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11716 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11717}
11718
11719
11720IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11721{
11722 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11723 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11724 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11725 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11726 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11727 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11728 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11729 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11730 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11731 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11732 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11733 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11734 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11735 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11736 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11737 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11738}
11739
11740
11741IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11742{
11743 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11744 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11745 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11746 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11747}
11748
11749
11750IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11751{
11752 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11753 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11754 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11755 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11756 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11757 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11758 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11759 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11760}
11761
11762
11763/*
11764 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11765 */
11766#ifdef IEM_WITHOUT_ASSEMBLY
11767
11768IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11769{
11770 RTUINT64U uSrc1 = { *puDst };
11771 RTUINT64U uSrc2 = { *puSrc };
11772 RTUINT64U uDst;
11773
11774 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11775 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11776 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11777 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11778 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11779 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11780 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11781 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11782 *puDst = uDst.u;
11783}
11784
11785
11786IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11787{
11788 RTUINT128U uSrc1 = *puDst;
11789
11790 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11791 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11792 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11793 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11794 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11795 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11796 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11797 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11798 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11799 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11800 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11801 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11802 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11803 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11804 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11805 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11806}
11807
11808#endif
11809
11810IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11811{
11812 RTUINT128U uSrc1 = *puDst;
11813
11814 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11815 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11816 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11817 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11818 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11819 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11820 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11821 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11822}
11823
11824
11825IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11826{
11827 RTUINT128U uSrc1 = *puDst;
11828
11829 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11830 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11831 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11832 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11833}
11834
11835
11836IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11837{
11838 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11839 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11840 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11841 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11842 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11843 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11844 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11845 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11846 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11847 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11848 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11849 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11850 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11851 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11852 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11853 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11854}
11855
11856
11857IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11858{
11859 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11860 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11861 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11862 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11863 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11864 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11865 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11866 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11867 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11868 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11869 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11870 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11871 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11872 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11873 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11874 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11875 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11876 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11877 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11878 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11879 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11880 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11881 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11882 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11883 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11884 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11885 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11886 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11887 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11888 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11889 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11890 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11891}
11892
11893
11894IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11895{
11896 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11897 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11898 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11899 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11900 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11901 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11902 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11903 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11904}
11905
11906
11907IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11908{
11909 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11910 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11911 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11912 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11913 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11914 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11915 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11916 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11917 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11918 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11919 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11920 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11921 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11922 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11923 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11924 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11925}
11926
11927
11928IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11929{
11930 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11931 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11932 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11933 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11934}
11935
11936
11937IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11938{
11939 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11940 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11941 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11942 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11943 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11944 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11945 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11946 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11947}
11948
11949
11950/*
11951 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11952 */
11953#ifdef IEM_WITHOUT_ASSEMBLY
11954
11955IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11956{
11957 RTUINT64U uSrc1 = { *puDst };
11958 RTUINT64U uSrc2 = { *puSrc };
11959 RTUINT64U uDst;
11960
11961 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11962 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11963 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11964 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11965 *puDst = uDst.u;
11966}
11967
11968
11969IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11970{
11971 RTUINT128U uSrc1 = *puDst;
11972
11973 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11974 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11975 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11976 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11977 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11978 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11979 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11980 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11981}
11982
11983#endif
11984
11985IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11986{
11987 RTUINT128U uSrc1 = *puDst;
11988
11989 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11990 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11991 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11992 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11993 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11994 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11995 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11996 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11997 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11998 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11999 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
12000 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
12001 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
12002 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
12003 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
12004 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
12005}
12006
12007
12008IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12009{
12010 RTUINT128U uSrc1 = *puDst;
12011
12012 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
12013 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
12014 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
12015 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
12016}
12017
12018
12019IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12020{
12021 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12022 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12023 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12024 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12025 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12026 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12027 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12028 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12029 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12030 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12031 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12032 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12033 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12034 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12035 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12036 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12037}
12038
12039
12040IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12041{
12042 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12043 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12044 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12045 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12046 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12047 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12048 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12049 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12050 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12051 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12052 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12053 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12054 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12055 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12056 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12057 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12058 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
12059 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
12060 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
12061 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
12062 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
12063 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
12064 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
12065 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
12066 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
12067 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
12068 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
12069 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
12070 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
12071 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
12072 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
12073 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
12074}
12075
12076
12077IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12078{
12079 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12080 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12081 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12082 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12083 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12084 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12085 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12086 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12087}
12088
12089
12090IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12091{
12092 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12093 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12094 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12095 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12096 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12097 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12098 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12099 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12100 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12101 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12102 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12103 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12104 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12105 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12106 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12107 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12108}
12109
12110
12111IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12112{
12113 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12114 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12115 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12116 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12117}
12118
12119
12120IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12121{
12122 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12123 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12124 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12125 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12126 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12127 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12128 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12129 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12130}
12131
12132
12133/*
12134 * PAVGB / VPAVGB / PAVGW / VPAVGW
12135 */
/** Byte average: (left + right + 1) >> 1.  The left operand is widened to
 *  uint16_t before the addition and the shifted sum is narrowed to uint8_t. */
#define PAVGB_EXEC(a_uLeft, a_uRight) \
    ( (uint8_t)( ((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
/** Word average: (left + right + 1) >> 1.  The left operand is widened to
 *  uint32_t before the addition and the shifted sum is narrowed to uint16_t. */
#define PAVGW_EXEC(a_uLeft, a_uRight) \
    ( (uint16_t)( ((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
12138
12139#ifdef IEM_WITHOUT_ASSEMBLY
12140
12141IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12142{
12143 RTUINT64U uSrc1 = { *puDst };
12144 RTUINT64U uSrc2 = { *puSrc };
12145 RTUINT64U uDst;
12146
12147 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12148 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12149 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12150 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12151 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12152 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12153 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12154 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12155 *puDst = uDst.u;
12156}
12157
12158
12159IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12160{
12161 RTUINT128U uSrc1 = *puDst;
12162
12163 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12164 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12165 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12166 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12167 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12168 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12169 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12170 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12171 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12172 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12173 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12174 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12175 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12176 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12177 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12178 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12179}
12180
12181
12182IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12183{
12184 RTUINT64U uSrc1 = { *puDst };
12185 RTUINT64U uSrc2 = { *puSrc };
12186 RTUINT64U uDst;
12187
12188 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12189 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12190 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12191 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12192 *puDst = uDst.u;
12193}
12194
12195
12196IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12197{
12198 RTUINT128U uSrc1 = *puDst;
12199
12200 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12201 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12202 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12203 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12204 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12205 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12206 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12207 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12208}
12209
12210#endif
12211
12212IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12213{
12214 RTUINT128U uSrc1 = *puDst;
12215
12216 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12217 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12218 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12219 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12220 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12221 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12222 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12223 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12224 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12225 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12226 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12227 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12228 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12229 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12230 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12231 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12232}
12233
12234
12235IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12236{
12237 RTUINT128U uSrc1 = *puDst;
12238
12239 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12240 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12241 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12242 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12243 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12244 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12245 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12246 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12247 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12248 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12249 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12250 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12251 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12252 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12253 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12254 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12255}
12256
12257
12258IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12259{
12260 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12261 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12262 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12263 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12264 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12265 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12266 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12267 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12268 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12269 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12270 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12271 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12272 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12273 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12274 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12275 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12276}
12277
12278
12279IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12280{
12281 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12282 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12283 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12284 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12285 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12286 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12287 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12288 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12289 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12290 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12291 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12292 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12293 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12294 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12295 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12296 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12297 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12298 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12299 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12300 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12301 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12302 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12303 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12304 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12305 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12306 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12307 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12308 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12309 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12310 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12311 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12312 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12313}
12314
12315
12316IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12317{
12318 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12319 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12320 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12321 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12322 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12323 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12324 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12325 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12326}
12327
12328
12329IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12330{
12331 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12332 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12333 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12334 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12335 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12336 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12337 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12338 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12339 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12340 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12341 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12342 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12343 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12344 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12345 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12346 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12347}
12348
12349#undef PAVGB_EXEC
12350#undef PAVGW_EXEC
12351
12352
12353/*
12354 * PMOVMSKB / VPMOVMSKB
12355 */
12356#ifdef IEM_WITHOUT_ASSEMBLY
12357
12358IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12359{
12360 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12361 uint64_t const uSrc = *pu64Src;
12362 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12363 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12364 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12365 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12366 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12367 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12368 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12369 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12370}
12371
12372
12373IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12374{
12375 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12376 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12377 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12378 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12379 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12380 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12381 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12382 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12383 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12384 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12385 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12386 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12387 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12388 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12389 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12390 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12391 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12392 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12393 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12394}
12395
12396#endif
12397
12398IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12399{
12400 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12401 uint64_t const uSrc0 = puSrc->QWords.qw0;
12402 uint64_t const uSrc1 = puSrc->QWords.qw1;
12403 uint64_t const uSrc2 = puSrc->QWords.qw2;
12404 uint64_t const uSrc3 = puSrc->QWords.qw3;
12405 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12406 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12407 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12408 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12409 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12410 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12411 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12412 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12413 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12414 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12415 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12416 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12417 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12418 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12419 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12420 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12421 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12422 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12423 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12424 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12425 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12426 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12427 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12428 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12429 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12430 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12431 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12432 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12433 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12434 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12435 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12436 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12437}
12438
12439
12440/*
12441 * [V]PSHUFB
12442 */
12443
12444IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
12445{
12446 RTUINT64U const uSrc = { *puSrc };
12447 RTUINT64U const uDstIn = { *puDst };
12448 ASMCompilerBarrier();
12449 RTUINT64U uDstOut = { 0 };
12450 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12451 {
12452 uint8_t idxSrc = uSrc.au8[iByte];
12453 if (!(idxSrc & 0x80))
12454 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12455 }
12456 *puDst = uDstOut.u;
12457}
12458
12459
12460IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12461{
12462 RTUINT128U const uSrc = *puSrc;
12463 RTUINT128U const uDstIn = *puDst;
12464 ASMCompilerBarrier();
12465 puDst->au64[0] = 0;
12466 puDst->au64[1] = 0;
12467 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12468 {
12469 uint8_t idxSrc = uSrc.au8[iByte];
12470 if (!(idxSrc & 0x80))
12471 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12472 }
12473}
12474
12475
12476IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12477{
12478 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12479 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12480 ASMCompilerBarrier();
12481 puDst->au64[0] = 0;
12482 puDst->au64[1] = 0;
12483 for (unsigned iByte = 0; iByte < 16; iByte++)
12484 {
12485 uint8_t idxSrc = uSrc2.au8[iByte];
12486 if (!(idxSrc & 0x80))
12487 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12488 }
12489}
12490
12491
12492IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12493{
12494 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12495 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12496 ASMCompilerBarrier();
12497 puDst->au64[0] = 0;
12498 puDst->au64[1] = 0;
12499 puDst->au64[2] = 0;
12500 puDst->au64[3] = 0;
12501 for (unsigned iByte = 0; iByte < 16; iByte++)
12502 {
12503 uint8_t idxSrc = uSrc2.au8[iByte];
12504 if (!(idxSrc & 0x80))
12505 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12506 }
12507 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12508 {
12509 uint8_t idxSrc = uSrc2.au8[iByte];
12510 if (!(idxSrc & 0x80))
12511 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12512 }
12513}
12514
12515
12516/*
12517 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12518 */
12519#ifdef IEM_WITHOUT_ASSEMBLY
12520
12521IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12522{
12523 uint64_t const uSrc = *puSrc;
12524 ASMCompilerBarrier();
12525 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12526 uSrc >> (((bEvil >> 2) & 3) * 16),
12527 uSrc >> (((bEvil >> 4) & 3) * 16),
12528 uSrc >> (((bEvil >> 6) & 3) * 16));
12529}
12530
12531
12532IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12533{
12534 puDst->QWords.qw0 = puSrc->QWords.qw0;
12535 uint64_t const uSrc = puSrc->QWords.qw1;
12536 ASMCompilerBarrier();
12537 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12538 uSrc >> (((bEvil >> 2) & 3) * 16),
12539 uSrc >> (((bEvil >> 4) & 3) * 16),
12540 uSrc >> (((bEvil >> 6) & 3) * 16));
12541}
12542
12543#endif
12544
12545IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12546{
12547 puDst->QWords.qw0 = puSrc->QWords.qw0;
12548 uint64_t const uSrc1 = puSrc->QWords.qw1;
12549 puDst->QWords.qw2 = puSrc->QWords.qw2;
12550 uint64_t const uSrc3 = puSrc->QWords.qw3;
12551 ASMCompilerBarrier();
12552 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12553 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12554 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12555 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12556 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12557 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12558 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12559 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12560}
12561
12562#ifdef IEM_WITHOUT_ASSEMBLY
12563IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12564{
12565 puDst->QWords.qw1 = puSrc->QWords.qw1;
12566 uint64_t const uSrc = puSrc->QWords.qw0;
12567 ASMCompilerBarrier();
12568 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12569 uSrc >> (((bEvil >> 2) & 3) * 16),
12570 uSrc >> (((bEvil >> 4) & 3) * 16),
12571 uSrc >> (((bEvil >> 6) & 3) * 16));
12572
12573}
12574#endif
12575
12576
12577IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12578{
12579 puDst->QWords.qw3 = puSrc->QWords.qw3;
12580 uint64_t const uSrc2 = puSrc->QWords.qw2;
12581 puDst->QWords.qw1 = puSrc->QWords.qw1;
12582 uint64_t const uSrc0 = puSrc->QWords.qw0;
12583 ASMCompilerBarrier();
12584 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12585 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12586 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12587 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12588 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12589 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12590 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12591 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12592
12593}
12594
12595
12596#ifdef IEM_WITHOUT_ASSEMBLY
12597IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12598{
12599 RTUINT128U const uSrc = *puSrc;
12600 ASMCompilerBarrier();
12601 puDst->au32[0] = uSrc.au32[bEvil & 3];
12602 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12603 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12604 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12605}
12606#endif
12607
12608
12609IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12610{
12611 RTUINT256U const uSrc = *puSrc;
12612 ASMCompilerBarrier();
12613 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12614 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12615 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12616 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12617 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12618 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12619 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12620 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12621}
12622
12623
12624/*
12625 * PUNPCKHBW - high bytes -> words
12626 */
12627#ifdef IEM_WITHOUT_ASSEMBLY
12628
12629IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12630{
12631 RTUINT64U const uSrc2 = { *puSrc };
12632 RTUINT64U const uSrc1 = { *puDst };
12633 ASMCompilerBarrier();
12634 RTUINT64U uDstOut;
12635 uDstOut.au8[0] = uSrc1.au8[4];
12636 uDstOut.au8[1] = uSrc2.au8[4];
12637 uDstOut.au8[2] = uSrc1.au8[5];
12638 uDstOut.au8[3] = uSrc2.au8[5];
12639 uDstOut.au8[4] = uSrc1.au8[6];
12640 uDstOut.au8[5] = uSrc2.au8[6];
12641 uDstOut.au8[6] = uSrc1.au8[7];
12642 uDstOut.au8[7] = uSrc2.au8[7];
12643 *puDst = uDstOut.u;
12644}
12645
12646
12647IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12648{
12649 RTUINT128U const uSrc2 = *puSrc;
12650 RTUINT128U const uSrc1 = *puDst;
12651 ASMCompilerBarrier();
12652 RTUINT128U uDstOut;
12653 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12654 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12655 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12656 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12657 uDstOut.au8[ 4] = uSrc1.au8[10];
12658 uDstOut.au8[ 5] = uSrc2.au8[10];
12659 uDstOut.au8[ 6] = uSrc1.au8[11];
12660 uDstOut.au8[ 7] = uSrc2.au8[11];
12661 uDstOut.au8[ 8] = uSrc1.au8[12];
12662 uDstOut.au8[ 9] = uSrc2.au8[12];
12663 uDstOut.au8[10] = uSrc1.au8[13];
12664 uDstOut.au8[11] = uSrc2.au8[13];
12665 uDstOut.au8[12] = uSrc1.au8[14];
12666 uDstOut.au8[13] = uSrc2.au8[14];
12667 uDstOut.au8[14] = uSrc1.au8[15];
12668 uDstOut.au8[15] = uSrc2.au8[15];
12669 *puDst = uDstOut;
12670}
12671
12672#endif
12673
12674IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12675{
12676 RTUINT128U const uSrc2 = *puSrc2;
12677 RTUINT128U const uSrc1 = *puSrc1;
12678 ASMCompilerBarrier();
12679 RTUINT128U uDstOut;
12680 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12681 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12682 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12683 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12684 uDstOut.au8[ 4] = uSrc1.au8[10];
12685 uDstOut.au8[ 5] = uSrc2.au8[10];
12686 uDstOut.au8[ 6] = uSrc1.au8[11];
12687 uDstOut.au8[ 7] = uSrc2.au8[11];
12688 uDstOut.au8[ 8] = uSrc1.au8[12];
12689 uDstOut.au8[ 9] = uSrc2.au8[12];
12690 uDstOut.au8[10] = uSrc1.au8[13];
12691 uDstOut.au8[11] = uSrc2.au8[13];
12692 uDstOut.au8[12] = uSrc1.au8[14];
12693 uDstOut.au8[13] = uSrc2.au8[14];
12694 uDstOut.au8[14] = uSrc1.au8[15];
12695 uDstOut.au8[15] = uSrc2.au8[15];
12696 *puDst = uDstOut;
12697}
12698
12699
12700IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12701{
12702 RTUINT256U const uSrc2 = *puSrc2;
12703 RTUINT256U const uSrc1 = *puSrc1;
12704 ASMCompilerBarrier();
12705 RTUINT256U uDstOut;
12706 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12707 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12708 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12709 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12710 uDstOut.au8[ 4] = uSrc1.au8[10];
12711 uDstOut.au8[ 5] = uSrc2.au8[10];
12712 uDstOut.au8[ 6] = uSrc1.au8[11];
12713 uDstOut.au8[ 7] = uSrc2.au8[11];
12714 uDstOut.au8[ 8] = uSrc1.au8[12];
12715 uDstOut.au8[ 9] = uSrc2.au8[12];
12716 uDstOut.au8[10] = uSrc1.au8[13];
12717 uDstOut.au8[11] = uSrc2.au8[13];
12718 uDstOut.au8[12] = uSrc1.au8[14];
12719 uDstOut.au8[13] = uSrc2.au8[14];
12720 uDstOut.au8[14] = uSrc1.au8[15];
12721 uDstOut.au8[15] = uSrc2.au8[15];
12722 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12723 uDstOut.au8[16] = uSrc1.au8[24];
12724 uDstOut.au8[17] = uSrc2.au8[24];
12725 uDstOut.au8[18] = uSrc1.au8[25];
12726 uDstOut.au8[19] = uSrc2.au8[25];
12727 uDstOut.au8[20] = uSrc1.au8[26];
12728 uDstOut.au8[21] = uSrc2.au8[26];
12729 uDstOut.au8[22] = uSrc1.au8[27];
12730 uDstOut.au8[23] = uSrc2.au8[27];
12731 uDstOut.au8[24] = uSrc1.au8[28];
12732 uDstOut.au8[25] = uSrc2.au8[28];
12733 uDstOut.au8[26] = uSrc1.au8[29];
12734 uDstOut.au8[27] = uSrc2.au8[29];
12735 uDstOut.au8[28] = uSrc1.au8[30];
12736 uDstOut.au8[29] = uSrc2.au8[30];
12737 uDstOut.au8[30] = uSrc1.au8[31];
12738 uDstOut.au8[31] = uSrc2.au8[31];
12739 *puDst = uDstOut;
12740}
12741
12742
12743/*
 * PUNPCKHWD - high words -> dwords
12745 */
12746#ifdef IEM_WITHOUT_ASSEMBLY
12747
12748IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12749{
12750 RTUINT64U const uSrc2 = { *puSrc };
12751 RTUINT64U const uSrc1 = { *puDst };
12752 ASMCompilerBarrier();
12753 RTUINT64U uDstOut;
12754 uDstOut.au16[0] = uSrc1.au16[2];
12755 uDstOut.au16[1] = uSrc2.au16[2];
12756 uDstOut.au16[2] = uSrc1.au16[3];
12757 uDstOut.au16[3] = uSrc2.au16[3];
12758 *puDst = uDstOut.u;
12759}
12760
12761
12762IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12763{
12764 RTUINT128U const uSrc2 = *puSrc;
12765 RTUINT128U const uSrc1 = *puDst;
12766 ASMCompilerBarrier();
12767 RTUINT128U uDstOut;
12768 uDstOut.au16[0] = uSrc1.au16[4];
12769 uDstOut.au16[1] = uSrc2.au16[4];
12770 uDstOut.au16[2] = uSrc1.au16[5];
12771 uDstOut.au16[3] = uSrc2.au16[5];
12772 uDstOut.au16[4] = uSrc1.au16[6];
12773 uDstOut.au16[5] = uSrc2.au16[6];
12774 uDstOut.au16[6] = uSrc1.au16[7];
12775 uDstOut.au16[7] = uSrc2.au16[7];
12776 *puDst = uDstOut;
12777}
12778
12779#endif
12780
12781IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12782{
12783 RTUINT128U const uSrc2 = *puSrc2;
12784 RTUINT128U const uSrc1 = *puSrc1;
12785 ASMCompilerBarrier();
12786 RTUINT128U uDstOut;
12787 uDstOut.au16[0] = uSrc1.au16[4];
12788 uDstOut.au16[1] = uSrc2.au16[4];
12789 uDstOut.au16[2] = uSrc1.au16[5];
12790 uDstOut.au16[3] = uSrc2.au16[5];
12791 uDstOut.au16[4] = uSrc1.au16[6];
12792 uDstOut.au16[5] = uSrc2.au16[6];
12793 uDstOut.au16[6] = uSrc1.au16[7];
12794 uDstOut.au16[7] = uSrc2.au16[7];
12795 *puDst = uDstOut;
12796}
12797
12798
12799IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12800{
12801 RTUINT256U const uSrc2 = *puSrc2;
12802 RTUINT256U const uSrc1 = *puSrc1;
12803 ASMCompilerBarrier();
12804 RTUINT256U uDstOut;
12805 uDstOut.au16[0] = uSrc1.au16[4];
12806 uDstOut.au16[1] = uSrc2.au16[4];
12807 uDstOut.au16[2] = uSrc1.au16[5];
12808 uDstOut.au16[3] = uSrc2.au16[5];
12809 uDstOut.au16[4] = uSrc1.au16[6];
12810 uDstOut.au16[5] = uSrc2.au16[6];
12811 uDstOut.au16[6] = uSrc1.au16[7];
12812 uDstOut.au16[7] = uSrc2.au16[7];
12813
12814 uDstOut.au16[8] = uSrc1.au16[12];
12815 uDstOut.au16[9] = uSrc2.au16[12];
12816 uDstOut.au16[10] = uSrc1.au16[13];
12817 uDstOut.au16[11] = uSrc2.au16[13];
12818 uDstOut.au16[12] = uSrc1.au16[14];
12819 uDstOut.au16[13] = uSrc2.au16[14];
12820 uDstOut.au16[14] = uSrc1.au16[15];
12821 uDstOut.au16[15] = uSrc2.au16[15];
12822 *puDst = uDstOut;
12823}
12824
12825
12826/*
 * PUNPCKHDQ - high dwords -> qword(s)
12828 */
12829#ifdef IEM_WITHOUT_ASSEMBLY
12830
12831IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12832{
12833 RTUINT64U const uSrc2 = { *puSrc };
12834 RTUINT64U const uSrc1 = { *puDst };
12835 ASMCompilerBarrier();
12836 RTUINT64U uDstOut;
12837 uDstOut.au32[0] = uSrc1.au32[1];
12838 uDstOut.au32[1] = uSrc2.au32[1];
12839 *puDst = uDstOut.u;
12840}
12841
12842
12843IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12844{
12845 RTUINT128U const uSrc2 = *puSrc;
12846 RTUINT128U const uSrc1 = *puDst;
12847 ASMCompilerBarrier();
12848 RTUINT128U uDstOut;
12849 uDstOut.au32[0] = uSrc1.au32[2];
12850 uDstOut.au32[1] = uSrc2.au32[2];
12851 uDstOut.au32[2] = uSrc1.au32[3];
12852 uDstOut.au32[3] = uSrc2.au32[3];
12853 *puDst = uDstOut;
12854}
12855
12856#endif
12857
12858IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12859{
12860 RTUINT128U const uSrc2 = *puSrc2;
12861 RTUINT128U const uSrc1 = *puSrc1;
12862 ASMCompilerBarrier();
12863 RTUINT128U uDstOut;
12864 uDstOut.au32[0] = uSrc1.au32[2];
12865 uDstOut.au32[1] = uSrc2.au32[2];
12866 uDstOut.au32[2] = uSrc1.au32[3];
12867 uDstOut.au32[3] = uSrc2.au32[3];
12868 *puDst = uDstOut;
12869}
12870
12871
12872IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12873{
12874 RTUINT256U const uSrc2 = *puSrc2;
12875 RTUINT256U const uSrc1 = *puSrc1;
12876 ASMCompilerBarrier();
12877 RTUINT256U uDstOut;
12878 uDstOut.au32[0] = uSrc1.au32[2];
12879 uDstOut.au32[1] = uSrc2.au32[2];
12880 uDstOut.au32[2] = uSrc1.au32[3];
12881 uDstOut.au32[3] = uSrc2.au32[3];
12882
12883 uDstOut.au32[4] = uSrc1.au32[6];
12884 uDstOut.au32[5] = uSrc2.au32[6];
12885 uDstOut.au32[6] = uSrc1.au32[7];
12886 uDstOut.au32[7] = uSrc2.au32[7];
12887 *puDst = uDstOut;
12888}
12889
12890
12891/*
12892 * PUNPCKHQDQ -> High qwords -> double qword(s).
12893 */
12894#ifdef IEM_WITHOUT_ASSEMBLY
12895IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12896{
12897 RTUINT128U const uSrc2 = *puSrc;
12898 RTUINT128U const uSrc1 = *puDst;
12899 ASMCompilerBarrier();
12900 RTUINT128U uDstOut;
12901 uDstOut.au64[0] = uSrc1.au64[1];
12902 uDstOut.au64[1] = uSrc2.au64[1];
12903 *puDst = uDstOut;
12904}
12905#endif
12906
12907
12908IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12909{
12910 RTUINT128U const uSrc2 = *puSrc2;
12911 RTUINT128U const uSrc1 = *puSrc1;
12912 ASMCompilerBarrier();
12913 RTUINT128U uDstOut;
12914 uDstOut.au64[0] = uSrc1.au64[1];
12915 uDstOut.au64[1] = uSrc2.au64[1];
12916 *puDst = uDstOut;
12917}
12918
12919
12920IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12921{
12922 RTUINT256U const uSrc2 = *puSrc2;
12923 RTUINT256U const uSrc1 = *puSrc1;
12924 ASMCompilerBarrier();
12925 RTUINT256U uDstOut;
12926 uDstOut.au64[0] = uSrc1.au64[1];
12927 uDstOut.au64[1] = uSrc2.au64[1];
12928
12929 uDstOut.au64[2] = uSrc1.au64[3];
12930 uDstOut.au64[3] = uSrc2.au64[3];
12931 *puDst = uDstOut;
12932}
12933
12934
12935/*
12936 * PUNPCKLBW - low bytes -> words
12937 */
12938#ifdef IEM_WITHOUT_ASSEMBLY
12939
12940IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12941{
12942 RTUINT64U const uSrc2 = { *puSrc };
12943 RTUINT64U const uSrc1 = { *puDst };
12944 ASMCompilerBarrier();
12945 RTUINT64U uDstOut;
12946 uDstOut.au8[0] = uSrc1.au8[0];
12947 uDstOut.au8[1] = uSrc2.au8[0];
12948 uDstOut.au8[2] = uSrc1.au8[1];
12949 uDstOut.au8[3] = uSrc2.au8[1];
12950 uDstOut.au8[4] = uSrc1.au8[2];
12951 uDstOut.au8[5] = uSrc2.au8[2];
12952 uDstOut.au8[6] = uSrc1.au8[3];
12953 uDstOut.au8[7] = uSrc2.au8[3];
12954 *puDst = uDstOut.u;
12955}
12956
12957
12958IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12959{
12960 RTUINT128U const uSrc2 = *puSrc;
12961 RTUINT128U const uSrc1 = *puDst;
12962 ASMCompilerBarrier();
12963 RTUINT128U uDstOut;
12964 uDstOut.au8[ 0] = uSrc1.au8[0];
12965 uDstOut.au8[ 1] = uSrc2.au8[0];
12966 uDstOut.au8[ 2] = uSrc1.au8[1];
12967 uDstOut.au8[ 3] = uSrc2.au8[1];
12968 uDstOut.au8[ 4] = uSrc1.au8[2];
12969 uDstOut.au8[ 5] = uSrc2.au8[2];
12970 uDstOut.au8[ 6] = uSrc1.au8[3];
12971 uDstOut.au8[ 7] = uSrc2.au8[3];
12972 uDstOut.au8[ 8] = uSrc1.au8[4];
12973 uDstOut.au8[ 9] = uSrc2.au8[4];
12974 uDstOut.au8[10] = uSrc1.au8[5];
12975 uDstOut.au8[11] = uSrc2.au8[5];
12976 uDstOut.au8[12] = uSrc1.au8[6];
12977 uDstOut.au8[13] = uSrc2.au8[6];
12978 uDstOut.au8[14] = uSrc1.au8[7];
12979 uDstOut.au8[15] = uSrc2.au8[7];
12980 *puDst = uDstOut;
12981}
12982
12983#endif
12984
12985IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12986{
12987 RTUINT128U const uSrc2 = *puSrc2;
12988 RTUINT128U const uSrc1 = *puSrc1;
12989 ASMCompilerBarrier();
12990 RTUINT128U uDstOut;
12991 uDstOut.au8[ 0] = uSrc1.au8[0];
12992 uDstOut.au8[ 1] = uSrc2.au8[0];
12993 uDstOut.au8[ 2] = uSrc1.au8[1];
12994 uDstOut.au8[ 3] = uSrc2.au8[1];
12995 uDstOut.au8[ 4] = uSrc1.au8[2];
12996 uDstOut.au8[ 5] = uSrc2.au8[2];
12997 uDstOut.au8[ 6] = uSrc1.au8[3];
12998 uDstOut.au8[ 7] = uSrc2.au8[3];
12999 uDstOut.au8[ 8] = uSrc1.au8[4];
13000 uDstOut.au8[ 9] = uSrc2.au8[4];
13001 uDstOut.au8[10] = uSrc1.au8[5];
13002 uDstOut.au8[11] = uSrc2.au8[5];
13003 uDstOut.au8[12] = uSrc1.au8[6];
13004 uDstOut.au8[13] = uSrc2.au8[6];
13005 uDstOut.au8[14] = uSrc1.au8[7];
13006 uDstOut.au8[15] = uSrc2.au8[7];
13007 *puDst = uDstOut;
13008}
13009
13010
13011IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13012{
13013 RTUINT256U const uSrc2 = *puSrc2;
13014 RTUINT256U const uSrc1 = *puSrc1;
13015 ASMCompilerBarrier();
13016 RTUINT256U uDstOut;
13017 uDstOut.au8[ 0] = uSrc1.au8[0];
13018 uDstOut.au8[ 1] = uSrc2.au8[0];
13019 uDstOut.au8[ 2] = uSrc1.au8[1];
13020 uDstOut.au8[ 3] = uSrc2.au8[1];
13021 uDstOut.au8[ 4] = uSrc1.au8[2];
13022 uDstOut.au8[ 5] = uSrc2.au8[2];
13023 uDstOut.au8[ 6] = uSrc1.au8[3];
13024 uDstOut.au8[ 7] = uSrc2.au8[3];
13025 uDstOut.au8[ 8] = uSrc1.au8[4];
13026 uDstOut.au8[ 9] = uSrc2.au8[4];
13027 uDstOut.au8[10] = uSrc1.au8[5];
13028 uDstOut.au8[11] = uSrc2.au8[5];
13029 uDstOut.au8[12] = uSrc1.au8[6];
13030 uDstOut.au8[13] = uSrc2.au8[6];
13031 uDstOut.au8[14] = uSrc1.au8[7];
13032 uDstOut.au8[15] = uSrc2.au8[7];
13033 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
13034 uDstOut.au8[16] = uSrc1.au8[16];
13035 uDstOut.au8[17] = uSrc2.au8[16];
13036 uDstOut.au8[18] = uSrc1.au8[17];
13037 uDstOut.au8[19] = uSrc2.au8[17];
13038 uDstOut.au8[20] = uSrc1.au8[18];
13039 uDstOut.au8[21] = uSrc2.au8[18];
13040 uDstOut.au8[22] = uSrc1.au8[19];
13041 uDstOut.au8[23] = uSrc2.au8[19];
13042 uDstOut.au8[24] = uSrc1.au8[20];
13043 uDstOut.au8[25] = uSrc2.au8[20];
13044 uDstOut.au8[26] = uSrc1.au8[21];
13045 uDstOut.au8[27] = uSrc2.au8[21];
13046 uDstOut.au8[28] = uSrc1.au8[22];
13047 uDstOut.au8[29] = uSrc2.au8[22];
13048 uDstOut.au8[30] = uSrc1.au8[23];
13049 uDstOut.au8[31] = uSrc2.au8[23];
13050 *puDst = uDstOut;
13051}
13052
13053
13054/*
 * PUNPCKLWD - low words -> dwords
13056 */
13057#ifdef IEM_WITHOUT_ASSEMBLY
13058
13059IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
13060{
13061 RTUINT64U const uSrc2 = { *puSrc };
13062 RTUINT64U const uSrc1 = { *puDst };
13063 ASMCompilerBarrier();
13064 RTUINT64U uDstOut;
13065 uDstOut.au16[0] = uSrc1.au16[0];
13066 uDstOut.au16[1] = uSrc2.au16[0];
13067 uDstOut.au16[2] = uSrc1.au16[1];
13068 uDstOut.au16[3] = uSrc2.au16[1];
13069 *puDst = uDstOut.u;
13070}
13071
13072
13073IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13074{
13075 RTUINT128U const uSrc2 = *puSrc;
13076 RTUINT128U const uSrc1 = *puDst;
13077 ASMCompilerBarrier();
13078 RTUINT128U uDstOut;
13079 uDstOut.au16[0] = uSrc1.au16[0];
13080 uDstOut.au16[1] = uSrc2.au16[0];
13081 uDstOut.au16[2] = uSrc1.au16[1];
13082 uDstOut.au16[3] = uSrc2.au16[1];
13083 uDstOut.au16[4] = uSrc1.au16[2];
13084 uDstOut.au16[5] = uSrc2.au16[2];
13085 uDstOut.au16[6] = uSrc1.au16[3];
13086 uDstOut.au16[7] = uSrc2.au16[3];
13087 *puDst = uDstOut;
13088}
13089
13090#endif
13091
13092IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13093{
13094 RTUINT128U const uSrc2 = *puSrc2;
13095 RTUINT128U const uSrc1 = *puSrc1;
13096 ASMCompilerBarrier();
13097 RTUINT128U uDstOut;
13098 uDstOut.au16[0] = uSrc1.au16[0];
13099 uDstOut.au16[1] = uSrc2.au16[0];
13100 uDstOut.au16[2] = uSrc1.au16[1];
13101 uDstOut.au16[3] = uSrc2.au16[1];
13102 uDstOut.au16[4] = uSrc1.au16[2];
13103 uDstOut.au16[5] = uSrc2.au16[2];
13104 uDstOut.au16[6] = uSrc1.au16[3];
13105 uDstOut.au16[7] = uSrc2.au16[3];
13106 *puDst = uDstOut;
13107}
13108
13109
13110IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13111{
13112 RTUINT256U const uSrc2 = *puSrc2;
13113 RTUINT256U const uSrc1 = *puSrc1;
13114 ASMCompilerBarrier();
13115 RTUINT256U uDstOut;
13116 uDstOut.au16[0] = uSrc1.au16[0];
13117 uDstOut.au16[1] = uSrc2.au16[0];
13118 uDstOut.au16[2] = uSrc1.au16[1];
13119 uDstOut.au16[3] = uSrc2.au16[1];
13120 uDstOut.au16[4] = uSrc1.au16[2];
13121 uDstOut.au16[5] = uSrc2.au16[2];
13122 uDstOut.au16[6] = uSrc1.au16[3];
13123 uDstOut.au16[7] = uSrc2.au16[3];
13124
13125 uDstOut.au16[8] = uSrc1.au16[8];
13126 uDstOut.au16[9] = uSrc2.au16[8];
13127 uDstOut.au16[10] = uSrc1.au16[9];
13128 uDstOut.au16[11] = uSrc2.au16[9];
13129 uDstOut.au16[12] = uSrc1.au16[10];
13130 uDstOut.au16[13] = uSrc2.au16[10];
13131 uDstOut.au16[14] = uSrc1.au16[11];
13132 uDstOut.au16[15] = uSrc2.au16[11];
13133 *puDst = uDstOut;
13134}
13135
13136
13137/*
 * PUNPCKLDQ - low dwords -> qword(s)
13139 */
13140#ifdef IEM_WITHOUT_ASSEMBLY
13141
13142IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13143{
13144 RTUINT64U const uSrc2 = { *puSrc };
13145 RTUINT64U const uSrc1 = { *puDst };
13146 ASMCompilerBarrier();
13147 RTUINT64U uDstOut;
13148 uDstOut.au32[0] = uSrc1.au32[0];
13149 uDstOut.au32[1] = uSrc2.au32[0];
13150 *puDst = uDstOut.u;
13151}
13152
13153
13154IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13155{
13156 RTUINT128U const uSrc2 = *puSrc;
13157 RTUINT128U const uSrc1 = *puDst;
13158 ASMCompilerBarrier();
13159 RTUINT128U uDstOut;
13160 uDstOut.au32[0] = uSrc1.au32[0];
13161 uDstOut.au32[1] = uSrc2.au32[0];
13162 uDstOut.au32[2] = uSrc1.au32[1];
13163 uDstOut.au32[3] = uSrc2.au32[1];
13164 *puDst = uDstOut;
13165}
13166
13167#endif
13168
13169IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13170{
13171 RTUINT128U const uSrc2 = *puSrc2;
13172 RTUINT128U const uSrc1 = *puSrc1;
13173 ASMCompilerBarrier();
13174 RTUINT128U uDstOut;
13175 uDstOut.au32[0] = uSrc1.au32[0];
13176 uDstOut.au32[1] = uSrc2.au32[0];
13177 uDstOut.au32[2] = uSrc1.au32[1];
13178 uDstOut.au32[3] = uSrc2.au32[1];
13179 *puDst = uDstOut;
13180}
13181
13182
13183IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13184{
13185 RTUINT256U const uSrc2 = *puSrc2;
13186 RTUINT256U const uSrc1 = *puSrc1;
13187 ASMCompilerBarrier();
13188 RTUINT256U uDstOut;
13189 uDstOut.au32[0] = uSrc1.au32[0];
13190 uDstOut.au32[1] = uSrc2.au32[0];
13191 uDstOut.au32[2] = uSrc1.au32[1];
13192 uDstOut.au32[3] = uSrc2.au32[1];
13193
13194 uDstOut.au32[4] = uSrc1.au32[4];
13195 uDstOut.au32[5] = uSrc2.au32[4];
13196 uDstOut.au32[6] = uSrc1.au32[5];
13197 uDstOut.au32[7] = uSrc2.au32[5];
13198 *puDst = uDstOut;
13199}
13200
13201
13202/*
13203 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13204 */
13205#ifdef IEM_WITHOUT_ASSEMBLY
13206IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13207{
13208 RTUINT128U const uSrc2 = *puSrc;
13209 RTUINT128U const uSrc1 = *puDst;
13210 ASMCompilerBarrier();
13211 RTUINT128U uDstOut;
13212 uDstOut.au64[0] = uSrc1.au64[0];
13213 uDstOut.au64[1] = uSrc2.au64[0];
13214 *puDst = uDstOut;
13215}
13216#endif
13217
13218
13219IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13220{
13221 RTUINT128U const uSrc2 = *puSrc2;
13222 RTUINT128U const uSrc1 = *puSrc1;
13223 ASMCompilerBarrier();
13224 RTUINT128U uDstOut;
13225 uDstOut.au64[0] = uSrc1.au64[0];
13226 uDstOut.au64[1] = uSrc2.au64[0];
13227 *puDst = uDstOut;
13228}
13229
13230
13231IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13232{
13233 RTUINT256U const uSrc2 = *puSrc2;
13234 RTUINT256U const uSrc1 = *puSrc1;
13235 ASMCompilerBarrier();
13236 RTUINT256U uDstOut;
13237 uDstOut.au64[0] = uSrc1.au64[0];
13238 uDstOut.au64[1] = uSrc2.au64[0];
13239
13240 uDstOut.au64[2] = uSrc1.au64[2];
13241 uDstOut.au64[3] = uSrc2.au64[2];
13242 *puDst = uDstOut;
13243}
13244
13245
13246/*
13247 * MASKMOVQ - Store Selected Bytes of Quadword
13248 */
13249IEM_DECL_IMPL_DEF(void, iemAImpl_maskmovq_u64,(uint64_t *puMem, uint64_t const *puSrc, uint64_t const *puMsk))
13250{
13251 ASMCompilerBarrier();
13252 for (uint32_t i = 0; i < RT_ELEMENTS(((PCRTUINT64U)puMsk)->au8); i++)
13253 {
13254 if (((PCRTUINT64U)puMsk)->au8[i] & RT_BIT(7))
13255 ((PRTUINT64U)puMem)->au8[i] = ((PCRTUINT64U)puSrc)->au8[i];
13256 }
13257}
13258
13259
13260/*
13261 * MASKMOVDQU - Store Selected Bytes of Double Quadword
13262 */
13263IEM_DECL_IMPL_DEF(void, iemAImpl_maskmovdqu_u128,(PRTUINT128U puMem, PCRTUINT128U puSrc, PCRTUINT128U puMsk))
13264{
13265 ASMCompilerBarrier();
13266 for (uint32_t i = 0; i < RT_ELEMENTS(puMsk->au8); i++)
13267 {
13268 if (puMsk->au8[i] & RT_BIT(7))
13269 puMem->au8[i] = puSrc->au8[i];
13270 }
13271}
13272
13273
13274/*
13275 * PACKSSWB - signed words -> signed bytes
13276 */
13277
13278#ifdef IEM_WITHOUT_ASSEMBLY
13279
13280IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13281{
13282 RTUINT64U const uSrc2 = { *puSrc };
13283 RTUINT64U const uSrc1 = { *puDst };
13284 ASMCompilerBarrier();
13285 RTUINT64U uDstOut;
13286 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13287 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13288 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13289 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13290 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13291 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13292 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13293 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13294 *puDst = uDstOut.u;
13295}
13296
13297
13298IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13299{
13300 RTUINT128U const uSrc2 = *puSrc;
13301 RTUINT128U const uSrc1 = *puDst;
13302 ASMCompilerBarrier();
13303 RTUINT128U uDstOut;
13304 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13305 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13306 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13307 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13308 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13309 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13310 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13311 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13312 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13313 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13314 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13315 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13316 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13317 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13318 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13319 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13320 *puDst = uDstOut;
13321}
13322
13323#endif
13324
13325IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13326{
13327 RTUINT128U const uSrc2 = *puSrc2;
13328 RTUINT128U const uSrc1 = *puSrc1;
13329 ASMCompilerBarrier();
13330 RTUINT128U uDstOut;
13331 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13332 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13333 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13334 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13335 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13336 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13337 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13338 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13339 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13340 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13341 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13342 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13343 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13344 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13345 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13346 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13347 *puDst = uDstOut;
13348}
13349
13350
13351IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13352{
13353 RTUINT256U const uSrc2 = *puSrc2;
13354 RTUINT256U const uSrc1 = *puSrc1;
13355 ASMCompilerBarrier();
13356 RTUINT256U uDstOut;
13357 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13358 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13359 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13360 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13361 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13362 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13363 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13364 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13365 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13366 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13367 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13368 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13369 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13370 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13371 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13372 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13373
13374 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13375 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13376 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13377 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13378 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13379 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13380 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13381 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13382 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13383 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13384 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13385 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13386 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13387 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13388 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13389 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13390 *puDst = uDstOut;
13391}
13392
13393
13394/*
13395 * PACKUSWB - signed words -> unsigned bytes
13396 */
/* Values whose 16-bit pattern is 0x0000..0x00ff pass through unchanged. Anything
   else saturates: to 0x00 when source bit 15 (the sign) is set, to 0xff otherwise.
   Note: the argument is evaluated more than once. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)((((a_iWord) >> 15) & 1) ? 0x00 : 0xff) \
      : (uint8_t)(a_iWord) )
13401
13402#ifdef IEM_WITHOUT_ASSEMBLY
13403
13404IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13405{
13406 RTUINT64U const uSrc2 = { *puSrc };
13407 RTUINT64U const uSrc1 = { *puDst };
13408 ASMCompilerBarrier();
13409 RTUINT64U uDstOut;
13410 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13411 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13412 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13413 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13414 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13415 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13416 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13417 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13418 *puDst = uDstOut.u;
13419}
13420
13421
13422IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13423{
13424 RTUINT128U const uSrc2 = *puSrc;
13425 RTUINT128U const uSrc1 = *puDst;
13426 ASMCompilerBarrier();
13427 RTUINT128U uDstOut;
13428 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13429 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13430 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13431 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13432 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13433 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13434 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13435 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13436 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13437 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13438 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13439 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13440 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13441 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13442 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13443 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13444 *puDst = uDstOut;
13445}
13446
13447#endif
13448
13449IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13450{
13451 RTUINT128U const uSrc2 = *puSrc2;
13452 RTUINT128U const uSrc1 = *puSrc1;
13453 ASMCompilerBarrier();
13454 RTUINT128U uDstOut;
13455 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13456 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13457 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13458 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13459 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13460 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13461 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13462 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13463 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13464 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13465 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13466 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13467 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13468 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13469 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13470 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13471 *puDst = uDstOut;
13472}
13473
13474
13475IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13476{
13477 RTUINT256U const uSrc2 = *puSrc2;
13478 RTUINT256U const uSrc1 = *puSrc1;
13479 ASMCompilerBarrier();
13480 RTUINT256U uDstOut;
13481 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13482 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13483 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13484 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13485 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13486 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13487 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13488 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13489 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13490 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13491 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13492 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13493 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13494 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13495 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13496 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13497
13498 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13499 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13500 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13501 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13502 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13503 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13504 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13505 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13506 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13507 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13508 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13509 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13510 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13511 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13512 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13513 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13514 *puDst = uDstOut;
13515}
13516
13517
13518/*
13519 * PACKSSDW - signed dwords -> signed words
13520 */
13521
13522#ifdef IEM_WITHOUT_ASSEMBLY
13523
13524IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13525{
13526 RTUINT64U const uSrc2 = { *puSrc };
13527 RTUINT64U const uSrc1 = { *puDst };
13528 ASMCompilerBarrier();
13529 RTUINT64U uDstOut;
13530 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13531 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13532 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13533 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13534 *puDst = uDstOut.u;
13535}
13536
13537
13538IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13539{
13540 RTUINT128U const uSrc2 = *puSrc;
13541 RTUINT128U const uSrc1 = *puDst;
13542 ASMCompilerBarrier();
13543 RTUINT128U uDstOut;
13544 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13545 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13546 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13547 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13548 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13549 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13550 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13551 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13552 *puDst = uDstOut;
13553}
13554
13555#endif
13556
13557IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13558{
13559 RTUINT128U const uSrc2 = *puSrc2;
13560 RTUINT128U const uSrc1 = *puSrc1;
13561 ASMCompilerBarrier();
13562 RTUINT128U uDstOut;
13563 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13564 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13565 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13566 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13567 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13568 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13569 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13570 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13571 *puDst = uDstOut;
13572}
13573
13574
13575IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13576{
13577 RTUINT256U const uSrc2 = *puSrc2;
13578 RTUINT256U const uSrc1 = *puSrc1;
13579 ASMCompilerBarrier();
13580 RTUINT256U uDstOut;
13581 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13582 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13583 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13584 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13585 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13586 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13587 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13588 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13589
13590 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13591 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13592 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13593 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13594 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13595 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13596 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13597 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13598 *puDst = uDstOut;
13599}
13600
13601
13602/*
13603 * PACKUSDW - signed dwords -> unsigned words
13604 */
/* Values whose 32-bit pattern is 0x00000000..0x0000ffff pass through unchanged.
   Anything else saturates: to 0x0000 when source bit 31 (the sign) is set, to
   0xffff otherwise. Note: the argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)((((a_iDword) >> 31) & 1) ? 0x0000 : 0xffff) \
      : (uint16_t)(a_iDword) )
13609
13610#ifdef IEM_WITHOUT_ASSEMBLY
13611IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13612{
13613 RTUINT128U const uSrc2 = *puSrc;
13614 RTUINT128U const uSrc1 = *puDst;
13615 ASMCompilerBarrier();
13616 RTUINT128U uDstOut;
13617 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13618 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13619 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13620 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13621 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13622 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13623 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13624 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13625 *puDst = uDstOut;
13626}
13627#endif
13628
13629IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13630{
13631 RTUINT128U const uSrc2 = *puSrc2;
13632 RTUINT128U const uSrc1 = *puSrc1;
13633 ASMCompilerBarrier();
13634 RTUINT128U uDstOut;
13635 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13636 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13637 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13638 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13639 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13640 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13641 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13642 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13643 *puDst = uDstOut;
13644}
13645
13646
13647IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13648{
13649 RTUINT256U const uSrc2 = *puSrc2;
13650 RTUINT256U const uSrc1 = *puSrc1;
13651 ASMCompilerBarrier();
13652 RTUINT256U uDstOut;
13653 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13654 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13655 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13656 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13657 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13658 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13659 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13660 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13661
13662 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13663 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13664 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13665 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13666 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13667 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13668 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13669 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13670 *puDst = uDstOut;
13671}
13672
13673
13674/*
13675 * [V]PABSB / [V]PABSW / [V]PABSD
13676 */
13677
13678IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13679{
13680 RTUINT64U const uSrc = { *puSrc };
13681 RTUINT64U uDstOut = { 0 };
13682
13683 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13684 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13685 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13686 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13687 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13688 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13689 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13690 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13691 *puDst = uDstOut.u;
13692}
13693
13694
13695IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13696{
13697 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13698 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13699 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13700 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13701 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13702 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13703 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13704 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13705 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13706 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13707 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13708 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13709 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13710 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13711 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13712 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13713}
13714
13715
13716IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13717{
13718 RTUINT64U const uSrc = { *puSrc };
13719 RTUINT64U uDstOut = { 0 };
13720
13721 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13722 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13723 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13724 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13725 *puDst = uDstOut.u;
13726}
13727
13728
13729IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13730{
13731 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13732 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13733 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13734 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13735 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13736 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13737 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13738 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13739}
13740
13741
13742IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13743{
13744 RTUINT64U const uSrc = { *puSrc };
13745 RTUINT64U uDstOut = { 0 };
13746
13747 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13748 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13749 *puDst = uDstOut.u;
13750}
13751
13752
13753IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13754{
13755 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13756 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13757 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13758 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13759}
13760
13761
13762IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13763{
13764 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13765 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13766 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13767 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13768 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13769 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13770 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13771 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13772 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13773 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13774 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13775 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13776 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13777 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13778 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13779 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13780}
13781
13782
13783IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13784{
13785 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13786 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13787 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13788 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13789 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13790 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13791 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13792 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13793 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13794 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13795 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13796 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13797 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13798 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13799 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13800 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13801 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13802 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13803 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13804 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13805 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13806 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13807 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13808 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13809 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13810 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13811 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13812 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13813 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13814 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13815 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13816 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13817}
13818
13819
13820IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13821{
13822 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13823 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13824 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13825 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13826 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13827 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13828 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13829 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13830}
13831
13832
13833IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13834{
13835 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13836 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13837 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13838 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13839 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13840 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13841 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13842 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13843 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13844 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13845 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13846 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13847 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13848 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13849 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13850 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13851}
13852
13853
13854IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13855{
13856 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13857 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13858 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13859 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13860}
13861
13862
13863IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13864{
13865 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13866 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13867 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13868 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13869 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13870 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13871 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13872 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13873}
13874
13875
13876/*
13877 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13878 */
13879IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13880{
13881 RTUINT64U uSrc1 = { *puDst };
13882 RTUINT64U uSrc2 = { *puSrc };
13883 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13884
13885 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13886 {
13887 if (uSrc2.ai8[i] < 0)
13888 uDst.ai8[i] = -uSrc1.ai8[i];
13889 else if (uSrc2.ai8[i] == 0)
13890 uDst.ai8[i] = 0;
13891 else /* uSrc2.ai8[i] > 0 */
13892 uDst.ai8[i] = uSrc1.ai8[i];
13893 }
13894
13895 *puDst = uDst.u;
13896}
13897
13898
13899IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13900{
13901 RTUINT128U uSrc1 = *puDst;
13902
13903 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13904 {
13905 if (puSrc->ai8[i] < 0)
13906 puDst->ai8[i] = -uSrc1.ai8[i];
13907 else if (puSrc->ai8[i] == 0)
13908 puDst->ai8[i] = 0;
13909 else /* puSrc->ai8[i] > 0 */
13910 puDst->ai8[i] = uSrc1.ai8[i];
13911 }
13912}
13913
13914
13915IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13916{
13917 RTUINT64U uSrc1 = { *puDst };
13918 RTUINT64U uSrc2 = { *puSrc };
13919 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13920
13921 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13922 {
13923 if (uSrc2.ai16[i] < 0)
13924 uDst.ai16[i] = -uSrc1.ai16[i];
13925 else if (uSrc2.ai16[i] == 0)
13926 uDst.ai16[i] = 0;
13927 else /* uSrc2.ai16[i] > 0 */
13928 uDst.ai16[i] = uSrc1.ai16[i];
13929 }
13930
13931 *puDst = uDst.u;
13932}
13933
13934
13935IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13936{
13937 RTUINT128U uSrc1 = *puDst;
13938
13939 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13940 {
13941 if (puSrc->ai16[i] < 0)
13942 puDst->ai16[i] = -uSrc1.ai16[i];
13943 else if (puSrc->ai16[i] == 0)
13944 puDst->ai16[i] = 0;
13945 else /* puSrc->ai16[i] > 0 */
13946 puDst->ai16[i] = uSrc1.ai16[i];
13947 }
13948}
13949
13950
13951IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13952{
13953 RTUINT64U uSrc1 = { *puDst };
13954 RTUINT64U uSrc2 = { *puSrc };
13955 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13956
13957 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13958 {
13959 if (uSrc2.ai32[i] < 0)
13960 uDst.ai32[i] = -uSrc1.ai32[i];
13961 else if (uSrc2.ai32[i] == 0)
13962 uDst.ai32[i] = 0;
13963 else /* uSrc2.ai32[i] > 0 */
13964 uDst.ai32[i] = uSrc1.ai32[i];
13965 }
13966
13967 *puDst = uDst.u;
13968}
13969
13970
13971IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13972{
13973 RTUINT128U uSrc1 = *puDst;
13974
13975 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13976 {
13977 if (puSrc->ai32[i] < 0)
13978 puDst->ai32[i] = -uSrc1.ai32[i];
13979 else if (puSrc->ai32[i] == 0)
13980 puDst->ai32[i] = 0;
13981 else /* puSrc->ai32[i] > 0 */
13982 puDst->ai32[i] = uSrc1.ai32[i];
13983 }
13984}
13985
13986
13987IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13988{
13989 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13990 {
13991 if (puSrc2->ai8[i] < 0)
13992 puDst->ai8[i] = -puSrc1->ai8[i];
13993 else if (puSrc2->ai8[i] == 0)
13994 puDst->ai8[i] = 0;
13995 else /* puSrc2->ai8[i] > 0 */
13996 puDst->ai8[i] = puSrc1->ai8[i];
13997 }
13998}
13999
14000
14001IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14002{
14003 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
14004 {
14005 if (puSrc2->ai8[i] < 0)
14006 puDst->ai8[i] = -puSrc1->ai8[i];
14007 else if (puSrc2->ai8[i] == 0)
14008 puDst->ai8[i] = 0;
14009 else /* puSrc2->ai8[i] > 0 */
14010 puDst->ai8[i] = puSrc1->ai8[i];
14011 }
14012}
14013
14014
14015IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14016{
14017 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14018 {
14019 if (puSrc2->ai16[i] < 0)
14020 puDst->ai16[i] = -puSrc1->ai16[i];
14021 else if (puSrc2->ai16[i] == 0)
14022 puDst->ai16[i] = 0;
14023 else /* puSrc2->ai16[i] > 0 */
14024 puDst->ai16[i] = puSrc1->ai16[i];
14025 }
14026}
14027
14028
14029IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14030{
14031 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14032 {
14033 if (puSrc2->ai16[i] < 0)
14034 puDst->ai16[i] = -puSrc1->ai16[i];
14035 else if (puSrc2->ai16[i] == 0)
14036 puDst->ai16[i] = 0;
14037 else /* puSrc2->ai16[i] > 0 */
14038 puDst->ai16[i] = puSrc1->ai16[i];
14039 }
14040}
14041
14042
14043IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14044{
14045 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14046 {
14047 if (puSrc2->ai32[i] < 0)
14048 puDst->ai32[i] = -puSrc1->ai32[i];
14049 else if (puSrc2->ai32[i] == 0)
14050 puDst->ai32[i] = 0;
14051 else /* puSrc2->ai32[i] > 0 */
14052 puDst->ai32[i] = puSrc1->ai32[i];
14053 }
14054}
14055
14056
14057IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14058{
14059 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14060 {
14061 if (puSrc2->ai32[i] < 0)
14062 puDst->ai32[i] = -puSrc1->ai32[i];
14063 else if (puSrc2->ai32[i] == 0)
14064 puDst->ai32[i] = 0;
14065 else /* puSrc2->ai32[i] > 0 */
14066 puDst->ai32[i] = puSrc1->ai32[i];
14067 }
14068}
14069
14070
14071/*
14072 * PHADDW / VPHADDW / PHADDD / VPHADDD
14073 */
14074IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14075{
14076 RTUINT64U uSrc1 = { *puDst };
14077 RTUINT64U uSrc2 = { *puSrc };
14078 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14079
14080 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14081 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14082 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
14083 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
14084 *puDst = uDst.u;
14085}
14086
14087
14088IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14089{
14090 RTUINT128U uSrc1 = *puDst;
14091
14092 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14093 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14094 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
14095 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
14096
14097 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
14098 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
14099 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
14100 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
14101}
14102
14103
14104IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14105{
14106 RTUINT64U uSrc1 = { *puDst };
14107 RTUINT64U uSrc2 = { *puSrc };
14108 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14109
14110 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14111 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14112 *puDst = uDst.u;
14113}
14114
14115
14116IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14117{
14118 RTUINT128U uSrc1 = *puDst;
14119
14120 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14121 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14122
14123 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14124 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14125}
14126
14127
14128IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14129{
14130 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14131
14132 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14133 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14134 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14135 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14136
14137 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14138 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14139 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14140 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14141
14142 puDst->au64[0] = uDst.au64[0];
14143 puDst->au64[1] = uDst.au64[1];
14144}
14145
14146
14147IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14148{
14149 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14150
14151 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14152 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14153 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14154 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14155 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14156 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14157 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14158 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14159
14160 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14161 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14162 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14163 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14164 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14165 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14166 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14167 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14168
14169 puDst->au64[0] = uDst.au64[0];
14170 puDst->au64[1] = uDst.au64[1];
14171 puDst->au64[2] = uDst.au64[2];
14172 puDst->au64[3] = uDst.au64[3];
14173}
14174
14175
14176IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14177{
14178 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14179
14180 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14181 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14182
14183 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14184 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14185
14186 puDst->au64[0] = uDst.au64[0];
14187 puDst->au64[1] = uDst.au64[1];
14188}
14189
14190
14191IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14192{
14193 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14194
14195 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14196 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14197 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14198 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14199
14200 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14201 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14202 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14203 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14204
14205 puDst->au64[0] = uDst.au64[0];
14206 puDst->au64[1] = uDst.au64[1];
14207 puDst->au64[2] = uDst.au64[2];
14208 puDst->au64[3] = uDst.au64[3];
14209}
14210
14211
14212/*
14213 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14214 */
14215IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14216{
14217 RTUINT64U uSrc1 = { *puDst };
14218 RTUINT64U uSrc2 = { *puSrc };
14219 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14220
14221 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14222 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14223 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14224 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14225 *puDst = uDst.u;
14226}
14227
14228
14229IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14230{
14231 RTUINT128U uSrc1 = *puDst;
14232
14233 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14234 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14235 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14236 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14237
14238 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14239 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14240 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14241 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14242}
14243
14244
14245IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14246{
14247 RTUINT64U uSrc1 = { *puDst };
14248 RTUINT64U uSrc2 = { *puSrc };
14249 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14250
14251 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14252 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14253 *puDst = uDst.u;
14254}
14255
14256
14257IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14258{
14259 RTUINT128U uSrc1 = *puDst;
14260
14261 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14262 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14263
14264 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14265 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14266}
14267
14268
14269IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14270{
14271 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14272
14273 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14274 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14275 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14276 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14277
14278 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14279 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14280 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14281 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14282
14283 puDst->au64[0] = uDst.au64[0];
14284 puDst->au64[1] = uDst.au64[1];
14285}
14286
14287
14288IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14289{
14290 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14291
14292 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14293 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14294 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14295 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14296 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14297 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14298 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14299 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14300
14301 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14302 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14303 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14304 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14305 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14306 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14307 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14308 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14309
14310 puDst->au64[0] = uDst.au64[0];
14311 puDst->au64[1] = uDst.au64[1];
14312 puDst->au64[2] = uDst.au64[2];
14313 puDst->au64[3] = uDst.au64[3];
14314}
14315
14316
14317IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14318{
14319 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14320
14321 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14322 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14323
14324 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14325 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14326
14327 puDst->au64[0] = uDst.au64[0];
14328 puDst->au64[1] = uDst.au64[1];
14329}
14330
14331
14332IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14333{
14334 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14335
14336 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14337 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14338 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14339 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14340
14341 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14342 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14343 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14344 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14345
14346 puDst->au64[0] = uDst.au64[0];
14347 puDst->au64[1] = uDst.au64[1];
14348 puDst->au64[2] = uDst.au64[2];
14349 puDst->au64[3] = uDst.au64[3];
14350}
14351
14352
14353/*
14354 * PHADDSW / VPHADDSW
14355 */
14356IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14357{
14358 RTUINT64U uSrc1 = { *puDst };
14359 RTUINT64U uSrc2 = { *puSrc };
14360 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14361
14362 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14363 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14364 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14365 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14366 *puDst = uDst.u;
14367}
14368
14369
14370IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14371{
14372 RTUINT128U uSrc1 = *puDst;
14373
14374 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14375 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14376 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14377 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14378
14379 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14380 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14381 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14382 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14383}
14384
14385
14386IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14387{
14388 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14389
14390 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14391 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14392 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14393 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14394
14395 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14396 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14397 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14398 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14399
14400 puDst->au64[0] = uDst.au64[0];
14401 puDst->au64[1] = uDst.au64[1];
14402}
14403
14404
14405IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14406{
14407 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14408
14409 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14410 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14411 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14412 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14413 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14414 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14415 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14416 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14417
14418 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14419 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14420 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14421 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14422 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14423 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14424 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14425 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14426
14427 puDst->au64[0] = uDst.au64[0];
14428 puDst->au64[1] = uDst.au64[1];
14429 puDst->au64[2] = uDst.au64[2];
14430 puDst->au64[3] = uDst.au64[3];
14431}
14432
14433
14434/*
14435 * PHSUBSW / VPHSUBSW
14436 */
14437IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14438{
14439 RTUINT64U uSrc1 = { *puDst };
14440 RTUINT64U uSrc2 = { *puSrc };
14441 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14442
14443 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14444 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14445 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14446 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14447 *puDst = uDst.u;
14448}
14449
14450
14451IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14452{
14453 RTUINT128U uSrc1 = *puDst;
14454
14455 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14456 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14457 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14458 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14459
14460 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14461 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14462 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14463 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14464}
14465
14466
14467IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14468{
14469 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14470
14471 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14472 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14473 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14474 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14475
14476 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14477 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14478 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14479 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14480
14481 puDst->au64[0] = uDst.au64[0];
14482 puDst->au64[1] = uDst.au64[1];
14483}
14484
14485
14486IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14487{
14488 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14489
14490 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14491 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14492 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14493 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14494 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14495 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14496 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14497 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14498
14499 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14500 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14501 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14502 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14503 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14504 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14505 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14506 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14507
14508 puDst->au64[0] = uDst.au64[0];
14509 puDst->au64[1] = uDst.au64[1];
14510 puDst->au64[2] = uDst.au64[2];
14511 puDst->au64[3] = uDst.au64[3];
14512}
14513
14514
14515/*
14516 * PMADDUBSW / VPMADDUBSW
14517 */
14518IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14519{
14520 RTUINT64U uSrc1 = { *puDst };
14521 RTUINT64U uSrc2 = { *puSrc };
14522 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14523
14524 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14525 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14526 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14527 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14528 *puDst = uDst.u;
14529}
14530
14531
14532IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14533{
14534 RTUINT128U uSrc1 = *puDst;
14535
14536 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14537 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14538 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14539 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14540 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14541 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14542 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14543 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14544}
14545
14546
14547IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14548{
14549 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14550
14551 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14552 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14553 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14554 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14555 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14556 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14557 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14558 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14559
14560 puDst->au64[0] = uDst.au64[0];
14561 puDst->au64[1] = uDst.au64[1];
14562}
14563
14564
14565IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14566{
14567 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14568
14569 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14570 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14571 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14572 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14573 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14574 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14575 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14576 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14577 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14578 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14579 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14580 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14581 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14582 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14583 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14584 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14585
14586 puDst->au64[0] = uDst.au64[0];
14587 puDst->au64[1] = uDst.au64[1];
14588 puDst->au64[2] = uDst.au64[2];
14589 puDst->au64[3] = uDst.au64[3];
14590}
14591
14592
14593/*
14594 * PMULHRSW / VPMULHRSW
14595 */
/** Multiplies two signed words as 32-bit values, drops the low 14 bits, adds
 *  one for rounding, drops one more bit and truncates the result to 16 bits. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
14598
14599IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14600{
14601 RTUINT64U uSrc1 = { *puDst };
14602 RTUINT64U uSrc2 = { *puSrc };
14603 RTUINT64U uDst;
14604
14605 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14606 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14607 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14608 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14609 *puDst = uDst.u;
14610}
14611
14612
14613IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14614{
14615 RTUINT128U uSrc1 = *puDst;
14616
14617 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14618 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14619 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14620 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14621 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14622 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14623 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14624 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14625}
14626
14627
14628IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14629{
14630 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14631
14632 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14633 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14634 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14635 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14636 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14637 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14638 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14639 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14640
14641 puDst->au64[0] = uDst.au64[0];
14642 puDst->au64[1] = uDst.au64[1];
14643}
14644
14645
14646IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14647{
14648 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14649
14650 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14651 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14652 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14653 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14654 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14655 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14656 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14657 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14658 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14659 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14660 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14661 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14662 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14663 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14664 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14665 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14666
14667 puDst->au64[0] = uDst.au64[0];
14668 puDst->au64[1] = uDst.au64[1];
14669 puDst->au64[2] = uDst.au64[2];
14670 puDst->au64[3] = uDst.au64[3];
14671}
14672
14673
14674/*
14675 * PSADBW / VPSADBW
14676 */
14677#ifdef IEM_WITHOUT_ASSEMBLY
14678
14679IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14680{
14681 RTUINT64U uSrc1 = { *puDst };
14682 RTUINT64U uSrc2 = { *puSrc };
14683 RTUINT64U uDst;
14684 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14685 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14686 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14687 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14688 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14689 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14690 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14691 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14692
14693 uDst.au64[0] = 0;
14694 uDst.au16[0] = uSum;
14695 *puDst = uDst.u;
14696}
14697
14698
14699IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14700{
14701 RTUINT128U uSrc1 = *puDst;
14702
14703 puDst->au64[0] = 0;
14704 puDst->au64[1] = 0;
14705
14706 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14707 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14708 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14709 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14710 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14711 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14712 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14713 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14714 puDst->au16[0] = uSum;
14715
14716 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14717 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14718 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14719 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14720 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14721 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14722 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14723 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14724 puDst->au16[4] = uSum;
14725}
14726
14727#endif
14728
14729IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14730{
14731 RTUINT128U uSrc1 = *puSrc1;
14732 RTUINT128U uSrc2 = *puSrc2;
14733
14734 puDst->au64[0] = 0;
14735 puDst->au64[1] = 0;
14736
14737 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14738 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14739 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14740 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14741 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14742 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14743 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14744 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14745 puDst->au16[0] = uSum;
14746
14747 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14748 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14749 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14750 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14751 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14752 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14753 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14754 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14755 puDst->au16[4] = uSum;
14756}
14757
14758IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14759{
14760 RTUINT256U uSrc1 = *puSrc1;
14761 RTUINT256U uSrc2 = *puSrc2;
14762
14763 puDst->au64[0] = 0;
14764 puDst->au64[1] = 0;
14765 puDst->au64[2] = 0;
14766 puDst->au64[3] = 0;
14767
14768 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14769 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14770 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14771 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14772 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14773 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14774 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14775 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14776 puDst->au16[0] = uSum;
14777
14778 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14779 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14780 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14781 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14782 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14783 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14784 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14785 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14786 puDst->au16[4] = uSum;
14787
14788 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14789 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14790 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14791 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14792 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14793 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14794 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14795 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14796 puDst->au16[8] = uSum;
14797
14798 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14799 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14800 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14801 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14802 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14803 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14804 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14805 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14806 puDst->au16[12] = uSum;
14807}
14808
14809
14810/*
14811 * PMULDQ / VPMULDQ
14812 */
14813IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14814{
14815 RTUINT128U uSrc1 = *puDst;
14816
14817 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14818 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14819}
14820
14821IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14822{
14823 RTUINT128U uSrc1 = *puSrc1;
14824 RTUINT128U uSrc2 = *puSrc2;
14825
14826 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14827 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14828}
14829
14830IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14831{
14832 RTUINT256U uSrc1 = *puSrc1;
14833 RTUINT256U uSrc2 = *puSrc2;
14834
14835 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14836 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14837 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14838 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14839}
14840
14841
14842/*
14843 * PMULUDQ / VPMULUDQ
14844 */
14845#ifdef IEM_WITHOUT_ASSEMBLY
14846
14847IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(uint64_t *puDst, uint64_t const *puSrc))
14848{
14849 RTUINT64U uSrc1 = { *puDst };
14850 RTUINT64U uSrc2 = { *puSrc };
14851 ASMCompilerBarrier();
14852 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14853}
14854
14855
14856IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14857{
14858 RTUINT128U uSrc1 = *puDst;
14859 RTUINT128U uSrc2 = *puSrc;
14860 ASMCompilerBarrier();
14861 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14862 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14863}
14864
14865#endif
14866
14867IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14868{
14869 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14870 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14871 ASMCompilerBarrier();
14872 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14873 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14874}
14875
14876
14877IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14878{
14879 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14880 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14881 ASMCompilerBarrier();
14882 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14883 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14884 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14885 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14886}
14887
14888
14889/*
14890 * UNPCKLPS / VUNPCKLPS
14891 */
14892#ifdef IEM_WITHOUT_ASSEMBLY
14893IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14894{
14895 RTUINT128U uSrc1 = *puDst;
14896 RTUINT128U uSrc2 = *puSrc;
14897 ASMCompilerBarrier();
14898 puDst->au32[0] = uSrc1.au32[0];
14899 puDst->au32[1] = uSrc2.au32[0];
14900 puDst->au32[2] = uSrc1.au32[1];
14901 puDst->au32[3] = uSrc2.au32[1];
14902}
14903
14904#endif
14905
14906IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14907{
14908 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14909 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14910 ASMCompilerBarrier();
14911 puDst->au32[0] = uSrc1.au32[0];
14912 puDst->au32[1] = uSrc2.au32[0];
14913 puDst->au32[2] = uSrc1.au32[1];
14914 puDst->au32[3] = uSrc2.au32[1];
14915}
14916
14917
14918IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14919{
14920 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14921 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14922 ASMCompilerBarrier();
14923 puDst->au32[0] = uSrc1.au32[0];
14924 puDst->au32[1] = uSrc2.au32[0];
14925 puDst->au32[2] = uSrc1.au32[1];
14926 puDst->au32[3] = uSrc2.au32[1];
14927
14928 puDst->au32[4] = uSrc1.au32[4];
14929 puDst->au32[5] = uSrc2.au32[4];
14930 puDst->au32[6] = uSrc1.au32[5];
14931 puDst->au32[7] = uSrc2.au32[5];
14932}
14933
14934
14935/*
14936 * UNPCKLPD / VUNPCKLPD
14937 */
14938#ifdef IEM_WITHOUT_ASSEMBLY
14939IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14940{
14941 RTUINT128U uSrc1 = *puDst;
14942 RTUINT128U uSrc2 = *puSrc;
14943 ASMCompilerBarrier();
14944 puDst->au64[0] = uSrc1.au64[0];
14945 puDst->au64[1] = uSrc2.au64[0];
14946}
14947
14948#endif
14949
14950IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14951{
14952 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14953 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14954 ASMCompilerBarrier();
14955 puDst->au64[0] = uSrc1.au64[0];
14956 puDst->au64[1] = uSrc2.au64[0];
14957}
14958
14959
14960IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14961{
14962 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14963 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14964 ASMCompilerBarrier();
14965 puDst->au64[0] = uSrc1.au64[0];
14966 puDst->au64[1] = uSrc2.au64[0];
14967 puDst->au64[2] = uSrc1.au64[2];
14968 puDst->au64[3] = uSrc2.au64[2];
14969}
14970
14971
14972/*
14973 * UNPCKHPS / VUNPCKHPS
14974 */
14975#ifdef IEM_WITHOUT_ASSEMBLY
14976IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14977{
14978 RTUINT128U uSrc1 = *puDst;
14979 RTUINT128U uSrc2 = *puSrc;
14980 ASMCompilerBarrier();
14981 puDst->au32[0] = uSrc1.au32[2];
14982 puDst->au32[1] = uSrc2.au32[2];
14983 puDst->au32[2] = uSrc1.au32[3];
14984 puDst->au32[3] = uSrc2.au32[3];
14985}
14986
14987#endif
14988
14989IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14990{
14991 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14992 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14993 ASMCompilerBarrier();
14994 puDst->au32[0] = uSrc1.au32[2];
14995 puDst->au32[1] = uSrc2.au32[2];
14996 puDst->au32[2] = uSrc1.au32[3];
14997 puDst->au32[3] = uSrc2.au32[3];
14998}
14999
15000
15001IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15002{
15003 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15004 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15005 ASMCompilerBarrier();
15006 puDst->au32[0] = uSrc1.au32[2];
15007 puDst->au32[1] = uSrc2.au32[2];
15008 puDst->au32[2] = uSrc1.au32[3];
15009 puDst->au32[3] = uSrc2.au32[3];
15010
15011 puDst->au32[4] = uSrc1.au32[6];
15012 puDst->au32[5] = uSrc2.au32[6];
15013 puDst->au32[6] = uSrc1.au32[7];
15014 puDst->au32[7] = uSrc2.au32[7];
15015}
15016
15017
15018/*
15019 * UNPCKHPD / VUNPCKHPD
15020 */
15021#ifdef IEM_WITHOUT_ASSEMBLY
15022IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15023{
15024 RTUINT128U uSrc1 = *puDst;
15025 RTUINT128U uSrc2 = *puSrc;
15026 ASMCompilerBarrier();
15027 puDst->au64[0] = uSrc1.au64[1];
15028 puDst->au64[1] = uSrc2.au64[1];
15029}
15030
15031#endif
15032
15033IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15034{
15035 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15036 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15037 ASMCompilerBarrier();
15038 puDst->au64[0] = uSrc1.au64[1];
15039 puDst->au64[1] = uSrc2.au64[1];
15040}
15041
15042
15043IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15044{
15045 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15046 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15047 ASMCompilerBarrier();
15048 puDst->au64[0] = uSrc1.au64[1];
15049 puDst->au64[1] = uSrc2.au64[1];
15050 puDst->au64[2] = uSrc1.au64[3];
15051 puDst->au64[3] = uSrc2.au64[3];
15052}
15053
15054
15055/*
15056 * CRC32 (SSE 4.2).
15057 */
15058
15059IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
15060{
15061 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15062}
15063
15064
15065IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
15066{
15067 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15068}
15069
15070IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
15071{
15072 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15073}
15074
15075IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
15076{
15077 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15078}
15079
15080
15081/*
15082 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
15083 */
15084#ifdef IEM_WITHOUT_ASSEMBLY
15085IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15086{
15087 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15088 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15089 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15090 fEfl |= X86_EFL_ZF;
15091 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15092 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15093 fEfl |= X86_EFL_CF;
15094 *pfEFlags = fEfl;
15095}
15096#endif
15097
15098IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15099{
15100 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15101 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15102 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
15103 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
15104 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15105 fEfl |= X86_EFL_ZF;
15106 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15107 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15108 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15109 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15110 fEfl |= X86_EFL_CF;
15111 *pfEFlags = fEfl;
15112}
15113
15114
15115/* Worker for VEX.128 vtestp[s|d]. */
15116static void iemAImpl_vtestp_sd_u128_worker(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint64_t fSignMask, uint32_t *pfEFlags)
15117{
15118 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15119 RTUINT128U uTemp;
15120 uTemp.au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
15121 uTemp.au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
15122 if ((( uTemp.au64[0]
15123 | uTemp.au64[1]) & fSignMask) == 0)
15124 fEfl |= X86_EFL_ZF;
15125 uTemp.au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
15126 uTemp.au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
15127 if ((( uTemp.au64[0]
15128 | uTemp.au64[1]) & fSignMask) == 0)
15129 fEfl |= X86_EFL_CF;
15130 *pfEFlags = fEfl;
15131}
15132
15133
15134/* Worker for VEX.256 vtestp[s|d]. */
15135static void iemAImpl_vtestp_sd_u256_worker(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint64_t fSignMask, uint32_t *pfEFlags)
15136{
15137 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15138 RTUINT256U uTemp;
15139 uTemp.au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
15140 uTemp.au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
15141 uTemp.au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
15142 uTemp.au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
15143 if ((( uTemp.au64[0]
15144 | uTemp.au64[1]
15145 | uTemp.au64[2]
15146 | uTemp.au64[3]) & fSignMask) == 0)
15147 fEfl |= X86_EFL_ZF;
15148 uTemp.au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
15149 uTemp.au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
15150 uTemp.au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
15151 uTemp.au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
15152 if ((( uTemp.au64[0]
15153 | uTemp.au64[1]
15154 | uTemp.au64[2]
15155 | uTemp.au64[3]) & fSignMask) == 0)
15156 fEfl |= X86_EFL_CF;
15157 *pfEFlags = fEfl;
15158}
15159
15160
15161/*
15162 * VTESTPS
15163 */
15164IEM_DECL_IMPL_DEF(void, iemAImpl_vtestps_u128_fallback,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15165{
15166 uint64_t const fSignMask = RT_BIT_64(63) | RT_BIT_64(31);
15167 return iemAImpl_vtestp_sd_u128_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15168}
15169
15170
15171IEM_DECL_IMPL_DEF(void, iemAImpl_vtestps_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15172{
15173 uint64_t const fSignMask = RT_BIT_64(63) | RT_BIT_64(31);
15174 return iemAImpl_vtestp_sd_u256_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15175}
15176
15177
15178/*
15179 * VTESTPD
15180 */
15181IEM_DECL_IMPL_DEF(void, iemAImpl_vtestpd_u128_fallback,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15182{
15183 uint64_t const fSignMask = RT_BIT_64(63);
15184 return iemAImpl_vtestp_sd_u128_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15185}
15186
15187
15188IEM_DECL_IMPL_DEF(void, iemAImpl_vtestpd_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15189{
15190 uint64_t const fSignMask = RT_BIT_64(63);
15191 return iemAImpl_vtestp_sd_u256_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15192}
15193
15194
15195/*
15196 * PMOVSXBW / VPMOVSXBW
15197 */
15198IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15199{
15200 RTUINT64U uSrc1 = { uSrc };
15201 puDst->ai16[0] = uSrc1.ai8[0];
15202 puDst->ai16[1] = uSrc1.ai8[1];
15203 puDst->ai16[2] = uSrc1.ai8[2];
15204 puDst->ai16[3] = uSrc1.ai8[3];
15205 puDst->ai16[4] = uSrc1.ai8[4];
15206 puDst->ai16[5] = uSrc1.ai8[5];
15207 puDst->ai16[6] = uSrc1.ai8[6];
15208 puDst->ai16[7] = uSrc1.ai8[7];
15209}
15210
15211
15212IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15213{
15214 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15215 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15216 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15217 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15218 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15219 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15220 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15221 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15222 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15223 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15224 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15225 puDst->ai16[10] = uSrc1.ai8[10];
15226 puDst->ai16[11] = uSrc1.ai8[11];
15227 puDst->ai16[12] = uSrc1.ai8[12];
15228 puDst->ai16[13] = uSrc1.ai8[13];
15229 puDst->ai16[14] = uSrc1.ai8[14];
15230 puDst->ai16[15] = uSrc1.ai8[15];
15231}
15232
15233
15234/*
15235 * PMOVSXBD / VPMOVSXBD
15236 */
15237IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15238{
15239 RTUINT32U uSrc1 = { uSrc };
15240 puDst->ai32[0] = uSrc1.ai8[0];
15241 puDst->ai32[1] = uSrc1.ai8[1];
15242 puDst->ai32[2] = uSrc1.ai8[2];
15243 puDst->ai32[3] = uSrc1.ai8[3];
15244}
15245
15246
15247IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15248{
15249 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15250 puDst->ai32[0] = uSrc1.ai8[0];
15251 puDst->ai32[1] = uSrc1.ai8[1];
15252 puDst->ai32[2] = uSrc1.ai8[2];
15253 puDst->ai32[3] = uSrc1.ai8[3];
15254 puDst->ai32[4] = uSrc1.ai8[4];
15255 puDst->ai32[5] = uSrc1.ai8[5];
15256 puDst->ai32[6] = uSrc1.ai8[6];
15257 puDst->ai32[7] = uSrc1.ai8[7];
15258}
15259
15260
15261/*
15262 * PMOVSXBQ / VPMOVSXBQ
15263 */
15264IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15265{
15266 RTUINT16U uSrc1 = { uSrc };
15267 puDst->ai64[0] = uSrc1.ai8[0];
15268 puDst->ai64[1] = uSrc1.ai8[1];
15269}
15270
15271
15272IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15273{
15274 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15275 puDst->ai64[0] = uSrc1.ai8[0];
15276 puDst->ai64[1] = uSrc1.ai8[1];
15277 puDst->ai64[2] = uSrc1.ai8[2];
15278 puDst->ai64[3] = uSrc1.ai8[3];
15279}
15280
15281
15282/*
15283 * PMOVSXWD / VPMOVSXWD
15284 */
15285IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15286{
15287 RTUINT64U uSrc1 = { uSrc };
15288 puDst->ai32[0] = uSrc1.ai16[0];
15289 puDst->ai32[1] = uSrc1.ai16[1];
15290 puDst->ai32[2] = uSrc1.ai16[2];
15291 puDst->ai32[3] = uSrc1.ai16[3];
15292}
15293
15294
15295IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15296{
15297 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15298 puDst->ai32[0] = uSrc1.ai16[0];
15299 puDst->ai32[1] = uSrc1.ai16[1];
15300 puDst->ai32[2] = uSrc1.ai16[2];
15301 puDst->ai32[3] = uSrc1.ai16[3];
15302 puDst->ai32[4] = uSrc1.ai16[4];
15303 puDst->ai32[5] = uSrc1.ai16[5];
15304 puDst->ai32[6] = uSrc1.ai16[6];
15305 puDst->ai32[7] = uSrc1.ai16[7];
15306}
15307
15308
15309/*
15310 * PMOVSXWQ / VPMOVSXWQ
15311 */
15312IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15313{
15314 RTUINT32U uSrc1 = { uSrc };
15315 puDst->ai64[0] = uSrc1.ai16[0];
15316 puDst->ai64[1] = uSrc1.ai16[1];
15317}
15318
15319
15320IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15321{
15322 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15323 puDst->ai64[0] = uSrc1.ai16[0];
15324 puDst->ai64[1] = uSrc1.ai16[1];
15325 puDst->ai64[2] = uSrc1.ai16[2];
15326 puDst->ai64[3] = uSrc1.ai16[3];
15327}
15328
15329
15330/*
15331 * PMOVSXDQ / VPMOVSXDQ
15332 */
15333IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15334{
15335 RTUINT64U uSrc1 = { uSrc };
15336 puDst->ai64[0] = uSrc1.ai32[0];
15337 puDst->ai64[1] = uSrc1.ai32[1];
15338}
15339
15340
15341IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15342{
15343 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15344 puDst->ai64[0] = uSrc1.ai32[0];
15345 puDst->ai64[1] = uSrc1.ai32[1];
15346 puDst->ai64[2] = uSrc1.ai32[2];
15347 puDst->ai64[3] = uSrc1.ai32[3];
15348}
15349
15350
15351/*
15352 * PMOVZXBW / VPMOVZXBW
15353 */
15354IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15355{
15356 RTUINT64U uSrc1 = { uSrc };
15357 puDst->au16[0] = uSrc1.au8[0];
15358 puDst->au16[1] = uSrc1.au8[1];
15359 puDst->au16[2] = uSrc1.au8[2];
15360 puDst->au16[3] = uSrc1.au8[3];
15361 puDst->au16[4] = uSrc1.au8[4];
15362 puDst->au16[5] = uSrc1.au8[5];
15363 puDst->au16[6] = uSrc1.au8[6];
15364 puDst->au16[7] = uSrc1.au8[7];
15365}
15366
15367
15368IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15369{
15370 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15371 puDst->au16[ 0] = uSrc1.au8[ 0];
15372 puDst->au16[ 1] = uSrc1.au8[ 1];
15373 puDst->au16[ 2] = uSrc1.au8[ 2];
15374 puDst->au16[ 3] = uSrc1.au8[ 3];
15375 puDst->au16[ 4] = uSrc1.au8[ 4];
15376 puDst->au16[ 5] = uSrc1.au8[ 5];
15377 puDst->au16[ 6] = uSrc1.au8[ 6];
15378 puDst->au16[ 7] = uSrc1.au8[ 7];
15379 puDst->au16[ 8] = uSrc1.au8[ 8];
15380 puDst->au16[ 9] = uSrc1.au8[ 9];
15381 puDst->au16[10] = uSrc1.au8[10];
15382 puDst->au16[11] = uSrc1.au8[11];
15383 puDst->au16[12] = uSrc1.au8[12];
15384 puDst->au16[13] = uSrc1.au8[13];
15385 puDst->au16[14] = uSrc1.au8[14];
15386 puDst->au16[15] = uSrc1.au8[15];
15387}
15388
15389
15390/*
15391 * PMOVZXBD / VPMOVZXBD
15392 */
15393IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15394{
15395 RTUINT32U uSrc1 = { uSrc };
15396 puDst->au32[0] = uSrc1.au8[0];
15397 puDst->au32[1] = uSrc1.au8[1];
15398 puDst->au32[2] = uSrc1.au8[2];
15399 puDst->au32[3] = uSrc1.au8[3];
15400}
15401
15402
15403IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15404{
15405 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15406 puDst->au32[0] = uSrc1.au8[0];
15407 puDst->au32[1] = uSrc1.au8[1];
15408 puDst->au32[2] = uSrc1.au8[2];
15409 puDst->au32[3] = uSrc1.au8[3];
15410 puDst->au32[4] = uSrc1.au8[4];
15411 puDst->au32[5] = uSrc1.au8[5];
15412 puDst->au32[6] = uSrc1.au8[6];
15413 puDst->au32[7] = uSrc1.au8[7];
15414}
15415
15416
15417/*
15418 * PMOVZXBQ / VPMOVZXBQ
15419 */
15420IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15421{
15422 RTUINT16U uSrc1 = { uSrc };
15423 puDst->au64[0] = uSrc1.au8[0];
15424 puDst->au64[1] = uSrc1.au8[1];
15425}
15426
15427
15428IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15429{
15430 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15431 puDst->au64[0] = uSrc1.au8[0];
15432 puDst->au64[1] = uSrc1.au8[1];
15433 puDst->au64[2] = uSrc1.au8[2];
15434 puDst->au64[3] = uSrc1.au8[3];
15435}
15436
15437
15438/*
15439 * PMOVZXWD / VPMOVZXWD
15440 */
15441IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15442{
15443 RTUINT64U uSrc1 = { uSrc };
15444 puDst->au32[0] = uSrc1.au16[0];
15445 puDst->au32[1] = uSrc1.au16[1];
15446 puDst->au32[2] = uSrc1.au16[2];
15447 puDst->au32[3] = uSrc1.au16[3];
15448}
15449
15450
15451IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15452{
15453 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15454 puDst->au32[0] = uSrc1.au16[0];
15455 puDst->au32[1] = uSrc1.au16[1];
15456 puDst->au32[2] = uSrc1.au16[2];
15457 puDst->au32[3] = uSrc1.au16[3];
15458 puDst->au32[4] = uSrc1.au16[4];
15459 puDst->au32[5] = uSrc1.au16[5];
15460 puDst->au32[6] = uSrc1.au16[6];
15461 puDst->au32[7] = uSrc1.au16[7];
15462}
15463
15464
15465/*
15466 * PMOVZXWQ / VPMOVZXWQ
15467 */
15468IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15469{
15470 RTUINT32U uSrc1 = { uSrc };
15471 puDst->au64[0] = uSrc1.au16[0];
15472 puDst->au64[1] = uSrc1.au16[1];
15473}
15474
15475
15476IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15477{
15478 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15479 puDst->au64[0] = uSrc1.au16[0];
15480 puDst->au64[1] = uSrc1.au16[1];
15481 puDst->au64[2] = uSrc1.au16[2];
15482 puDst->au64[3] = uSrc1.au16[3];
15483}
15484
15485
15486/*
15487 * PMOVZXDQ / VPMOVZXDQ
15488 */
15489IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15490{
15491 RTUINT64U uSrc1 = { uSrc };
15492 puDst->au64[0] = uSrc1.au32[0];
15493 puDst->au64[1] = uSrc1.au32[1];
15494}
15495
15496
15497IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15498{
15499 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15500 puDst->au64[0] = uSrc1.au32[0];
15501 puDst->au64[1] = uSrc1.au32[1];
15502 puDst->au64[2] = uSrc1.au32[2];
15503 puDst->au64[3] = uSrc1.au32[3];
15504}
15505
15506/**
15507 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15508 * the SoftFloat 32-bit floating point format (float32_t).
15509 *
15510 * This is only a structure format conversion, nothing else.
15511 */
15512DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15513{
15514 float32_t Tmp;
15515 Tmp.v = pr32Val->u;
15516 return Tmp;
15517}
15518
15519
15520/**
15521 * Converts from SoftFloat 32-bit floating point format (float32_t)
15522 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15523 *
15524 * This is only a structure format conversion, nothing else.
15525 */
15526DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15527{
15528 pr32Dst->u = r32XSrc.v;
15529 return pr32Dst;
15530}
15531
15532
15533/**
15534 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15535 * the SoftFloat 64-bit floating point format (float64_t).
15536 *
15537 * This is only a structure format conversion, nothing else.
15538 */
15539DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15540{
15541 float64_t Tmp;
15542 Tmp.v = pr64Val->u;
15543 return Tmp;
15544}
15545
15546
15547/**
15548 * Converts from SoftFloat 64-bit floating point format (float64_t)
15549 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15550 *
15551 * This is only a structure format conversion, nothing else.
15552 */
15553DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15554{
15555 pr64Dst->u = r64XSrc.v;
15556 return pr64Dst;
15557}
15558
15559
/** Maps the MXCSR rounding control field onto the SoftFloat rounding mode;
 * anything that is not nearest, up or down is treated as round towards zero
 * (softfloat_round_minMag). */
#define IEM_SOFTFLOAT_ROUNDING_MODE_FROM_MXCSR(a_Mxcsr) \
    (  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
     : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
     : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
     :                                                           (uint8_t)softfloat_round_minMag )

/** Initializer for the SoftFloat state structure, derived from an MXCSR value.
 * @note Evaluates @a a_Mxcsr more than once. */
#define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection: */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode, taken from MXCSR.RC: */ \
        IEM_SOFTFLOAT_ROUNDING_MODE_FROM_MXCSR(a_Mxcsr), \
        /* Starts out as zero (presumably the exception flags - confirm against softfloat_state_t): */ \
        0, \
        /* The MXCSR exception mask bits shifted down, matches X86_FSW_?E: */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
15572
15573#ifdef IEM_WITHOUT_ASSEMBLY
15574
15575/**
15576 * Helper for transfering exception to MXCSR and setting the result value
15577 * accordingly.
15578 *
15579 * @returns Updated MXCSR.
15580 * @param pSoftState The SoftFloat state following the operation.
15581 * @param r32Result The result of the SoftFloat operation.
15582 * @param pr32Result Where to store the result for IEM.
15583 * @param fMxcsr The original MXCSR value.
15584 */
15585DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15586 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15587{
15588 iemFpSoftF32ToIprt(pr32Result, r32Result);
15589
15590 uint8_t fXcpt = pSoftState->exceptionFlags;
15591 if ( (fMxcsr & X86_MXCSR_FZ)
15592 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15593 {
15594 /* Underflow masked and flush to zero is set. */
15595 pr32Result->s.uFraction = 0;
15596 pr32Result->s.uExponent = 0;
15597 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15598 }
15599
15600 /* If DAZ is set \#DE is never set. */
15601 if ( fMxcsr & X86_MXCSR_DAZ
15602 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15603 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15604 fXcpt &= ~X86_MXCSR_DE;
15605
15606 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15607}
15608
15609
15610/**
15611 * Helper for transfering exception to MXCSR and setting the result value
15612 * accordingly - ignores Flush-to-Zero.
15613 *
15614 * @returns Updated MXCSR.
15615 * @param pSoftState The SoftFloat state following the operation.
15616 * @param r32Result The result of the SoftFloat operation.
15617 * @param pr32Result Where to store the result for IEM.
15618 * @param fMxcsr The original MXCSR value.
15619 */
15620DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15621 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15622{
15623 iemFpSoftF32ToIprt(pr32Result, r32Result);
15624
15625 uint8_t fXcpt = pSoftState->exceptionFlags;
15626 /* If DAZ is set \#DE is never set. */
15627 if ( fMxcsr & X86_MXCSR_DAZ
15628 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15629 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15630 fXcpt &= ~X86_MXCSR_DE;
15631
15632 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15633}
15634
15635
15636/**
15637 * Helper for transfering exception to MXCSR and setting the result value
15638 * accordingly.
15639 *
15640 * @returns Updated MXCSR.
15641 * @param pSoftState The SoftFloat state following the operation.
15642 * @param r64Result The result of the SoftFloat operation.
15643 * @param pr64Result Where to store the result for IEM.
15644 * @param fMxcsr The original MXCSR value.
15645 */
15646DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15647 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15648{
15649 iemFpSoftF64ToIprt(pr64Result, r64Result);
15650 uint8_t fXcpt = pSoftState->exceptionFlags;
15651 if ( (fMxcsr & X86_MXCSR_FZ)
15652 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15653 {
15654 /* Underflow masked and flush to zero is set. */
15655 iemFpSoftF64ToIprt(pr64Result, r64Result);
15656 pr64Result->s.uFractionHigh = 0;
15657 pr64Result->s.uFractionLow = 0;
15658 pr64Result->s.uExponent = 0;
15659 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15660 }
15661
15662 /* If DAZ is set \#DE is never set. */
15663 if ( fMxcsr & X86_MXCSR_DAZ
15664 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15665 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15666 fXcpt &= ~X86_MXCSR_DE;
15667
15668 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15669}
15670
15671
15672/**
15673 * Helper for transfering exception to MXCSR and setting the result value
15674 * accordingly - ignores Flush-to-Zero.
15675 *
15676 * @returns Updated MXCSR.
15677 * @param pSoftState The SoftFloat state following the operation.
15678 * @param r64Result The result of the SoftFloat operation.
15679 * @param pr64Result Where to store the result for IEM.
15680 * @param fMxcsr The original MXCSR value.
15681 */
15682DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15683 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15684{
15685 iemFpSoftF64ToIprt(pr64Result, r64Result);
15686
15687 uint8_t fXcpt = pSoftState->exceptionFlags;
15688 /* If DAZ is set \#DE is never set. */
15689 if ( fMxcsr & X86_MXCSR_DAZ
15690 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15691 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15692 fXcpt &= ~X86_MXCSR_DE;
15693
15694 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15695}
15696
15697#endif /* IEM_WITHOUT_ASSEMBLY */
15698
15699
15700/**
15701 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15702 * in MXCSR into account.
15703 *
15704 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15705 * @param pr32Val Where to store the result.
15706 * @param fMxcsr The input MXCSR value.
15707 * @param pr32Src The value to use.
15708 */
15709DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15710{
15711 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15712 {
15713 if (fMxcsr & X86_MXCSR_DAZ)
15714 {
15715 /* De-normals are changed to 0. */
15716 pr32Val->s.fSign = pr32Src->s.fSign;
15717 pr32Val->s.uFraction = 0;
15718 pr32Val->s.uExponent = 0;
15719 return 0;
15720 }
15721
15722 *pr32Val = *pr32Src;
15723 return X86_MXCSR_DE;
15724 }
15725
15726 *pr32Val = *pr32Src;
15727 return 0;
15728}
15729
15730
15731/**
15732 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15733 * in MXCSR into account.
15734 *
15735 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15736 * @param pr64Val Where to store the result.
15737 * @param fMxcsr The input MXCSR value.
15738 * @param pr64Src The value to use.
15739 */
15740DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15741{
15742 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15743 {
15744 if (fMxcsr & X86_MXCSR_DAZ)
15745 {
15746 /* De-normals are changed to 0. */
15747 pr64Val->s64.fSign = pr64Src->s.fSign;
15748 pr64Val->s64.uFraction = 0;
15749 pr64Val->s64.uExponent = 0;
15750 return 0;
15751 }
15752
15753 *pr64Val = *pr64Src;
15754 return X86_MXCSR_DE;
15755 }
15756
15757 *pr64Val = *pr64Src;
15758 return 0;
15759}
15760
15761#ifdef IEM_WITHOUT_ASSEMBLY
15762
15763/**
15764 * Validates the given input operands returning whether the operation can continue or whether one
15765 * of the source operands contains a NaN value, setting the output accordingly.
15766 *
15767 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15768 * @param pr32Res Where to store the result in case the operation can't continue.
15769 * @param pr32Val1 The first input operand.
15770 * @param pr32Val2 The second input operand.
15771 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15772 */
15773DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15774{
15775 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15776 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15777 if (cSNan + cQNan == 2)
15778 {
15779 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15780 *pr32Res = *pr32Val1;
15781 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15782 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15783 return true;
15784 }
15785 if (cSNan)
15786 {
15787 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15788 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15789 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15790 *pfMxcsr |= X86_MXCSR_IE;
15791 return true;
15792 }
15793 if (cQNan)
15794 {
15795 /* The QNan operand is placed into the result. */
15796 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15797 return true;
15798 }
15799
15800 Assert(!cQNan && !cSNan);
15801 return false;
15802}
15803
15804
15805/**
15806 * Validates the given double precision input operands returning whether the operation can continue or whether one
15807 * of the source operands contains a NaN value, setting the output accordingly.
15808 *
15809 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15810 * @param pr64Res Where to store the result in case the operation can't continue.
15811 * @param pr64Val1 The first input operand.
15812 * @param pr64Val2 The second input operand.
15813 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15814 */
15815DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15816{
15817 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15818 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15819 if (cSNan + cQNan == 2)
15820 {
15821 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15822 *pr64Res = *pr64Val1;
15823 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15824 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15825 return true;
15826 }
15827 if (cSNan)
15828 {
15829 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15830 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15831 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15832 *pfMxcsr |= X86_MXCSR_IE;
15833 return true;
15834 }
15835 if (cQNan)
15836 {
15837 /* The QNan operand is placed into the result. */
15838 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15839 return true;
15840 }
15841
15842 Assert(!cQNan && !cSNan);
15843 return false;
15844}
15845
15846
15847/**
15848 * Validates the given single input operand returning whether the operation can continue or whether
15849 * contains a NaN value, setting the output accordingly.
15850 *
15851 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15852 * @param pr32Res Where to store the result in case the operation can't continue.
15853 * @param pr32Val The input operand.
15854 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15855 */
15856DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15857{
15858 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15859 {
15860 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15861 *pr32Res = *pr32Val;
15862 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15863 *pfMxcsr |= X86_MXCSR_IE;
15864 return true;
15865 }
15866 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15867 {
15868 /* The QNan operand is placed into the result. */
15869 *pr32Res = *pr32Val;
15870 return true;
15871 }
15872
15873 return false;
15874}
15875
15876
15877/**
15878 * Validates the given double input operand returning whether the operation can continue or whether
15879 * contains a NaN value, setting the output accordingly.
15880 *
15881 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15882 * @param pr64Res Where to store the result in case the operation can't continue.
15883 * @param pr64Val The input operand.
15884 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15885 */
15886DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15887{
15888 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15889 {
15890 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15891 *pr64Res = *pr64Val;
15892 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15893 *pfMxcsr |= X86_MXCSR_IE;
15894 return true;
15895 }
15896 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15897 {
15898 /* The QNan operand is placed into the result. */
15899 *pr64Res = *pr64Val;
15900 return true;
15901 }
15902
15903 return false;
15904}
15905
15906#endif /* IEM_WITHOUT_ASSEMBLY */
15907
15908/**
15909 * ADDPS
15910 */
15911#ifdef IEM_WITHOUT_ASSEMBLY
15912static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15913{
15914 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15915 return fMxcsr;
15916
15917 RTFLOAT32U r32Src1, r32Src2;
15918 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15919 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15920 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15921 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15922 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15923}
15924
15925
15926IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15927{
15928 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15929 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15930 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15931 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15932}
15933#endif
15934
15935
15936/**
15937 * ADDSS
15938 */
15939#ifdef IEM_WITHOUT_ASSEMBLY
15940IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15941{
15942 pResult->ar32[1] = puSrc1->ar32[1];
15943 pResult->ar32[2] = puSrc1->ar32[2];
15944 pResult->ar32[3] = puSrc1->ar32[3];
15945 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15946}
15947#endif
15948
15949
15950/**
15951 * ADDPD
15952 */
15953#ifdef IEM_WITHOUT_ASSEMBLY
15954static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15955{
15956 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15957 return fMxcsr;
15958
15959 RTFLOAT64U r64Src1, r64Src2;
15960 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15961 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15962 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15963 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15964 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15965}
15966
15967
15968IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15969{
15970 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15971 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15972}
15973#endif
15974
15975
15976/**
15977 * ADDSD
15978 */
15979#ifdef IEM_WITHOUT_ASSEMBLY
15980IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15981{
15982 pResult->ar64[1] = puSrc1->ar64[1];
15983 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15984}
15985#endif
15986
15987
15988/**
15989 * MULPS
15990 */
15991#ifdef IEM_WITHOUT_ASSEMBLY
15992static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15993{
15994 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15995 return fMxcsr;
15996
15997 RTFLOAT32U r32Src1, r32Src2;
15998 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15999 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16000 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16001 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16002 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16003}
16004
16005
16006IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16007{
16008 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16009 | iemAImpl_mulps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16010 | iemAImpl_mulps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16011 | iemAImpl_mulps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16012}
16013#endif
16014
16015
16016/**
16017 * MULSS
16018 */
16019#ifdef IEM_WITHOUT_ASSEMBLY
16020IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16021{
16022 pResult->ar32[1] = puSrc1->ar32[1];
16023 pResult->ar32[2] = puSrc1->ar32[2];
16024 pResult->ar32[3] = puSrc1->ar32[3];
16025 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16026}
16027#endif
16028
16029
16030/**
16031 * MULPD
16032 */
16033#ifdef IEM_WITHOUT_ASSEMBLY
16034static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16035{
16036 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16037 return fMxcsr;
16038
16039 RTFLOAT64U r64Src1, r64Src2;
16040 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16041 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16042 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16043 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16044 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16045}
16046
16047
16048IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16049{
16050 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16051 | iemAImpl_mulpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16052}
16053#endif
16054
16055
16056/**
16057 * MULSD
16058 */
16059#ifdef IEM_WITHOUT_ASSEMBLY
16060IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16061{
16062 pResult->ar64[1] = puSrc1->ar64[1];
16063 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16064}
16065#endif
16066
16067
16068/**
16069 * SUBPS
16070 */
16071#ifdef IEM_WITHOUT_ASSEMBLY
16072static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16073{
16074 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16075 return fMxcsr;
16076
16077 RTFLOAT32U r32Src1, r32Src2;
16078 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16079 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16080 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16081 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16082 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16083}
16084
16085
16086IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16087{
16088 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16089 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16090 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16091 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16092}
16093#endif
16094
16095
16096/**
16097 * SUBSS
16098 */
16099#ifdef IEM_WITHOUT_ASSEMBLY
16100IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16101{
16102 pResult->ar32[1] = puSrc1->ar32[1];
16103 pResult->ar32[2] = puSrc1->ar32[2];
16104 pResult->ar32[3] = puSrc1->ar32[3];
16105 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16106}
16107#endif
16108
16109
16110/**
16111 * SUBPD
16112 */
16113#ifdef IEM_WITHOUT_ASSEMBLY
16114static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16115{
16116 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16117 return fMxcsr;
16118
16119 RTFLOAT64U r64Src1, r64Src2;
16120 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16121 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16122 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16123 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16124 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16125}
16126
16127
16128IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16129{
16130 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16131 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16132}
16133#endif
16134
16135
16136/**
16137 * SUBSD
16138 */
16139#ifdef IEM_WITHOUT_ASSEMBLY
16140IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16141{
16142 pResult->ar64[1] = puSrc1->ar64[1];
16143 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16144}
16145#endif
16146
16147
16148/**
16149 * MINPS
16150 */
16151#ifdef IEM_WITHOUT_ASSEMBLY
16152static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16153{
16154 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16155 {
16156 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16157 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16158 return fMxcsr | X86_MXCSR_IE;
16159 }
16160
16161 RTFLOAT32U r32Src1, r32Src2;
16162 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16163 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16164 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16165 {
16166 *pr32Res = r32Src2;
16167 return fMxcsr;
16168 }
16169
16170 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16171 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16172 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16173 fLe
16174 ? iemFpSoftF32FromIprt(&r32Src1)
16175 : iemFpSoftF32FromIprt(&r32Src2),
16176 pr32Res, fMxcsr);
16177}
16178
16179
16180IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16181{
16182 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16183 | iemAImpl_minps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16184 | iemAImpl_minps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16185 | iemAImpl_minps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16186}
16187#endif
16188
16189
16190/**
16191 * MINSS
16192 */
16193#ifdef IEM_WITHOUT_ASSEMBLY
16194IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16195{
16196 pResult->ar32[1] = puSrc1->ar32[1];
16197 pResult->ar32[2] = puSrc1->ar32[2];
16198 pResult->ar32[3] = puSrc1->ar32[3];
16199 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16200}
16201#endif
16202
16203
16204/**
16205 * MINPD
16206 */
16207#ifdef IEM_WITHOUT_ASSEMBLY
16208static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16209{
16210 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16211 {
16212 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16213 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16214 return fMxcsr | X86_MXCSR_IE;
16215 }
16216
16217 RTFLOAT64U r64Src1, r64Src2;
16218 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16219 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16220 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16221 {
16222 *pr64Res = r64Src2;
16223 return fMxcsr;
16224 }
16225
16226 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16227 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16228 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16229 fLe
16230 ? iemFpSoftF64FromIprt(&r64Src1)
16231 : iemFpSoftF64FromIprt(&r64Src2),
16232 pr64Res, fMxcsr);
16233}
16234
16235
16236IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16237{
16238 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16239 | iemAImpl_minpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16240}
16241#endif
16242
16243
16244/**
16245 * MINSD
16246 */
16247#ifdef IEM_WITHOUT_ASSEMBLY
16248IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16249{
16250 pResult->ar64[1] = puSrc1->ar64[1];
16251 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16252}
16253#endif
16254
16255
16256/**
16257 * DIVPS
16258 */
16259#ifdef IEM_WITHOUT_ASSEMBLY
16260static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16261{
16262 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16263 return fMxcsr;
16264
16265 RTFLOAT32U r32Src1, r32Src2;
16266 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16267 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16268 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16269 {
16270 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16271 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16272 {
16273 *pr32Res = g_ar32QNaN[1];
16274 return fMxcsr | X86_MXCSR_IE;
16275 }
16276 else if (RTFLOAT32U_IS_INF(&r32Src1))
16277 {
16278 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16279 return fMxcsr;
16280 }
16281 else
16282 {
16283 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16284 return fMxcsr | X86_MXCSR_ZE;
16285 }
16286 }
16287
16288 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16289 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16290 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16291}
16292
16293
16294IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16295{
16296 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16297 | iemAImpl_divps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16298 | iemAImpl_divps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16299 | iemAImpl_divps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16300}
16301#endif
16302
16303
16304/**
16305 * DIVSS
16306 */
16307#ifdef IEM_WITHOUT_ASSEMBLY
16308IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16309{
16310 pResult->ar32[1] = puSrc1->ar32[1];
16311 pResult->ar32[2] = puSrc1->ar32[2];
16312 pResult->ar32[3] = puSrc1->ar32[3];
16313 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16314}
16315#endif
16316
16317
16318/**
16319 * DIVPD
16320 */
16321#ifdef IEM_WITHOUT_ASSEMBLY
16322static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16323{
16324 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16325 return fMxcsr;
16326
16327 RTFLOAT64U r64Src1, r64Src2;
16328 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16329 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16330 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16331 {
16332 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16333 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16334 {
16335 *pr64Res = g_ar64QNaN[1];
16336 return fMxcsr | X86_MXCSR_IE;
16337 }
16338 else if (RTFLOAT64U_IS_INF(&r64Src1))
16339 {
16340 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16341 return fMxcsr;
16342 }
16343 else
16344 {
16345 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16346 return fMxcsr | X86_MXCSR_ZE;
16347 }
16348 }
16349
16350 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16351 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16352 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16353}
16354
16355
16356IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16357{
16358 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16359 | iemAImpl_divpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16360}
16361#endif
16362
16363
16364/**
16365 * DIVSD
16366 */
16367#ifdef IEM_WITHOUT_ASSEMBLY
16368IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16369{
16370 pResult->ar64[1] = puSrc1->ar64[1];
16371 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16372}
16373#endif
16374
16375
16376/**
16377 * MAXPS
16378 */
16379#ifdef IEM_WITHOUT_ASSEMBLY
16380static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16381{
16382 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16383 {
16384 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16385 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16386 return fMxcsr | X86_MXCSR_IE;
16387 }
16388
16389 RTFLOAT32U r32Src1, r32Src2;
16390 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16391 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16392 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16393 {
16394 *pr32Res = r32Src2;
16395 return fMxcsr;
16396 }
16397
16398 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16399 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16400 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16401 fLe
16402 ? iemFpSoftF32FromIprt(&r32Src2)
16403 : iemFpSoftF32FromIprt(&r32Src1),
16404 pr32Res, fMxcsr);
16405}
16406
16407
16408IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16409{
16410 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16411 | iemAImpl_maxps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16412 | iemAImpl_maxps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16413 | iemAImpl_maxps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16414}
16415#endif
16416
16417
16418/**
16419 * MAXSS
16420 */
16421#ifdef IEM_WITHOUT_ASSEMBLY
16422IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16423{
16424 pResult->ar32[1] = puSrc1->ar32[1];
16425 pResult->ar32[2] = puSrc1->ar32[2];
16426 pResult->ar32[3] = puSrc1->ar32[3];
16427 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16428}
16429#endif
16430
16431
16432/**
16433 * MAXPD
16434 */
16435#ifdef IEM_WITHOUT_ASSEMBLY
16436static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16437{
16438 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16439 {
16440 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16441 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16442 return fMxcsr | X86_MXCSR_IE;
16443 }
16444
16445 RTFLOAT64U r64Src1, r64Src2;
16446 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16447 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16448 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16449 {
16450 *pr64Res = r64Src2;
16451 return fMxcsr;
16452 }
16453
16454 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16455 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16456 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16457 fLe
16458 ? iemFpSoftF64FromIprt(&r64Src2)
16459 : iemFpSoftF64FromIprt(&r64Src1),
16460 pr64Res, fMxcsr);
16461}
16462
16463
16464IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16465{
16466 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16467 | iemAImpl_maxpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16468}
16469#endif
16470
16471
16472/**
16473 * MAXSD
16474 */
16475#ifdef IEM_WITHOUT_ASSEMBLY
16476IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16477{
16478 pResult->ar64[1] = puSrc1->ar64[1];
16479 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16480}
16481#endif
16482
16483
16484/**
16485 * CVTSS2SD
16486 */
16487#ifdef IEM_WITHOUT_ASSEMBLY
16488static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16489{
16490 RTFLOAT32U r32Src1;
16491 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16492
16493 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16494 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16495 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16496}
16497
16498
16499IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2sd_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16500{
16501 pResult->ar64[1] = puSrc1->ar64[1];
16502 return iemAImpl_cvtss2sd_u128_r32_worker(&pResult->ar64[0], uMxCsrIn, pr32Src2);
16503}
16504#endif
16505
16506
16507/**
16508 * CVTSD2SS
16509 */
16510#ifdef IEM_WITHOUT_ASSEMBLY
16511static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16512{
16513 RTFLOAT64U r64Src1;
16514 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16515
16516 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16517 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16518 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16519}
16520
16521
16522IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2ss_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16523{
16524 pResult->ar32[1] = puSrc1->ar32[1];
16525 pResult->ar32[2] = puSrc1->ar32[2];
16526 pResult->ar32[3] = puSrc1->ar32[3];
16527 return iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->ar32[0], uMxCsrIn, pr64Src2);
16528}
16529#endif
16530
16531
16532/**
16533 * HADDPS
16534 */
16535#ifdef IEM_WITHOUT_ASSEMBLY
16536IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16537{
16538 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16539 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16540 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16541 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16542}
16543#endif
16544
16545
16546/**
16547 * HADDPD
16548 */
16549#ifdef IEM_WITHOUT_ASSEMBLY
16550IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16551{
16552 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16553 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16554}
16555#endif
16556
16557
16558/**
16559 * HSUBPS
16560 */
16561#ifdef IEM_WITHOUT_ASSEMBLY
16562IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16563{
16564 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16565 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16566 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16567 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16568}
16569#endif
16570
16571
16572/**
16573 * HSUBPD
16574 */
16575#ifdef IEM_WITHOUT_ASSEMBLY
16576IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16577{
16578 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16579 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16580}
16581#endif
16582
16583
16584/**
16585 * SQRTPS
16586 */
16587#ifdef IEM_WITHOUT_ASSEMBLY
16588static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16589{
16590 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16591 return fMxcsr;
16592
16593 RTFLOAT32U r32Src;
16594 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16595 if (RTFLOAT32U_IS_ZERO(&r32Src))
16596 {
16597 *pr32Res = r32Src;
16598 return fMxcsr;
16599 }
16600 else if (r32Src.s.fSign)
16601 {
16602 *pr32Res = g_ar32QNaN[1];
16603 return fMxcsr | X86_MXCSR_IE;
16604 }
16605
16606 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16607 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16608 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16609}
16610
16611
16612IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16613{
16614 RT_NOREF(puSrc1);
16615
16616 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16617 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16618 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16619 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16620}
16621#endif
16622
16623
16624/**
16625 * SQRTSS
16626 */
16627#ifdef IEM_WITHOUT_ASSEMBLY
16628IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16629{
16630 pResult->ar32[1] = puSrc1->ar32[1];
16631 pResult->ar32[2] = puSrc1->ar32[2];
16632 pResult->ar32[3] = puSrc1->ar32[3];
16633 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16634}
16635#endif
16636
16637
16638/**
16639 * SQRTPD
16640 */
16641#ifdef IEM_WITHOUT_ASSEMBLY
16642static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16643{
16644 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16645 return fMxcsr;
16646
16647 RTFLOAT64U r64Src;
16648 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16649 if (RTFLOAT64U_IS_ZERO(&r64Src))
16650 {
16651 *pr64Res = r64Src;
16652 return fMxcsr;
16653 }
16654 else if (r64Src.s.fSign)
16655 {
16656 *pr64Res = g_ar64QNaN[1];
16657 return fMxcsr | X86_MXCSR_IE;
16658 }
16659
16660 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16661 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16662 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16663}
16664
16665
16666IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16667{
16668 RT_NOREF(puSrc1);
16669
16670 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar64[0])
16671 | iemAImpl_sqrtpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[1]);
16672}
16673#endif
16674
16675
16676/**
16677 * SQRTSD
16678 */
16679#ifdef IEM_WITHOUT_ASSEMBLY
16680IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16681{
16682 pResult->ar64[1] = puSrc1->ar64[1];
16683 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, pr64Src2);
16684}
16685#endif
16686
16687
16688#ifdef IEM_WITHOUT_ASSEMBLY
16689/**
16690 * RSQRTPS
16691 */
16692static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16693{
16694 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16695 return fMxcsr;
16696
16697 RTFLOAT32U r32Src;
16698 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16699 if (RTFLOAT32U_IS_ZERO(&r32Src))
16700 {
16701 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16702 return fMxcsr;
16703 }
16704 else if (r32Src.s.fSign)
16705 {
16706 *pr32Res = g_ar32QNaN[1];
16707 return fMxcsr | X86_MXCSR_IE;
16708 }
16709
16710 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16711 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16712 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16713}
16714
16715
16716IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16717{
16718 RT_NOREF(puSrc1);
16719
16720 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16721 | iemAImpl_rsqrt_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16722 | iemAImpl_rsqrt_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16723 | iemAImpl_rsqrt_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16724}
16725
16726
16727/**
16728 * RSQRTSS
16729 */
16730IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16731{
16732 pResult->ar32[1] = puSrc1->ar32[1];
16733 pResult->ar32[2] = puSrc1->ar32[2];
16734 pResult->ar32[3] = puSrc1->ar32[3];
16735 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16736}
16737#endif
16738
16739
16740/**
16741 * RCPPS
16742 */
16743#ifdef IEM_WITHOUT_ASSEMBLY
16744static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16745{
16746 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16747 return fMxcsr;
16748
16749 RTFLOAT32U r32Src;
16750 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16751 if (RTFLOAT32U_IS_ZERO(&r32Src))
16752 {
16753 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16754 return fMxcsr;
16755 }
16756
16757 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16758 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16759 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16760}
16761
16762
16763IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16764{
16765 RT_NOREF(puSrc1);
16766
16767 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16768 | iemAImpl_rcp_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16769 | iemAImpl_rcp_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16770 | iemAImpl_rcp_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16771}
16772
16773
16774/**
16775 * RCPSS
16776 */
16777IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16778{
16779 pResult->ar32[1] = puSrc1->ar32[1];
16780 pResult->ar32[2] = puSrc1->ar32[2];
16781 pResult->ar32[3] = puSrc1->ar32[3];
16782 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16783}
16784#endif
16785
16786
16787/**
16788 * ADDSUBPS
16789 */
16790#ifdef IEM_WITHOUT_ASSEMBLY
16791IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16792{
16793 RT_NOREF(puSrc1);
16794
16795 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16796 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16797 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16798 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16799}
16800#endif
16801
16802
16803/**
16804 * ADDSUBPD
16805 */
16806#ifdef IEM_WITHOUT_ASSEMBLY
16807IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16808{
16809 RT_NOREF(puSrc1);
16810
16811 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16812 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16813}
16814#endif
16815
16816
16817/**
16818 * CVTPD2PS
16819 */
16820#ifdef IEM_WITHOUT_ASSEMBLY
16821static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16822{
16823 RTFLOAT64U r64Src1;
16824 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16825
16826 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16827 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16828 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16829}
16830
16831
16832IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16833{
16834 RT_NOREF(puSrc1);
16835
16836 pResult->au32[2] = 0;
16837 pResult->au32[3] = 0;
16838 return iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar64[0])
16839 | iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar64[1]);
16840}
16841#endif
16842
16843
16844/**
16845 * CVTPS2PD
16846 */
16847#ifdef IEM_WITHOUT_ASSEMBLY
16848static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16849{
16850 RTFLOAT32U r32Src1;
16851 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16852
16853 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16854 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16855 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16856}
16857
16858
16859IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16860{
16861 RT_NOREF(puSrc1);
16862
16863 return iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar32[0])
16864 | iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar32[1]);
16865}
16866#endif
16867
16868
16869/**
16870 * CVTDQ2PS
16871 */
16872#ifdef IEM_WITHOUT_ASSEMBLY
16873static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16874{
16875 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16876 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16877 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16878}
16879
16880
16881IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16882{
16883 RT_NOREF(puSrc1);
16884
16885 return iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, puSrc2->ai32[0])
16886 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, puSrc2->ai32[1])
16887 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[2], uMxCsrIn, puSrc2->ai32[2])
16888 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[3], uMxCsrIn, puSrc2->ai32[3]);
16889}
16890#endif
16891
16892
16893/**
16894 * CVTPS2DQ
16895 */
16896#ifdef IEM_WITHOUT_ASSEMBLY
16897static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16898{
16899 RTFLOAT32U r32Src;
16900 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16901
16902 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16903 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16904 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16905}
16906
16907
16908IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16909{
16910 RT_NOREF(puSrc1);
16911
16912 return iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16913 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16914 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16915 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16916}
16917#endif
16918
16919
16920/**
16921 * CVTTPS2DQ
16922 */
16923#ifdef IEM_WITHOUT_ASSEMBLY
16924static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16925{
16926 RTFLOAT32U r32Src;
16927 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16928
16929 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16930 SoftState.roundingMode = softfloat_round_minMag;
16931 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16932 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16933}
16934
16935
16936IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16937{
16938 RT_NOREF(puSrc1);
16939
16940 return iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16941 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16942 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16943 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16944}
16945#endif
16946
16947
16948/**
16949 * CVTTPD2DQ
16950 */
16951#ifdef IEM_WITHOUT_ASSEMBLY
16952static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16953{
16954 RTFLOAT64U r64Src;
16955 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16956
16957 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16958 SoftState.roundingMode = softfloat_round_minMag;
16959 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16960 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16961}
16962
16963
16964IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16965{
16966 RT_NOREF(puSrc1);
16967
16968 pResult->au64[1] = 0;
16969 return iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16970 | iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16971}
16972#endif
16973
16974
16975/**
16976 * CVTDQ2PD
16977 */
16978#ifdef IEM_WITHOUT_ASSEMBLY
16979static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16980{
16981 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16982 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16983 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16984}
16985
16986
16987IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16988{
16989 RT_NOREF(puSrc1);
16990
16991 return iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, puSrc2->ai32[0])
16992 | iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, puSrc2->ai32[1]);
16993}
16994#endif
16995
16996
16997/**
16998 * CVTPD2DQ
16999 */
17000#ifdef IEM_WITHOUT_ASSEMBLY
17001static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
17002{
17003 RTFLOAT64U r64Src;
17004 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
17005
17006 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17007 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17008 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17009}
17010
17011
17012IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17013{
17014 RT_NOREF(puSrc1);
17015
17016 pResult->au64[1] = 0;
17017 return iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
17018 | iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
17019}
17020#endif
17021
17022
17023/**
17024 * [V]SHUFPS
17025 */
17026#ifdef IEM_WITHOUT_ASSEMBLY
17027IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17028{
17029 RTUINT128U const uSrc1 = *puDst;
17030 RTUINT128U const uSrc2 = *puSrc;
17031 ASMCompilerBarrier();
17032 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17033 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17034 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17035 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17036}
17037#endif
17038
17039
17040IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17041{
17042 RTUINT128U const uSrc1 = *puSrc1;
17043 RTUINT128U const uSrc2 = *puSrc2;
17044 ASMCompilerBarrier();
17045 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17046 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17047 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17048 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17049}
17050
17051
17052IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17053{
17054 RTUINT256U const uSrc1 = *puSrc1;
17055 RTUINT256U const uSrc2 = *puSrc2;
17056 ASMCompilerBarrier();
17057 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17058 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17059 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17060 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17061
17062 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
17063 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
17064 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
17065 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
17066}
17067
17068
17069/**
17070 * [V]SHUFPD
17071 */
17072#ifdef IEM_WITHOUT_ASSEMBLY
17073IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17074{
17075 RTUINT128U const uSrc1 = *puDst;
17076 RTUINT128U const uSrc2 = *puSrc;
17077 ASMCompilerBarrier();
17078 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17079 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17080}
17081#endif
17082
17083
17084IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17085{
17086 RTUINT128U const uSrc1 = *puSrc1;
17087 RTUINT128U const uSrc2 = *puSrc2;
17088 ASMCompilerBarrier();
17089 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17090 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17091}
17092
17093
17094IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17095{
17096 RTUINT256U const uSrc1 = *puSrc1;
17097 RTUINT256U const uSrc2 = *puSrc2;
17098 ASMCompilerBarrier();
17099 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17100 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17101 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
17102 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
17103}
17104
17105
17106/*
17107 * PHMINPOSUW / VPHMINPOSUW
17108 */
17109IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17110{
17111 uint16_t u16Min = puSrc->au16[0];
17112 uint8_t idxMin = 0;
17113
17114 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
17115 if (puSrc->au16[i] < u16Min)
17116 {
17117 u16Min = puSrc->au16[i];
17118 idxMin = i;
17119 }
17120
17121 puDst->au64[0] = 0;
17122 puDst->au64[1] = 0;
17123 puDst->au16[0] = u16Min;
17124 puDst->au16[1] = idxMin;
17125}
17126
17127
17128IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17129{
17130 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
17131}
17132
17133
17134/**
17135 * VPERMILPS
17136 */
17137#ifdef IEM_WITHOUT_ASSEMBLY
17138IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17139{
17140 RTUINT128U const uSrc = *puSrc;
17141 ASMCompilerBarrier();
17142
17143 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17144 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17145 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17146 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17147}
17148
17149
17150IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17151{
17152 RTUINT256U const uSrc = *puSrc;
17153 ASMCompilerBarrier();
17154
17155 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17156 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17157 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17158 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17159
17160 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17161 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17162 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17163 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17164}
17165
17166IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17167{
17168 RTUINT128U const uSrc1 = *puSrc1;
17169 RTUINT128U const uSrc2 = *puSrc2;
17170 ASMCompilerBarrier();
17171
17172 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17173 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17174 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17175 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17176}
17177
17178IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17179{
17180 RTUINT256U const uSrc1 = *puSrc1;
17181 RTUINT256U const uSrc2 = *puSrc2;
17182 ASMCompilerBarrier();
17183
17184 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17185 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17186 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17187 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17188
17189 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17190 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17191 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17192 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17193}
17194#endif
17195
17196
17197IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17198{
17199 RTUINT128U const uSrc = *puSrc;
17200 ASMCompilerBarrier();
17201
17202 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17203 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17204 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17205 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17206}
17207
17208
17209IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17210{
17211 RTUINT256U const uSrc = *puSrc;
17212 ASMCompilerBarrier();
17213
17214 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17215 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17216 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17217 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17218
17219 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17220 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17221 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17222 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17223}
17224
17225IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17226{
17227 RTUINT128U const uSrc1 = *puSrc1;
17228 RTUINT128U const uSrc2 = *puSrc2;
17229 ASMCompilerBarrier();
17230
17231 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17232 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17233 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17234 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17235}
17236
17237IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17238{
17239 RTUINT256U const uSrc1 = *puSrc1;
17240 RTUINT256U const uSrc2 = *puSrc2;
17241 ASMCompilerBarrier();
17242
17243 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17244 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17245 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17246 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17247
17248 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17249 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17250 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17251 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17252}
17253
17254
17255/**
17256 * VPERMILPD
17257 */
17258#ifdef IEM_WITHOUT_ASSEMBLY
17259IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17260{
17261 RTUINT128U const uSrc = *puSrc;
17262 ASMCompilerBarrier();
17263
17264 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17265 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17266}
17267
17268
17269IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17270{
17271 RTUINT256U const uSrc = *puSrc;
17272 ASMCompilerBarrier();
17273
17274 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17275 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17276
17277 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17278 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17279}
17280
17281IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17282{
17283 RTUINT128U const uSrc1 = *puSrc1;
17284 RTUINT128U const uSrc2 = *puSrc2;
17285 ASMCompilerBarrier();
17286
17287 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17288 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17289}
17290
17291IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17292{
17293 RTUINT256U const uSrc1 = *puSrc1;
17294 RTUINT256U const uSrc2 = *puSrc2;
17295 ASMCompilerBarrier();
17296
17297 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17298 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17299
17300 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17301 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17302}
17303#endif
17304
17305
17306IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17307{
17308 RTUINT128U const uSrc = *puSrc;
17309 ASMCompilerBarrier();
17310
17311 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17312 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17313}
17314
17315
17316IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17317{
17318 RTUINT256U const uSrc = *puSrc;
17319 ASMCompilerBarrier();
17320
17321 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17322 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17323
17324 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17325 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17326}
17327
17328IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17329{
17330 RTUINT128U const uSrc1 = *puSrc1;
17331 RTUINT128U const uSrc2 = *puSrc2;
17332 ASMCompilerBarrier();
17333
17334 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17335 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17336}
17337
17338IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17339{
17340 RTUINT256U const uSrc1 = *puSrc1;
17341 RTUINT256U const uSrc2 = *puSrc2;
17342 ASMCompilerBarrier();
17343
17344 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17345 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17346
17347 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17348 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17349}
17350
17351
17352/*
17353 * [V]PBLENDVB
17354 */
17355IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17356{
17357 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17358 if (puMask->au8[i] & RT_BIT(7))
17359 puDst->au8[i] = puSrc->au8[i];
17360}
17361
17362
17363IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17364{
17365 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17366 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17367}
17368
17369
17370IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17371{
17372 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17373 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17374}
17375
17376
17377/*
17378 * [V]BLENDVPS
17379 */
17380IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17381{
17382 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17383 if (puMask->au32[i] & RT_BIT_32(31))
17384 puDst->au32[i] = puSrc->au32[i];
17385}
17386
17387
17388IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17389{
17390 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17391 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17392}
17393
17394
17395IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17396{
17397 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17398 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17399}
17400
17401
17402/*
17403 * [V]BLENDVPD
17404 */
17405IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17406{
17407 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17408 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17409}
17410
17411
17412IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17413{
17414 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17415 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17416}
17417
17418
17419IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17420{
17421 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17422 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17423}
17424
17425
17426/**
17427 * [V]PALIGNR
17428 */
17429IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17430{
17431 uint64_t const u64Src1 = *pu64Dst;
17432 ASMCompilerBarrier();
17433
17434 if (bEvil >= 16)
17435 *pu64Dst = 0;
17436 else if (bEvil >= 8)
17437 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17438 else
17439 {
17440 uint8_t cShift = bEvil * 8;
17441 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17442 | (u64Src2 >> cShift);
17443 }
17444}
17445
17446
17447IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17448{
17449 RTUINT128U const uSrc1 = *puDst;
17450 RTUINT128U const uSrc2 = *puSrc;
17451 ASMCompilerBarrier();
17452
17453 puDst->au64[0] = 0;
17454 puDst->au64[1] = 0;
17455 if (bEvil >= 32)
17456 { /* Everything stays 0. */ }
17457 else if (bEvil >= 16)
17458 {
17459 bEvil -= 16;
17460 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17461 puDst->au8[i - bEvil] = uSrc1.au8[i];
17462 }
17463 else
17464 {
17465 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17466 puDst->au8[i] = uSrc2.au8[i + bEvil];
17467 for (uint8_t i = 0; i < bEvil; i++)
17468 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17469 }
17470}
17471
17472
17473IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17474{
17475 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17476 RTUINT128U const uSrc2 = *puSrc2;
17477 ASMCompilerBarrier();
17478
17479 puDst->au64[0] = 0;
17480 puDst->au64[1] = 0;
17481 if (bEvil >= 32)
17482 { /* Everything stays 0. */ }
17483 else if (bEvil >= 16)
17484 {
17485 bEvil -= 16;
17486 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17487 puDst->au8[i - bEvil] = uSrc1.au8[i];
17488 }
17489 else
17490 {
17491 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17492 puDst->au8[i] = uSrc2.au8[i + bEvil];
17493 for (uint8_t i = 0; i < bEvil; i++)
17494 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17495 }
17496}
17497
17498
17499IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17500{
17501 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17502 RTUINT256U const uSrc2 = *puSrc2;
17503 ASMCompilerBarrier();
17504
17505 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17506 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17507}
17508
17509
17510/**
17511 * [V]PBLENDW
17512 */
17513IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17514{
17515 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17516 if (bEvil & RT_BIT(i))
17517 puDst->au16[i] = puSrc->au16[i];
17518}
17519
17520
17521IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17522{
17523 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17524 if (bEvil & RT_BIT(i))
17525 puDst->au16[i] = puSrc2->au16[i];
17526 else
17527 puDst->au16[i] = puSrc1->au16[i];
17528}
17529
17530
17531IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17532{
17533 for (uint8_t i = 0; i < 8; i++)
17534 if (bEvil & RT_BIT(i))
17535 {
17536 puDst->au16[ i] = puSrc2->au16[ i];
17537 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17538 }
17539 else
17540 {
17541 puDst->au16[ i] = puSrc1->au16[ i];
17542 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17543 }
17544}
17545
17546
17547/**
17548 * [V]PBLENDD
17549 */
17550IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17551{
17552 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17553 if (bEvil & RT_BIT(i))
17554 puDst->au32[i] = puSrc2->au32[i];
17555 else
17556 puDst->au32[i] = puSrc1->au32[i];
17557}
17558
17559
17560IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17561{
17562 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17563 if (bEvil & RT_BIT(i))
17564 puDst->au32[i] = puSrc2->au32[i];
17565 else
17566 puDst->au32[i] = puSrc1->au32[i];
17567}
17568
17569
17570/**
17571 * [V]BLENDPS
17572 */
17573IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17574{
17575 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17576 if (bEvil & RT_BIT(i))
17577 puDst->au32[i] = puSrc->au32[i];
17578}
17579
17580
17581IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17582{
17583 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17584 if (bEvil & RT_BIT(i))
17585 puDst->au32[i] = puSrc2->au32[i];
17586 else
17587 puDst->au32[i] = puSrc1->au32[i];
17588}
17589
17590
17591IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17592{
17593 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17594 if (bEvil & RT_BIT(i))
17595 puDst->au32[i] = puSrc2->au32[i];
17596 else
17597 puDst->au32[i] = puSrc1->au32[i];
17598}
17599
17600
17601/**
17602 * [V]BLENDPD
17603 */
17604IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17605{
17606 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17607 if (bEvil & RT_BIT(i))
17608 puDst->au64[i] = puSrc->au64[i];
17609}
17610
17611
17612IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17613{
17614 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17615 if (bEvil & RT_BIT(i))
17616 puDst->au64[i] = puSrc2->au64[i];
17617 else
17618 puDst->au64[i] = puSrc1->au64[i];
17619}
17620
17621
17622IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17623{
17624 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17625 if (bEvil & RT_BIT(i))
17626 puDst->au64[i] = puSrc2->au64[i];
17627 else
17628 puDst->au64[i] = puSrc1->au64[i];
17629}
17630
17631
17632/**
17633 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17634 */
17635
/* The S-Box lookup table: entry [x] is the substitution for byte value x. */
static uint8_t iemAImpl_aes_sbox[256] = {
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17654
/* The InvS-Box lookup table: entry [x] is the inverse substitution for byte value x. */
static uint8_t iemAImpl_aes_inv_sbox[256] = {
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17674
/* The ShiftRows lookup table: output byte [i] is taken from input byte tbl[i]. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] = {
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
17679
/* The InvShiftRows lookup table: output byte [i] is taken from input byte tbl[i]. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] = {
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
17684
17685static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17686{
17687 RTUINT128U uVal;
17688 int i;
17689
17690 for (i = 0; i < 16; ++i)
17691 uVal.au8[i] = abSubst[puSrc->au8[i]];
17692
17693 return uVal;
17694}
17695
/**
 * Doubles a byte: shifts it left by one and, when the top bit was set, XORs
 * 0x1b into the result before truncating back to eight bits.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    unsigned uDoubled = (unsigned)u << 1;
    if (u & 0x80)
        uDoubled ^= 0x1b;
    return (uint8_t)uDoubled;
}
17700
17701static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17702{
17703 RTUINT128U uVal;
17704 int i;
17705 uint8_t tmp;
17706
17707 for (i = 0; i < 16; i += 4) {
17708 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17709 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17710 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17711 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17712 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17713 }
17714
17715 return uVal;
17716}
17717
17718static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17719{
17720 RTUINT128U uVal;
17721 int i;
17722
17723 for (i = 0; i < 16; ++i)
17724 uVal.au8[i] = puSrc->au8[abShift[i]];
17725
17726 return uVal;
17727}
17728
17729static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17730{
17731 uint8_t val;
17732
17733 val = ((b >> 0) & 1) * a;
17734 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17735 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17736 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17737 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17738
17739 return val;
17740}
17741
17742static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17743{
17744 RTUINT128U uVal;
17745 int i;
17746
17747 for (i = 0; i < 16; i += 4) {
17748 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17749 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17750 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17751 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17752 }
17753
17754 return uVal;
17755}
17756
17757static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17758{
17759 RTUINT32U uTmp;
17760
17761 uTmp.au32[0] = w;
17762 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17763 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17764 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17765 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17766
17767 return uTmp.au32[0];
17768}
17769
/**
 * Rotates the 32-bit word @a w right by eight bits: the lowest byte moves to
 * the top and the other three bytes move down one position.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByte = w & 0xff;
    return (w >> 8) | (uLowByte << 24);
}
17774
17775/**
17776 * [V]AESKEYGENASSIST
17777 */
17778IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17779{
17780 RTUINT128U uTmp;
17781 uint32_t uRCon = bImm; /* Round constant. */
17782
17783 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17784 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17785 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17786 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17787
17788 *puDst = uTmp;
17789}
17790
17791
17792/**
17793 * [V]AESIMC
17794 */
17795IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17796{
17797 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17798}
17799
17800
17801/**
17802 * [V]AESENC
17803 */
17804IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17805{
17806 RTUINT128U uTmp;
17807
17808 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17809 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17810 uTmp = iemAImpl_aes_mix_col(&uTmp);
17811 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17812 uTmp.au64[1] ^= puSrc->au64[1];
17813
17814 *puDst = uTmp;
17815}
17816
17817
17818/**
17819 * [V]AESENCLAST
17820 */
17821IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17822{
17823 RTUINT128U uTmp;
17824
17825 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17826 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17827 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17828 uTmp.au64[1] ^= puSrc->au64[1];
17829
17830 *puDst = uTmp;
17831}
17832
17833
17834/**
17835 * [V]AESDEC
17836 */
17837IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17838{
17839 RTUINT128U uTmp;
17840
17841 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17842 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17843 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17844 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17845 uTmp.au64[1] ^= puSrc->au64[1];
17846
17847 *puDst = uTmp;
17848}
17849
17850
17851/**
17852 * [V]AESDECLAST
17853 */
17854IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17855{
17856 RTUINT128U uTmp;
17857
17858 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17859 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17860 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17861 uTmp.au64[1] ^= puSrc->au64[1];
17862
17863 *puDst = uTmp;
17864}
17865
17866
17867/**
17868 * [V]PCMPISTRI
17869 */
17870
17871/**
17872 * Does the comparisons based on the mode and source input format.
17873 */
17874static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
17875{
17876#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
17877 do \
17878 { \
17879 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
17880 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
17881 { \
17882 switch (a_bAggOp) \
17883 { \
17884 case 0: \
17885 case 2: \
17886 case 3: \
17887 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17888 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17889 break; \
17890 case 1: \
17891 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17892 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17893 break; \
17894 default: \
17895 AssertReleaseFailed(); \
17896 } \
17897 } \
17898 } while(0)
17899
17900 uint8_t bAggOp = (bImm >> 2) & 0x3;
17901 switch (bImm & 0x3)
17902 {
17903 case 0:
17904 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
17905 break;
17906 case 1:
17907 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
17908 break;
17909 case 2:
17910 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
17911 break;
17912 case 3:
17913 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
17914 break;
17915 default:
17916 AssertReleaseFailed();
17917 }
17918#undef PCMPXSTRX_CMP_CASE
17919}
17920
17921static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17922{
17923 if (bImm & 0x1)
17924 {
17925 /* Words -> 8 elements. */
17926 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17927 if (puSrc->au16[i] == 0)
17928 return i;
17929
17930 return 8;
17931 }
17932 else
17933 {
17934 /* Bytes -> 16 elements. */
17935 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17936 if (puSrc->au8[i] == 0)
17937 return i;
17938
17939 return 16;
17940 }
17941}
17942
17943static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
17944{
17945 if (bImm & 0x1)
17946 {
17947 if (i64Len > -8 && i64Len < 8)
17948 return RT_ABS(i64Len);
17949
17950 return 8;
17951 }
17952 else
17953 {
17954 if (i64Len > -16 && i64Len < 16)
17955 return RT_ABS(i64Len);
17956
17957 return 16;
17958 }
17959}
17960
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * First index: Imm8[3:2], the aggregation operation.
 * Second index:
 *      0 - xmm1 AND xmm2/m128 invalid
 *      1 - xmm1 invalid BUT xmm2/m128 valid
 *      2 - xmm1 valid BUT xmm2/m128 invalid
 *      3 - unused dummy/padding for parfait
 */
static const bool g_afCmpOverride[4][4] =
{
    /* Imm8[3:2] = 00b (equal any) */
    { false, false, false, false },
    /* Imm8[3:2] = 01b (ranges) */
    { false, false, false, false },
    /* Imm8[3:2] = 10b (equal each) */
    { true,  false, false, false },
    /* Imm8[3:2] = 11b (equal ordered) */
    { true,  true,  false, false },
};
17972
17973DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17974{
17975 if (fSrc1Valid && fSrc2Valid)
17976 return fCmpRes;
17977
17978 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17979 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17980 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17981}
17982
17983static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
17984{
17985 uint8_t bAggOp = (bImm >> 2) & 0x3;
17986 uint16_t u16Result = 0;
17987
17988 switch (bAggOp)
17989 {
17990 case 0: /* Equal any */
17991 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17992 {
17993 uint16_t u16Res = 0;
17994 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
17995 {
17996 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
17997 idxSrc1 < idxLen1,
17998 idxSrc2 < idxLen2,
17999 bAggOp))
18000 {
18001 u16Res = RT_BIT(idxSrc2);
18002 break;
18003 }
18004 }
18005
18006 u16Result |= u16Res;
18007 }
18008 break;
18009
18010 case 1: /* Ranges */
18011 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
18012 {
18013 uint16_t u16Res = 0;
18014 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
18015 {
18016 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
18017 idxSrc1 < idxLen1,
18018 idxSrc2 < idxLen2,
18019 bAggOp)
18020 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
18021 (idxSrc1 + 1) < idxLen1,
18022 idxSrc2 < idxLen2,
18023 bAggOp))
18024 {
18025 u16Res = RT_BIT(idxSrc2);
18026 break;
18027 }
18028 }
18029
18030 u16Result |= u16Res;
18031 }
18032 break;
18033
18034 case 2: /* Equal each */
18035 for (uint8_t i = 0; i < cElems; i++)
18036 {
18037 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
18038 i < idxLen1,
18039 i < idxLen2,
18040 bAggOp))
18041 u16Result |= RT_BIT(i);
18042 }
18043 break;
18044
18045 case 3: /* Equal ordered */
18046 u16Result = 0;
18047 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
18048 {
18049 uint16_t u16Res = RT_BIT(idxSrc2);
18050 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
18051 {
18052 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
18053 idxSrc1 < idxLen1,
18054 k < idxLen2,
18055 bAggOp))
18056 {
18057 u16Res = 0;
18058 break;
18059 }
18060 }
18061
18062 u16Result |= u16Res;
18063 }
18064 break;
18065 }
18066
18067 /* Polarity selection. */
18068 switch ((bImm >> 4) & 0x3)
18069 {
18070 case 0:
18071 case 2:
18072 /* Nothing to do. */
18073 break;
18074 case 1:
18075 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
18076 break;
18077 case 3:
18078 u16Result ^= RT_BIT(idxLen2) - 1;
18079 break;
18080 default:
18081 AssertReleaseFailed();
18082 }
18083
18084 return u16Result;
18085}
18086
18087DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
18088{
18089 uint32_t fEFlags = 0;
18090
18091 if (u16Result)
18092 fEFlags |= X86_EFL_CF;
18093 if (cLen2 < cElems)
18094 fEFlags |= X86_EFL_ZF;
18095 if (cLen1 < cElems)
18096 fEFlags |= X86_EFL_SF;
18097 if (u16Result & 0x1)
18098 fEFlags |= X86_EFL_OF;
18099 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
18100}
18101
18102DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
18103 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
18104{
18105 bool afCmpRes[16][16];
18106 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18107
18108 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
18109 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
18110 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
18111
18112 return u16Result;
18113}
18114
18115DECL_FORCE_INLINE(uint32_t) iemAImpl_pcmpxstri_set_result_index(uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18116{
18117 uint32_t u32Ecx;
18118 if (bImm & RT_BIT(6))
18119 {
18120 /* Index for MSB set. */
18121 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
18122 if (idxMsb)
18123 u32Ecx = idxMsb - 1;
18124 else
18125 u32Ecx = cElems;
18126 }
18127 else
18128 {
18129 /* Index for LSB set. */
18130 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
18131 if (idxLsb)
18132 u32Ecx = idxLsb - 1;
18133 else
18134 u32Ecx = cElems;
18135 }
18136
18137 return u32Ecx;
18138}
18139
18140IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pEFlags, PCRTUINT128U pSrc1, PCRTUINT128U pSrc2, uint8_t bEvil))
18141{
18142 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18143 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(pSrc1, bEvil);
18144 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(pSrc2, bEvil);
18145
18146 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, pSrc1, pSrc2, cLen1, cLen2, bEvil);
18147 return iemAImpl_pcmpxstri_set_result_index(u16Result, cElems, bEvil);
18148}
18149
18150
18151/**
18152 * [V]PCMPESTRI
18153 */
18154IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18155{
18156 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18157 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18158 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18159
18160 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18161 *pu32Ecx = iemAImpl_pcmpxstri_set_result_index(u16Result, cElems, bEvil);
18162}
18163
18164
18165/**
18166 * [V]PCMPISTRM
18167 */
18168DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18169{
18170 if (bImm & RT_BIT(6))
18171 {
18172 /* Generate a mask. */
18173 if (cElems == 8)
18174 {
18175 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18176 if (u16Result & RT_BIT(i))
18177 puDst->au16[i] = 0xffff;
18178 else
18179 puDst->au16[i] = 0;
18180 }
18181 else
18182 {
18183 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
18184 if (u16Result & RT_BIT(i))
18185 puDst->au8[i] = 0xff;
18186 else
18187 puDst->au8[i] = 0;
18188 }
18189 }
18190 else
18191 {
18192 /* Store the result. */
18193 puDst->au64[0] = u16Result;
18194 puDst->au64[1] = 0;
18195 }
18196}
18197
18198IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18199{
18200 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18201 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18202 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18203
18204 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18205 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18206}
18207
18208
18209/**
18210 * [V]PCMPESTRM
18211 */
18212IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18213{
18214 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18215 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18216 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18217
18218 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18219 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18220}
18221
18222
18223/*
18224 * [V]PCLMULQDQ
18225 */
/**
 * PCLMULQDQ fallback, forwards to iemAImpl_vpclmulqdq_u128_fallback() using
 * the destination as the first source operand.
 *
 * @param   puDst   The destination, also serving as the first source operand.
 * @param   puSrc   The second source operand.
 * @param   bEvil   The immediate, passed on unmodified.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
{
    iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
}
18230
18231
18232IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18233{
18234 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18235 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18236
18237 puDst->au64[0] = 0;
18238 puDst->au64[1] = 0;
18239
18240 /*
18241 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18242 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18243 * and squeeze out some optimizations.
18244 */
18245 if (uSrc1 & 0x1)
18246 puDst->au64[0] = uSrc2;
18247
18248 uSrc1 >>= 1;
18249
18250 uint8_t iDigit = 1;
18251 while (uSrc1)
18252 {
18253 if (uSrc1 & 0x1)
18254 {
18255 puDst->au64[0] ^= (uSrc2 << iDigit);
18256 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18257 }
18258
18259 uSrc1 >>= 1;
18260 iDigit++;
18261 }
18262}
18263
18264
18265/**
18266 * [V]MOVMSKPS
18267 */
18268#ifdef IEM_WITHOUT_ASSEMBLY
18269IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18270{
18271 *pu8Dst = puSrc->au32[0] >> 31;
18272 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18273 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18274 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18275}
18276
18277#endif
18278
18279IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18280{
18281 *pu8Dst = puSrc->au32[0] >> 31;
18282 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18283 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18284 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18285}
18286
18287
18288IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18289{
18290 *pu8Dst = puSrc->au32[0] >> 31;
18291 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18292 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18293 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18294 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18295 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18296 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18297 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18298}
18299
18300
18301/**
18302 * [V]MOVMSKPD
18303 */
18304#ifdef IEM_WITHOUT_ASSEMBLY
18305IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18306{
18307 *pu8Dst = puSrc->au64[0] >> 63;
18308 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18309}
18310
18311#endif
18312
18313IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18314{
18315 *pu8Dst = puSrc->au64[0] >> 63;
18316 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18317}
18318
18319
18320IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18321{
18322 *pu8Dst = puSrc->au64[0] >> 63;
18323 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18324 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18325 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18326}
18327
18328
18329/**
18330 * CVTTSD2SI
18331 */
18332#ifdef IEM_WITHOUT_ASSEMBLY
18333IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18334{
18335 RTFLOAT64U r64Src;
18336
18337 r64Src.u = *pu64Src;
18338 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18339
18340 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18341 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18342 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18343}
18344
18345
18346IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18347{
18348 RTFLOAT64U r64Src;
18349
18350 r64Src.u = *pu64Src;
18351 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18352
18353 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18354 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18355 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18356}
18357#endif
18358
18359
18360/**
18361 * CVTSD2SI
18362 */
18363#ifdef IEM_WITHOUT_ASSEMBLY
18364IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18365{
18366 RTFLOAT64U r64Src;
18367
18368 r64Src.u = *pu64Src;
18369 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18370
18371 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18372 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18373 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18374}
18375
18376
18377IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18378{
18379 RTFLOAT64U r64Src;
18380
18381 r64Src.u = *pu64Src;
18382 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18383
18384 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18385 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18386 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18387}
18388#endif
18389
18390
18391/**
18392 * CVTTSS2SI
18393 */
18394#ifdef IEM_WITHOUT_ASSEMBLY
18395IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18396{
18397 RTFLOAT32U r32Src;
18398
18399 r32Src.u = *pu32Src;
18400 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18401
18402 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18403 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18404 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18405}
18406
18407
18408IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18409{
18410 RTFLOAT32U r32Src;
18411
18412 r32Src.u = *pu32Src;
18413 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18414
18415 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18416 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18417 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18418}
18419#endif
18420
18421
18422/**
18423 * CVTSS2SI
18424 */
18425#ifdef IEM_WITHOUT_ASSEMBLY
18426IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18427{
18428 RTFLOAT32U r32Src;
18429
18430 r32Src.u = *pu32Src;
18431 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18432
18433 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18434 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18435 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18436}
18437
18438
18439IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18440{
18441 RTFLOAT32U r32Src;
18442
18443 r32Src.u = *pu32Src;
18444 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18445
18446 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18447 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18448 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18449}
18450#endif
18451
18452
18453/**
18454 * CVTSI2SD
18455 */
18456#ifdef IEM_WITHOUT_ASSEMBLY
18457IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i32,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18458{
18459 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18460 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18461 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18462}
18463
18464
18465IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i64,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18466{
18467 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18468 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18469 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18470}
18471#endif
18472
18473
18474/**
18475 * CVTSI2SS
18476 */
18477#ifdef IEM_WITHOUT_ASSEMBLY
18478IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i32,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18479{
18480 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18481 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18482 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18483}
18484
18485
18486IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i64,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18487{
18488 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18489 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18490 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18491}
18492#endif
18493
18494
18495/**
18496 * [V]UCOMISS
18497 */
18498#ifdef IEM_WITHOUT_ASSEMBLY
18499IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18500{
18501 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18502
18503 if (RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2))
18504 {
18505 uMxCsrIn |= X86_MXCSR_IE;
18506 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18507 }
18508 else if (RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18509 {
18510 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18511 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18512 }
18513 else
18514 {
18515 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18516
18517 RTFLOAT32U r32Src1, r32Src2;
18518 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1);
18519 fDe |= iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18520
18521 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18522 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18523 if (f32_eq(f32Src1, f32Src2, &SoftState))
18524 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18525 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18526 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18527 /* else: GREATER_THAN 000 */
18528
18529 uMxCsrIn |= fDe;
18530 }
18531
18532 *pfEFlags = fEFlagsNew;
18533 return uMxCsrIn;
18534}
18535#endif
18536
/**
 * VUCOMISS fallback, forwards all arguments unchanged to iemAImpl_ucomiss_u128().
 *
 * @returns The value returned by iemAImpl_ucomiss_u128().
 * @param   uMxCsrIn    The MXCSR value to operate under.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first value.
 * @param   uSrc2       The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
{
    return iemAImpl_ucomiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18541
18542
18543/**
18544 * [V]UCOMISD
18545 */
18546#ifdef IEM_WITHOUT_ASSEMBLY
18547IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18548{
18549 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18550
18551 if (RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2))
18552 {
18553 uMxCsrIn |= X86_MXCSR_IE;
18554 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18555 }
18556 else if (RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18557 {
18558 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18559 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18560 }
18561 else
18562 {
18563 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18564
18565 RTFLOAT64U r64Src1, r64Src2;
18566 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1)
18567 | iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18568
18569 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18570 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18571 if (f64_eq(f64Src1, f64Src2, &SoftState))
18572 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18573 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18574 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18575 /* else: GREATER_THAN 000 */
18576
18577 uMxCsrIn |= fDe;
18578 }
18579
18580 *pfEFlags = fEFlagsNew;
18581 return uMxCsrIn;
18582}
18583#endif
18584
/**
 * VUCOMISD fallback, forwards all arguments unchanged to iemAImpl_ucomisd_u128().
 *
 * @returns The value returned by iemAImpl_ucomisd_u128().
 * @param   uMxCsrIn    The MXCSR value to operate under.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first value.
 * @param   uSrc2       The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
{
    return iemAImpl_ucomisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18589
18590
18591/**
18592 * [V]COMISS
18593 */
18594#ifdef IEM_WITHOUT_ASSEMBLY
18595IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18596{
18597 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18598
18599 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2)
18600 || RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18601 {
18602 uMxCsrIn |= X86_MXCSR_IE;
18603 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18604 }
18605 else
18606 {
18607 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18608
18609 RTFLOAT32U r32Src1, r32Src2;
18610 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1)
18611 | iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18612
18613 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18614 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18615 if (f32_eq(f32Src1, f32Src2, &SoftState))
18616 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18617 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18618 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18619 /* else: GREATER_THAN 000 */
18620
18621 uMxCsrIn |= fDe;
18622 }
18623
18624 *pfEFlags = fEFlagsNew;
18625 return uMxCsrIn;
18626}
18627#endif
18628
18629
/**
 * VCOMISS fallback, forwards all arguments unchanged to iemAImpl_comiss_u128().
 *
 * @returns The value returned by iemAImpl_comiss_u128().
 * @param   uMxCsrIn    The MXCSR value to operate under.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first value.
 * @param   uSrc2       The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
{
    return iemAImpl_comiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18634
18635
18636/**
18637 * [V]COMISD
18638 */
18639#ifdef IEM_WITHOUT_ASSEMBLY
18640IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18641{
18642 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18643
18644 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2)
18645 || RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18646 {
18647 uMxCsrIn |= X86_MXCSR_IE;
18648 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18649 }
18650 else
18651 {
18652 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18653
18654 RTFLOAT64U r64Src1, r64Src2;
18655 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1);
18656 fDe |= iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18657
18658 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18659 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18660 if (f64_eq(f64Src1, f64Src2, &SoftState))
18661 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18662 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18663 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18664 /* else: GREATER_THAN 000 */
18665
18666 uMxCsrIn |= fDe;
18667 }
18668
18669 *pfEFlags = fEFlagsNew;
18670 return uMxCsrIn;
18671}
18672#endif
18673
/**
 * VCOMISD fallback, forwards all arguments unchanged to iemAImpl_comisd_u128().
 *
 * @returns The value returned by iemAImpl_comisd_u128().
 * @param   uMxCsrIn    The MXCSR value to operate under.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   uSrc1       The first value.
 * @param   uSrc2       The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
{
    return iemAImpl_comisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
}
18678
18679
18680/**
18681 * CMPPS / CMPPD / CMPSS / CMPSD
18682 */
18683#ifdef IEM_WITHOUT_ASSEMBLY
18684/**
18685 * A compare truth table entry.
18686 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Flag whether the \#IA is signalled when one of the source operands is a QNaN */
    bool                fSignalsOnQNan;
    /** The boolean result when the input operands are unordered. */
    bool                fUnordered;
    /** The boolean result when A = B. */
    bool                fEqual;
    /** The boolean result when A < B. */
    bool                fLowerThan;
    /** The boolean result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18702
18703
18704/** The compare truth table (indexed by immediate). */
static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
{
    /* Only eight entries: the CMPPS/CMPPD/CMPSS/CMPSD implementations mask the immediate with 0x7 before indexing. */
    /*                     fSignalsOnQNan   fUnordered    fEqual      fLowerThan  fGreaterThan */
    /* 00H (EQ_OQ)   */ {  false,           false,        true,       false,      false },
    /* 01H (LT_OS)   */ {  true,            false,        false,      true,       false },
    /* 02H (LE_OS)   */ {  true,            false,        true,       true,       false },
    /* 03H (UNORD_Q) */ {  false,           true,         false,      false,      false },
    /* 04H (NEQ_UQ)  */ {  false,           true,         false,      true,       true  },
    /* 05H (NLT_US)  */ {  true,            true,         true,       false,      true  },
    /* 06H (NLE_US)  */ {  true,            true,         false,      false,      true  },
    /* 07H (ORD_Q)   */ {  false,           false,        true,       true,       true  },
    /** @todo AVX variants. */
};
18718
18719
18720static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18721{
18722 bool fRes;
18723 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18724
18725 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18726 {
18727 *pfMxcsr |= X86_MXCSR_IE;
18728 fRes = g_aCmpTbl[bEvil].fUnordered;
18729 }
18730 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18731 {
18732 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18733 *pfMxcsr |= X86_MXCSR_IE;
18734 fRes = g_aCmpTbl[bEvil].fUnordered;
18735 }
18736 else
18737 {
18738 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18739
18740 RTFLOAT32U r32Src1, r32Src2;
18741 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18742 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18743
18744 *pfMxcsr |= fDe;
18745 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18746 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18747 if (f32_eq(f32Src1, f32Src2, &SoftState))
18748 fRes = g_aCmpTbl[bEvil].fEqual;
18749 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18750 fRes = g_aCmpTbl[bEvil].fLowerThan;
18751 else
18752 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18753 }
18754
18755 return fRes;
18756}
18757
18758
18759static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18760{
18761 bool fRes;
18762 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18763
18764 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18765 {
18766 *pfMxcsr |= X86_MXCSR_IE;
18767 fRes = g_aCmpTbl[bEvil].fUnordered;
18768 }
18769 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18770 {
18771 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18772 *pfMxcsr |= X86_MXCSR_IE;
18773 fRes = g_aCmpTbl[bEvil].fUnordered;
18774 }
18775 else
18776 {
18777 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18778
18779 RTFLOAT64U r64Src1, r64Src2;
18780 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18781 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18782
18783 *pfMxcsr |= fDe;
18784 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18785 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18786 if (f64_eq(f64Src1, f64Src2, &SoftState))
18787 fRes = g_aCmpTbl[bEvil].fEqual;
18788 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18789 fRes = g_aCmpTbl[bEvil].fLowerThan;
18790 else
18791 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18792 }
18793
18794 return fRes;
18795}
18796
18797
18798IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpps_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18799{
18800 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18801 {
18802 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18803 puDst->au32[i] = UINT32_MAX;
18804 else
18805 puDst->au32[i] = 0;
18806 }
18807
18808 return uMxCsrIn;
18809}
18810
18811
18812IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmppd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18813{
18814 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18815 {
18816 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18817 puDst->au64[i] = UINT64_MAX;
18818 else
18819 puDst->au64[i] = 0;
18820 }
18821
18822 return uMxCsrIn;
18823}
18824
18825
18826IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18827{
18828 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18829 puDst->au32[0] = UINT32_MAX;
18830 else
18831 puDst->au32[0] = 0;
18832
18833 puDst->au32[1] = pSrc->uSrc1.au32[1];
18834 puDst->au64[1] = pSrc->uSrc1.au64[1];
18835 return uMxCsrIn;
18836}
18837
18838
18839IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18840{
18841 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18842 puDst->au64[0] = UINT64_MAX;
18843 else
18844 puDst->au64[0] = 0;
18845
18846 puDst->au64[1] = pSrc->uSrc1.au64[1];
18847 return uMxCsrIn;
18848}
18849#endif
18850
18851
18852/**
18853 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18854 */
18855
/** ROUNDxx immediate: rounding mode, used when X86_SSE_ROUNDXX_IMM_ROUND_SEL is clear. */
#define X86_SSE_ROUNDXX_IMM_RC_MASK     UINT8_C(0x03)
/** ROUNDxx immediate: when set the rounding mode is taken from MXCSR instead of the immediate. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL   UINT8_C(0x04)
/** ROUNDxx immediate: when set the rounding workers pass 'exact' as false to the softfloat rounding function. */
#define X86_SSE_ROUNDXX_IMM_PRECISION   UINT8_C(0x08)

/** Mask of the ROUNDxx immediate bits that are passed on to the rounding workers. */
#define X86_SSE_ROUNDXX_IMM_MASK        UINT8_C(0x0F)
18861
18862DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18863{
18864 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18865 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18866
18867 fMxcsr &= ~X86_MXCSR_RC_MASK;
18868 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18869 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18870}
18871
18872static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18873{
18874 RTFLOAT32U r32Src, r32Dst;
18875 float32_t f32Src;
18876 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18877 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18878
18879 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18880 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18881
18882 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18883 return r32Dst;
18884}
18885
18886static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18887{
18888 RTFLOAT64U r64Src, r64Dst;
18889 float64_t f64Src;
18890 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18891 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18892
18893 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18894 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18895
18896 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18897 return r64Dst;
18898}
18899
18900#ifdef IEM_WITHOUT_ASSEMBLY
18901IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18902{
18903 puDst->ar32[0] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18904 puDst->au32[1] = pSrc->uSrc1.au32[1];
18905 puDst->au64[1] = pSrc->uSrc1.au64[1];
18906 return uMxCsrIn;
18907}
18908
18909
18910IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18911{
18912 puDst->ar64[0] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18913 puDst->au64[1] = pSrc->uSrc1.au64[1];
18914 return uMxCsrIn;
18915}
18916#endif
18917
18918IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18919{
18920 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18921 {
18922 puDst->ar32[i] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18923 }
18924
18925 return uMxCsrIn;
18926}
18927
18928
18929IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundpd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18930{
18931 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18932 {
18933 puDst->ar64[i] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18934 }
18935
18936 return uMxCsrIn;
18937}
18938
18939/**
18940 * CVTPD2PI
18941 */
18942#ifdef IEM_WITHOUT_ASSEMBLY
18943static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18944{
18945 RTFLOAT64U r64Src;
18946 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18947
18948 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18949 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18950 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18951}
18952
18953
18954IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18955{
18956 RTUINT64U u64Res;
18957 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18958 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18959
18960 *pu64Dst = u64Res.u;
18961 return fMxcsrOut;
18962}
18963#endif
18964
18965
18966/**
18967 * CVTTPD2PI
18968 */
18969#ifdef IEM_WITHOUT_ASSEMBLY
18970static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18971{
18972 RTFLOAT64U r64Src;
18973 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18974
18975 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18976 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18977 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18978}
18979
18980
18981IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18982{
18983 RTUINT64U u64Res;
18984 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18985 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18986
18987 *pu64Dst = u64Res.u;
18988 return fMxcsrOut;
18989}
18990#endif
18991
18992
18993/**
18994 * CVTPI2PS
18995 */
18996#ifdef IEM_WITHOUT_ASSEMBLY
18997static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18998{
18999 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19000 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
19001 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
19002}
19003
19004
19005IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2ps_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
19006{
19007 RTUINT64U uSrc = { u64Src };
19008 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[0], uSrc.ai32[0]);
19009 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[1], uSrc.ai32[1]);
19010 return fMxcsrOut;
19011}
19012#endif
19013
19014
19015/**
19016 * CVTPI2PD
19017 */
19018#ifdef IEM_WITHOUT_ASSEMBLY
19019static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
19020{
19021 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19022 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
19023 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
19024}
19025
19026
19027IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2pd_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
19028{
19029 RTUINT64U uSrc = { u64Src };
19030 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[0], uSrc.ai32[0]);
19031 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[1], uSrc.ai32[1]);
19032 return fMxcsrOut;
19033}
19034#endif
19035
19036
19037/**
19038 * CVTPS2PI
19039 */
19040#ifdef IEM_WITHOUT_ASSEMBLY
19041static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19042{
19043 RTFLOAT32U r32Src;
19044 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19045
19046 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19047 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
19048 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19049}
19050
19051
19052IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
19053{
19054 RTUINT64U uDst;
19055 RTUINT64U uSrc = { u64Src };
19056 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19057 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19058 *pu64Dst = uDst.u;
19059 return fMxcsrOut;
19060}
19061#endif
19062
19063
19064/**
19065 * CVTTPS2PI
19066 */
19067#ifdef IEM_WITHOUT_ASSEMBLY
19068static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19069{
19070 RTFLOAT32U r32Src;
19071 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19072
19073 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19074 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
19075 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19076}
19077
19078
19079IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
19080{
19081 RTUINT64U uDst;
19082 RTUINT64U uSrc = { u64Src };
19083 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19084 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19085 *pu64Dst = uDst.u;
19086 return fMxcsrOut;
19087}
19088#endif
19089
19090/**
19091 * RDRAND
19092 */
19093IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19094{
19095 *puDst = 0;
19096 *pEFlags &= ~X86_EFL_STATUS_BITS;
19097 *pEFlags |= X86_EFL_CF;
19098}
19099
19100IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19101{
19102 *puDst = 0;
19103 *pEFlags &= ~X86_EFL_STATUS_BITS;
19104 *pEFlags |= X86_EFL_CF;
19105}
19106
19107IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19108{
19109 *puDst = 0;
19110 *pEFlags &= ~X86_EFL_STATUS_BITS;
19111 *pEFlags |= X86_EFL_CF;
19112}
19113
19114/**
19115 * RDSEED
19116 */
19117IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19118{
19119 *puDst = 0;
19120 *pEFlags &= ~X86_EFL_STATUS_BITS;
19121 *pEFlags |= X86_EFL_CF;
19122}
19123
19124IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19125{
19126 *puDst = 0;
19127 *pEFlags &= ~X86_EFL_STATUS_BITS;
19128 *pEFlags |= X86_EFL_CF;
19129}
19130
19131IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19132{
19133 *puDst = 0;
19134 *pEFlags &= ~X86_EFL_STATUS_BITS;
19135 *pEFlags |= X86_EFL_CF;
19136}
19137
19138
19139/**
19140 * SHA1NEXTE
19141 */
19142IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19143{
19144 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
19145
19146 puDst->au32[0] = puSrc->au32[0];
19147 puDst->au32[1] = puSrc->au32[1];
19148 puDst->au32[2] = puSrc->au32[2];
19149 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
19150}
19151
19152/**
19153 * SHA1MSG1
19154 */
19155IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19156{
19157 uint32_t u32W0 = puDst->au32[3];
19158 uint32_t u32W1 = puDst->au32[2];
19159 uint32_t u32W2 = puDst->au32[1];
19160 uint32_t u32W3 = puDst->au32[0];
19161 uint32_t u32W4 = puSrc->au32[3];
19162 uint32_t u32W5 = puSrc->au32[2];
19163
19164 puDst->au32[3] = u32W2 ^ u32W0;
19165 puDst->au32[2] = u32W3 ^ u32W1;
19166 puDst->au32[1] = u32W4 ^ u32W2;
19167 puDst->au32[0] = u32W5 ^ u32W3;
19168}
19169
19170/**
19171 * SHA1MSG2
19172 */
19173IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19174{
19175 uint32_t u32W13 = puSrc->au32[2];
19176 uint32_t u32W14 = puSrc->au32[1];
19177 uint32_t u32W15 = puSrc->au32[0];
19178 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
19179 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
19180 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
19181 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
19182
19183 puDst->au32[3] = u32W16;
19184 puDst->au32[2] = u32W17;
19185 puDst->au32[1] = u32W18;
19186 puDst->au32[0] = u32W19;
19187}
19188
19189/**
19190 * SHA1RNDS4
19191 */
/** Signature of a SHA1RNDS4 round function: maps the B, C and D state words to one 32-bit value. */
typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
/** Pointer to a SHA1RNDS4 round function. */
typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
19194
19195static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19196{
19197 return (u32B & u32C) ^ (~u32B & u32D);
19198}
19199
19200static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19201{
19202 return u32B ^ u32C ^ u32D;
19203}
19204
19205static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19206{
19207 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
19208}
19209
19210static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19211{
19212 return u32B ^ u32C ^ u32D;
19213}
19214
19215IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19216{
19217 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19218 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19219
19220 uint32_t au32A[5];
19221 uint32_t au32B[5];
19222 uint32_t au32C[5];
19223 uint32_t au32D[5];
19224 uint32_t au32E[5];
19225 uint32_t au32W[4];
19226 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19227 uint32_t u32K = s_au32K[bEvil & 0x3];
19228
19229 au32A[0] = puDst->au32[3];
19230 au32B[0] = puDst->au32[2];
19231 au32C[0] = puDst->au32[1];
19232 au32D[0] = puDst->au32[0];
19233 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19234 au32W[i] = puSrc->au32[3 - i];
19235
19236 /* Round 0 is a bit different than the other rounds. */
19237 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19238 au32B[1] = au32A[0];
19239 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19240 au32D[1] = au32C[0];
19241 au32E[1] = au32D[0];
19242
19243 for (uint32_t i = 1; i <= 3; i++)
19244 {
19245 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19246 au32B[i + 1] = au32A[i];
19247 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19248 au32D[i + 1] = au32C[i];
19249 au32E[i + 1] = au32D[i];
19250 }
19251
19252 puDst->au32[3] = au32A[4];
19253 puDst->au32[2] = au32B[4];
19254 puDst->au32[1] = au32C[4];
19255 puDst->au32[0] = au32D[4];
19256}
19257
19258
19259/**
19260 * SHA256MSG1
19261 */
19262DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19263{
19264 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19265}
19266
19267IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19268{
19269 uint32_t u32W4 = puSrc->au32[0];
19270 uint32_t u32W3 = puDst->au32[3];
19271 uint32_t u32W2 = puDst->au32[2];
19272 uint32_t u32W1 = puDst->au32[1];
19273 uint32_t u32W0 = puDst->au32[0];
19274
19275 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19276 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19277 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19278 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19279}
19280
19281/**
19282 * SHA256MSG2
19283 */
19284DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19285{
19286 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19287}
19288
19289IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19290{
19291 uint32_t u32W14 = puSrc->au32[2];
19292 uint32_t u32W15 = puSrc->au32[3];
19293 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19294 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19295 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19296 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19297
19298 puDst->au32[3] = u32W19;
19299 puDst->au32[2] = u32W18;
19300 puDst->au32[1] = u32W17;
19301 puDst->au32[0] = u32W16;
19302}
19303
19304/**
19305 * SHA256RNDS2
19306 */
19307DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19308{
19309 return (u32X & u32Y) ^ (~u32X & u32Z);
19310}
19311
19312DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19313{
19314 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19315}
19316
19317DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19318{
19319 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19320}
19321
19322DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19323{
19324 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19325}
19326
19327IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19328{
19329 uint32_t au32A[3];
19330 uint32_t au32B[3];
19331 uint32_t au32C[3];
19332 uint32_t au32D[3];
19333 uint32_t au32E[3];
19334 uint32_t au32F[3];
19335 uint32_t au32G[3];
19336 uint32_t au32H[3];
19337 uint32_t au32WK[2];
19338
19339 au32A[0] = puSrc->au32[3];
19340 au32B[0] = puSrc->au32[2];
19341 au32C[0] = puDst->au32[3];
19342 au32D[0] = puDst->au32[2];
19343 au32E[0] = puSrc->au32[1];
19344 au32F[0] = puSrc->au32[0];
19345 au32G[0] = puDst->au32[1];
19346 au32H[0] = puDst->au32[0];
19347
19348 au32WK[0] = puXmm0Constants->au32[0];
19349 au32WK[1] = puXmm0Constants->au32[1];
19350
19351 for (uint32_t i = 0; i < 2; i++)
19352 {
19353 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19354 + iemAImpl_sha256_upper_sigma1(au32E[i])
19355 + au32WK[i]
19356 + au32H[i]
19357 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19358 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19359 au32B[i + 1] = au32A[i];
19360 au32C[i + 1] = au32B[i];
19361 au32D[i + 1] = au32C[i];
19362 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19363 + iemAImpl_sha256_upper_sigma1(au32E[i])
19364 + au32WK[i]
19365 + au32H[i]
19366 + au32D[i];
19367 au32F[i + 1] = au32E[i];
19368 au32G[i + 1] = au32F[i];
19369 au32H[i + 1] = au32G[i];
19370 }
19371
19372 puDst->au32[3] = au32A[2];
19373 puDst->au32[2] = au32B[2];
19374 puDst->au32[1] = au32E[2];
19375 puDst->au32[0] = au32F[2];
19376}
19377
19378
19379/**
19380 * ADCX
19381 */
/*
 * Shared body of the ADCX/ADOX implementations.
 *
 * Expects fEFlags, puDst and uSrc to be in scope.  Adds uSrc plus the
 * incoming a_Flag bit to *puDst and updates only a_Flag in fEFlags: it is set
 * when the unsigned sum (including the carry-in) wraps, cleared otherwise.
 * a_Type is the operand type and a_Max its all-ones value.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn = RT_BOOL(fEFlags & (a_Flag)); \
        a_Type const uSum     = *puDst + uSrc; \
        /* Wrapped already, or the carry-in will push an all-ones sum over. */ \
        if (   uSum < uSrc \
            || (fCarryIn && uSum == a_Max)) \
            fEFlags |= (a_Flag); \
        else \
            fEFlags &= ~(a_Flag); \
        *puDst = fCarryIn ? uSum + 1 : uSum; \
    } \
    while (0)
19399
/**
 * ADCX fallback, 32-bit operand: adds @a uSrc and the incoming CF to
 * @a *puDst, updating only CF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u32_fallback,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19405
/**
 * ADCX fallback, 64-bit operand: adds @a uSrc and the incoming CF to
 * @a *puDst, updating only CF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u64_fallback,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19411
19412# if defined(IEM_WITHOUT_ASSEMBLY)
19413
/**
 * ADCX, 32-bit operand (only built with IEM_WITHOUT_ASSEMBLY): adds @a uSrc
 * and the incoming CF to @a *puDst, updating only CF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19419
/**
 * ADCX, 64-bit operand (only built with IEM_WITHOUT_ASSEMBLY): adds @a uSrc
 * and the incoming CF to @a *puDst, updating only CF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19425
19426#endif
19427
19428
19429/**
19430 * ADOX
19431 */
/**
 * ADOX fallback, 32-bit operand: adds @a uSrc and the incoming OF to
 * @a *puDst, using OF as the carry flag and updating only OF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u32_fallback,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19437
/**
 * ADOX fallback, 64-bit operand: adds @a uSrc and the incoming OF to
 * @a *puDst, using OF as the carry flag and updating only OF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u64_fallback,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19443
19444# if defined(IEM_WITHOUT_ASSEMBLY)
19445
/**
 * ADOX, 32-bit operand (only built with IEM_WITHOUT_ASSEMBLY): adds @a uSrc
 * and the incoming OF to @a *puDst, using OF as the carry flag and updating
 * only OF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19451
/**
 * ADOX, 64-bit operand (only built with IEM_WITHOUT_ASSEMBLY): adds @a uSrc
 * and the incoming OF to @a *puDst, using OF as the carry flag and updating
 * only OF (see ADX_EMIT).
 *
 * @returns The updated EFLAGS value.
 * @param   fEFlags     The incoming EFLAGS value.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19457
19458# endif
19459
19460
19461/**
19462 * MPSADBW
19463 */
19464IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19465{
19466 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19467 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19468 int16_t ai16Src1[11];
19469 int16_t ai16Src2[4];
19470
19471 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19472 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19473
19474 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19475 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19476
19477 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19478 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19479 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19480 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19481 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19482}
19483
19484
19485IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19486{
19487 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19488 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19489 int16_t ai16Src1[11];
19490 int16_t ai16Src2[4];
19491
19492 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19493 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19494
19495 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19496 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19497
19498 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19499 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19500 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19501 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19502 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19503}
19504
19505
19506IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
19507{
19508 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
19509 RTUINT256U const uSrc2 = *puSrc2;
19510 ASMCompilerBarrier();
19511 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
19512 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
19513}
19514
19515
19516/**
19517 * VPERM2I128
19518 */
19519IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19520{
19521 if (bImm & RT_BIT(3))
19522 {
19523 puDst->au64[0] = 0;
19524 puDst->au64[1] = 0;
19525 }
19526 else
19527 {
19528 switch (bImm & 0x3)
19529 {
19530 case 0:
19531 puDst->au64[0] = puSrc1->au64[0];
19532 puDst->au64[1] = puSrc1->au64[1];
19533 break;
19534 case 1:
19535 puDst->au64[0] = puSrc1->au64[2];
19536 puDst->au64[1] = puSrc1->au64[3];
19537 break;
19538 case 2:
19539 puDst->au64[0] = puSrc2->au64[0];
19540 puDst->au64[1] = puSrc2->au64[1];
19541 break;
19542 case 3:
19543 puDst->au64[0] = puSrc2->au64[2];
19544 puDst->au64[1] = puSrc2->au64[3];
19545 break;
19546 }
19547 }
19548
19549 if (bImm & RT_BIT(7))
19550 {
19551 puDst->au64[2] = 0;
19552 puDst->au64[3] = 0;
19553 }
19554 else
19555 {
19556 switch ((bImm >> 4) & 0x3)
19557 {
19558 case 0:
19559 puDst->au64[2] = puSrc1->au64[0];
19560 puDst->au64[3] = puSrc1->au64[1];
19561 break;
19562 case 1:
19563 puDst->au64[2] = puSrc1->au64[2];
19564 puDst->au64[3] = puSrc1->au64[3];
19565 break;
19566 case 2:
19567 puDst->au64[2] = puSrc2->au64[0];
19568 puDst->au64[3] = puSrc2->au64[1];
19569 break;
19570 case 3:
19571 puDst->au64[2] = puSrc2->au64[2];
19572 puDst->au64[3] = puSrc2->au64[3];
19573 break;
19574 }
19575 }
19576}
19577
19578
19579/**
19580 * VPERM2F128
19581 */
/**
 * VPERM2F128 fallback: forwards all arguments unchanged to the VPERM2I128
 * fallback.
 *
 * @param   puDst       Where to store the result.
 * @param   puSrc1      First source.
 * @param   puSrc2      Second source.
 * @param   bImm        The immediate controlling the permutation.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
{
    iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
}
19586
19587
19588/**
19589 * DPPS
19590 */
/**
 * DPPS fallback - not implemented.
 *
 * Ignores its operands, calls AssertReleaseFailed() and returns the incoming
 * MXCSR value unchanged.  The destination is not written.
 *
 * @returns @a uMxCsrIn.
 * @param   uMxCsrIn    The incoming MXCSR value.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dpps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(puDst, pSrc, bImm);
    AssertReleaseFailed();
    return uMxCsrIn;
}
19597
19598
19599/**
19600 * DPPD
19601 */
/**
 * DPPD fallback - not implemented.
 *
 * Ignores its operands, calls AssertReleaseFailed() and returns the incoming
 * MXCSR value unchanged.  The destination is not written.
 *
 * @returns @a uMxCsrIn.
 * @param   uMxCsrIn    The incoming MXCSR value.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dppd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(puDst, pSrc, bImm);
    AssertReleaseFailed();
    return uMxCsrIn;
}
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
Contact | Privacy policy | Terms of Use