Changeset 94538 in vbox
- Timestamp:
- Apr 10, 2022 2:16:03 PM (2 years ago)
- Location:
- trunk/src/VBox/VMM
- Files:
-
- 1 added
- 2 edited
-
VMMAll/IEMAllAImplC.cpp (modified) (4 diffs)
-
VMMAll/IEMAllInstructionsOneByte.cpp.h (modified) (1 diff)
-
tools/IEMGenFpuConstants.c (added)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp
r94447 r94538 25 25 #include <iprt/x86.h> 26 26 #include <iprt/uint128.h> 27 #include <iprt/uint256.h> 28 29 RT_C_DECLS_BEGIN 30 #include <softfloat.h> 31 RT_C_DECLS_END 27 32 28 33 … … 442 447 443 448 449 /** Zero values (indexed by fSign). */ 450 RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) }; 451 452 /** One values (indexed by fSign). */ 453 RTFLOAT80U const g_ar80One[] = 454 { RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) }; 455 456 /** Indefinite (negative). */ 457 RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1); 458 459 /** 128-bit floating point constant: 2.0 */ 460 const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1); 461 462 463 /* The next section is generated by tools/IEMGenFpuConstants: */ 464 465 /** The ln2 constant as 128-bit floating point value. 466 * base-10: 6.93147180559945309417232121458176575e-1 467 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1 468 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */ 469 //const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe); 470 const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe); 471 /** High precision ln2 value. 472 * base-10: 6.931471805599453094172321214581765680747e-1 473 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1 474 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */ 475 const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af); 476 /** High precision ln2 value, compatible with f2xm1 results on intel 10980XE. 
477 * base-10: 6.931471805599453094151379470289064954613e-1 478 * base-16: b.17217f7d1cf79abc0000000000000000@-1 479 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */ 480 const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000); 481 482 /** Horner constants for f2xm1 */ 483 const RTFLOAT128U g_ar128F2xm1HornerConsts[] = 484 { 485 /* a0 486 * base-10: 1.00000000000000000000000000000000000e0 487 * base-16: 1.0000000000000000000000000000@0 488 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */ 489 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff), 490 /* a1 491 * base-10: 5.00000000000000000000000000000000000e-1 492 * base-16: 8.0000000000000000000000000000@-1 493 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */ 494 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe), 495 /* a2 496 * base-10: 1.66666666666666666666666666666666658e-1 497 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1 498 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */ 499 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc), 500 /* a3 501 * base-10: 4.16666666666666666666666666666666646e-2 502 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2 503 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */ 504 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa), 505 /* a4 506 * base-10: 8.33333333333333333333333333333333323e-3 507 * base-16: 2.2222222222222222222222222222@-2 508 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 
*/ 509 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8), 510 /* a5 511 * base-10: 1.38888888888888888888888888888888874e-3 512 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3 513 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */ 514 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5), 515 /* a6 516 * base-10: 1.98412698412698412698412698412698412e-4 517 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4 518 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */ 519 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2), 520 /* a7 521 * base-10: 2.48015873015873015873015873015873015e-5 522 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4 523 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */ 524 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef), 525 /* a8 526 * base-10: 2.75573192239858906525573192239858902e-6 527 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5 528 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */ 529 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec), 530 /* a9 531 * base-10: 2.75573192239858906525573192239858865e-7 532 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6 533 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */ 534 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9), 535 /* a10 536 * base-10: 2.50521083854417187750521083854417184e-8 537 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7 538 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */ 539 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 
0x38fe747e4b837dc7, 0x3fe5), 540 /* a11 541 * base-10: 2.08767569878680989792100903212014296e-9 542 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8 543 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */ 544 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2), 545 /* a12 546 * base-10: 1.60590438368216145993923771701549472e-10 547 * base-16: b.092309d43684be51c198e91d7b40@-9 548 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */ 549 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde), 550 /* a13 551 * base-10: 1.14707455977297247138516979786821043e-11 552 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10 553 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */ 554 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda), 555 /* a14 556 * base-10: 7.64716373181981647590113198578806964e-13 557 * base-16: d.73f9f399dc0f88ec32b587746578@-11 558 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */ 559 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6), 560 /* a15 561 * base-10: 4.77947733238738529743820749111754352e-14 562 * base-16: d.73f9f399dc0f88ec32b587746578@-12 563 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */ 564 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2), 565 /* a16 566 * base-10: 2.81145725434552076319894558301031970e-15 567 * base-16: c.a963b81856a53593028cbbb8d7f8@-13 568 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */ 569 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce), 570 /* a17 571 * 
base-10: 1.56192069685862264622163643500573321e-16 572 * base-16: b.413c31dcbecbbdd8024435161550@-14 573 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */ 574 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca), 575 /* a18 576 * base-10: 8.22063524662432971695598123687227980e-18 577 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15 578 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */ 579 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6), 580 /* a19 581 * base-10: 4.11031762331216485847799061843614006e-19 582 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16 583 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */ 584 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1), 585 /* a20 586 * base-10: 7.04351638180413298434020229233492164e-20 587 * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16 588 * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */ 589 RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf), 590 /* a21 591 * base-10: 5.81527769640186708776361513365257702e-20 592 * base-16: 1.129e64bff606a2b9c9fc624481cd@-16 593 * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */ 594 RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf), 595 }; 596 597 444 598 /* 445 599 * There are a few 64-bit on 32-bit things we'd rather do in C. 
Actually, doing … … 4181 4335 4182 4336 /********************************************************************************************************************************* 4337 * FPU Helpers * 4338 *********************************************************************************************************************************/ 4339 #ifdef IEM_WITH_FLOAT128_FOR_FPU 4340 4341 DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw) 4342 { 4343 int fNew; 4344 switch (fFcw & X86_FCW_RC_MASK) 4345 { 4346 default: 4347 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break; 4348 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break; 4349 case X86_FCW_RC_UP: fNew = FE_UPWARD; break; 4350 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break; 4351 } 4352 int fOld = fegetround(); 4353 fesetround(fNew); 4354 return fOld; 4355 } 4356 4357 4358 DECLINLINE(void) iemFpuF128RestoreRounding(int fOld) 4359 { 4360 fesetround(fOld); 4361 } 4362 4363 DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw) 4364 { 4365 RT_NOREF(fFcw); 4366 RTFLOAT128U Tmp; 4367 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent; 4368 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48); 4369 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16); 4370 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48; 4371 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val)) 4372 { 4373 Assert(Tmp.s.uExponent == 0); 4374 Tmp.s2.uSignAndExponent++; 4375 } 4376 return *(_Float128 *)&Tmp; 4377 } 4378 4379 4380 DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw) 4381 { 4382 RT_NOREF(fFcw); 4383 RTFLOAT128U Tmp; 4384 *(_Float128 *)&Tmp = rd128ValSrc; 4385 ASMCompilerBarrier(); 4386 if (RTFLOAT128U_IS_NORMAL(&Tmp)) 4387 { 4388 pr80Dst->s.fSign = Tmp.s64.fSign; 4389 pr80Dst->s.uExponent = Tmp.s64.uExponent; 4390 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48) 4391 | Tmp.s64.uFractionLo >> (64 - 15); 
4392 4393 /* Do rounding - just truncate in near mode when midway on an even outcome. */ 4394 unsigned const cShiftOff = 64 - 15; 4395 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; 4396 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask; 4397 if (uRoundedOff) 4398 { 4399 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST 4400 ? RT_BIT_64(cShiftOff - 1) 4401 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) 4402 ? fRoundingOffMask 4403 : 0; 4404 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST 4405 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff)) 4406 || uRoundedOff != uRoundingAdd) 4407 { 4408 if ((uRoundedOff + uRoundingAdd) >> cShiftOff) 4409 { 4410 uFraction += 1; 4411 if (!(uFraction & RT_BIT_64(63))) 4412 { /* likely */ } 4413 else 4414 { 4415 uFraction >>= 1; 4416 pr80Dst->s.uExponent++; 4417 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX) 4418 return fFsw; 4419 } 4420 fFsw |= X86_FSW_C1; 4421 } 4422 } 4423 fFsw |= X86_FSW_PE; 4424 if (!(fFcw & X86_FCW_PM)) 4425 fFsw |= X86_FSW_ES | X86_FSW_B; 4426 } 4427 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction; 4428 } 4429 else if (RTFLOAT128U_IS_ZERO(&Tmp)) 4430 { 4431 pr80Dst->s.fSign = Tmp.s64.fSign; 4432 pr80Dst->s.uExponent = 0; 4433 pr80Dst->s.uMantissa = 0; 4434 } 4435 else if (RTFLOAT128U_IS_INF(&Tmp)) 4436 { 4437 pr80Dst->s.fSign = Tmp.s64.fSign; 4438 pr80Dst->s.uExponent = 0; 4439 pr80Dst->s.uMantissa = 0; 4440 } 4441 return fFsw; 4442 } 4443 4444 4445 #else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */ 4446 4447 4448 DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST) 4449 { 4450 RT_NOREF(fFcw); 4451 Assert(cBits > 64); 4452 # if 0 /* rounding does not seem to help */ 4453 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1); 4454 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1); 4455 if (off >= RT_BIT_64(1 + 112 - cBits - 1) 4456 && (r128.v[0] & 
RT_BIT_64(1 + 112 - cBits))) 4457 { 4458 uint64_t uOld = r128.v[0]; 4459 r128.v[0] += RT_BIT_64(1 + 112 - cBits); 4460 if (r128.v[0] < uOld) 4461 r128.v[1] += 1; 4462 } 4463 # else 4464 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1); 4465 # endif 4466 return r128; 4467 } 4468 4469 4470 DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST) 4471 { 4472 RT_NOREF(fFcw); 4473 Assert(cBits > 64); 4474 # if 0 /* rounding does not seem to help, not even on constants */ 4475 float128_t r128 = { pr128->au64[0], pr128->au64[1] }; 4476 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1); 4477 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1); 4478 if (off >= RT_BIT_64(1 + 112 - cBits - 1) 4479 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits))) 4480 { 4481 uint64_t uOld = r128.v[0]; 4482 r128.v[0] += RT_BIT_64(1 + 112 - cBits); 4483 if (r128.v[0] < uOld) 4484 r128.v[1] += 1; 4485 } 4486 return r128; 4487 # else 4488 float128_t r128 = { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] }; 4489 return r128; 4490 # endif 4491 } 4492 4493 4494 DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128) 4495 { 4496 float128_t r128 = { pr128->au64[0], pr128->au64[1] }; 4497 return r128; 4498 } 4499 4500 4501 /** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. 
*/ 4502 DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val) 4503 { 4504 extFloat80_t Tmp; 4505 Tmp.signExp = pr80Val->s2.uSignAndExponent; 4506 Tmp.signif = pr80Val->s2.uMantissa; 4507 return extF80_to_f128(Tmp); 4508 } 4509 4510 4511 DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw) 4512 { 4513 RT_NOREF(fFcw); 4514 RTFLOAT128U Tmp; 4515 *(float128_t *)&Tmp = r128Src; 4516 ASMCompilerBarrier(); 4517 4518 if (RTFLOAT128U_IS_NORMAL(&Tmp)) 4519 { 4520 pr80Dst->s.fSign = Tmp.s64.fSign; 4521 pr80Dst->s.uExponent = Tmp.s64.uExponent; 4522 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48) 4523 | Tmp.s64.uFractionLo >> (64 - 15); 4524 4525 /* Do rounding - just truncate in near mode when midway on an even outcome. */ 4526 unsigned const cShiftOff = 64 - 15; 4527 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; 4528 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask; 4529 if (uRoundedOff) 4530 { 4531 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST 4532 ? RT_BIT_64(cShiftOff - 1) 4533 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) 4534 ? 
fRoundingOffMask 4535 : 0; 4536 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST 4537 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff)) 4538 || uRoundedOff != uRoundingAdd) 4539 { 4540 if ((uRoundedOff + uRoundingAdd) >> cShiftOff) 4541 { 4542 uFraction += 1; 4543 if (!(uFraction & RT_BIT_64(63))) 4544 { /* likely */ } 4545 else 4546 { 4547 uFraction >>= 1; 4548 pr80Dst->s.uExponent++; 4549 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX) 4550 return fFsw; 4551 } 4552 fFsw |= X86_FSW_C1; 4553 } 4554 } 4555 fFsw |= X86_FSW_PE; 4556 if (!(fFcw & X86_FCW_PM)) 4557 fFsw |= X86_FSW_ES | X86_FSW_B; 4558 } 4559 4560 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction; 4561 } 4562 else if (RTFLOAT128U_IS_ZERO(&Tmp)) 4563 { 4564 pr80Dst->s.fSign = Tmp.s64.fSign; 4565 pr80Dst->s.uExponent = 0; 4566 pr80Dst->s.uMantissa = 0; 4567 } 4568 else if (RTFLOAT128U_IS_INF(&Tmp)) 4569 { 4570 pr80Dst->s.fSign = Tmp.s64.fSign; 4571 pr80Dst->s.uExponent = 0; 4572 pr80Dst->s.uMantissa = 0; 4573 } 4574 return fFsw; 4575 } 4576 4577 4578 /** 4579 * Helper doing polynomial evaluation using Horner's method. 4580 * 4581 * See https://en.wikipedia.org/wiki/Horner%27s_method for details. 
4582 */ 4583 float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts, unsigned cPrecision) 4584 { 4585 Assert(cHornerConsts > 1); 4586 size_t i = cHornerConsts - 1; 4587 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision); 4588 while (i-- > 0) 4589 { 4590 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z), cPrecision); 4591 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision)); 4592 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision); 4593 } 4594 return r128Result; 4595 } 4596 4597 #endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */ 4598 4599 4600 /** 4601 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide 4602 * mantissa, exponent and sign. 4603 * 4604 * @returns Updated FSW. 4605 * @param pr80Dst Where to return the composed value. 4606 * @param fSign The sign. 4607 * @param puMantissa The mantissa, 256-bit type but the top 64 bits are 4608 * ignored and should be zero. This will probably be 4609 * modified during normalization and rounding. 4610 * @param iExponent Unbiased exponent. 4611 * @param fFcw The FPU control word. 4612 * @param fFsw The FPU status word. 4613 */ 4614 static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa, 4615 int32_t iExponent, uint16_t fFcw, uint16_t fFsw) 4616 { 4617 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0); 4618 4619 iExponent += RTFLOAT80U_EXP_BIAS; 4620 4621 /* Do normalization if necessary and possible. 
*/ 4622 unsigned cShifted = 0; 4623 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63))) 4624 { 4625 int cShift = 192 - RTUInt256BitCount(puMantissa); 4626 if (iExponent > cShift) 4627 iExponent -= cShift; 4628 else 4629 { 4630 if (fFcw & X86_FCW_UM) 4631 { 4632 if (iExponent > 0) 4633 cShift = --iExponent; 4634 else 4635 cShift = 0; 4636 } 4637 iExponent -= cShift; 4638 } 4639 cShifted = cShift; 4640 RTUInt256AssignShiftLeft(puMantissa, cShift); 4641 } 4642 4643 /* Do rounding. */ 4644 uint64_t uMantissa = puMantissa->QWords.qw2; 4645 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0) 4646 { 4647 bool fAdd; 4648 switch (fFcw & X86_FCW_RC_MASK) 4649 { 4650 case X86_FCW_RC_NEAREST: 4651 if (puMantissa->QWords.qw1 & RT_BIT_64(63)) 4652 { 4653 if ( (uMantissa & 1) 4654 || puMantissa->QWords.qw0 != 0 4655 || puMantissa->QWords.qw1 != RT_BIT_64(63)) 4656 { 4657 fAdd = true; 4658 break; 4659 } 4660 uMantissa &= ~(uint64_t)1; 4661 } 4662 fAdd = false; 4663 break; 4664 case X86_FCW_RC_ZERO: 4665 fAdd = false; 4666 break; 4667 case X86_FCW_RC_UP: 4668 fAdd = !fSign; 4669 break; 4670 case X86_FCW_RC_DOWN: 4671 fAdd = fSign; 4672 break; 4673 } 4674 if (fAdd) 4675 { 4676 uint64_t const uTmp = uMantissa; 4677 uMantissa = uTmp + 1; 4678 if (uMantissa < uTmp) 4679 { 4680 uMantissa >>= 1; 4681 uMantissa |= RT_BIT_64(63); 4682 iExponent++; 4683 } 4684 fFsw |= X86_FSW_C1; 4685 } 4686 fFsw |= X86_FSW_PE; 4687 if (!(fFcw & X86_FCW_PM)) 4688 fFsw |= X86_FSW_ES | X86_FSW_B; 4689 } 4690 4691 /* Check for underflow (denormals). */ 4692 if (iExponent <= 0) 4693 { 4694 if (fFcw & X86_FCW_UM) 4695 { 4696 if (uMantissa & RT_BIT_64(63)) 4697 uMantissa >>= 1; 4698 iExponent = 0; 4699 } 4700 else 4701 { 4702 iExponent += RTFLOAT80U_EXP_BIAS_UNDERFLOW_ADJUST; 4703 fFsw |= X86_FSW_ES | X86_FSW_B; 4704 } 4705 fFsw |= X86_FSW_UE; 4706 } 4707 /* Check for overflow */ 4708 else if (iExponent >= RTFLOAT80U_EXP_MAX) 4709 { 4710 Assert(iExponent < RTFLOAT80U_EXP_MAX); 4711 } 4712 4713 /* Compose the result. 
*/ 4714 pr80Dst->s.uMantissa = uMantissa; 4715 pr80Dst->s.uExponent = iExponent; 4716 pr80Dst->s.fSign = fSign; 4717 return fFsw; 4718 } 4719 4720 4721 4722 4723 /********************************************************************************************************************************* 4183 4724 * x86 FPU Division Operations * 4184 4725 *********************************************************************************************************************************/ … … 4702 5243 4703 5244 5245 AssertCompileSize(RTFLOAT128U, 16); 5246 AssertCompileSize(RTFLOAT80U, 10); 5247 AssertCompileSize(RTFLOAT64U, 8); 5248 AssertCompileSize(RTFLOAT32U, 4); 5249 5250 5251 /** 5252 * @code 5253 * 5254 * f(x) = 2^x - 1 = e^(x * ln2) - 1 5255 * 5256 * @endcode 5257 * 5258 * We can approximate e^x by a Taylor/Maclaurin series: 5259 * @code 5260 * 5261 * 5262 * SUM(n=0..inf) x^n/n! = x^0/0! + x^1/1! + x^2/2! + x^3/3! + x^4/4! + ... 5263 * 5264 * 5265 * 5266 * 5267 * = 1 + x + x^2/2! + x^3/3! + x^4/4! + ... 5268 * 5269 * @endcode 5270 * 5271 * Given z = x * ln2, we get: 5272 * @code 5273 * 5274 * 5275 * e^z - 1 = z + z^2/2! + z^3/3! + z^4/4! + ... + z^n/n! 5276 * 5277 * @endcode 5278 * 5279 * Wanting to use Horner's method, we move one z outside and get: 5280 * @code 5281 * 5282 * 5283 * = z * (1 + z/2! + z^2/3! + z^3/4! + ... + z^(n-1)/n!) 5284 * 5285 * @endcode 5286 * 5287 * The constants we need for using Horner's method are 1 and 1 / n!. 5288 * 5289 * For very tiny x values, we can get away with f(x) = x * ln 2, because 5290 * we don't have the necessary precision to represent 1.0 + z/2 + ... 5291 * and can approximate it to be 1.0. For a visual demonstration of this 5292 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long 5293 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2. 
5294 * 5295 * 5296 * As far as constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387 5297 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military 5298 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block) 5299 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter 5300 * blocks). (The one bit difference is probably an implicit one missing from 5301 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F. 5302 * Oberman states that it internally used a 68 bit mantissa with an 18-bit 5303 * exponent. 5304 * 5305 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have not yet 5306 * successfully reproduced the exact results from an Intel 10980XE; there is 5307 * always a portion of rounding differences. Not going to spend too much time 5308 * on getting this 100% the same, at least not now. 5309 * 5310 * P.S. If someone is really curious about 8087 and its constants: 5311 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html 5312 * 5313 * 5314 * @param pr80Val The exponent value (x), less than 1.0, greater than 5315 * -1.0 and not zero. This can be a normal, denormal 5316 * or pseudo-denormal value. 5317 * @param pr80Result Where to return the result. 5318 * @param fFcw FPU control word. 5319 * @param fFsw FPU status word. 5320 */ 5321 static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw) 5322 { 5323 /* As mentioned above, we can skip the expensive polynomial calculation 5324 as it will be close enough to 1.0 that it makes no difference. 5325 5326 The cutoff point for intel 10980XE is exponents <= -69. Intel 5327 also seems to be using a 67-bit or 68-bit constant value, and we get 5328 a smattering of rounding differences if we go for higher precision. 
*/ 5329 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69) 5330 { 5331 RTUINT256U u256; 5332 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa); 5333 u256.QWords.qw0 |= 1; /* force #PE */ 5334 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256, 5335 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val) 5336 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS 5337 : 1 - RTFLOAT80U_EXP_BIAS, 5338 fFcw, fFsw); 5339 } 5340 else 5341 { 5342 #ifdef IEM_WITH_FLOAT128_FOR_FPU 5343 /* This approach is not good enough for small values, we end up with zero. */ 5344 int const fOldRounding = iemFpuF128SetRounding(fFcw); 5345 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw); 5346 _Float128 rd128Result = powf128(2.0L, rd128Val); 5347 rd128Result -= 1.0L; 5348 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw); 5349 iemFpuF128RestoreRounding(fOldRounding); 5350 5351 # else 5352 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val); 5353 5354 /* As mentioned above, enforce 68-bit internal mantissa width to better 5355 match the Intel 10980XE results. */ 5356 unsigned const cPrecision = 68; 5357 5358 /* first calculate z = x * ln2 */ 5359 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision)), cPrecision); 5360 5361 /* Then do the polynomial evaluation. */ 5362 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts), cPrecision); 5363 r = f128_mul(z, r); 5364 5365 /* Output the result. 
*/ 5366 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw); 5367 # endif 5368 } 5369 return fFsw; 5370 } 5371 5372 4704 5373 IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val)) 4705 5374 { 4706 RT_NOREF(pFpuState, pFpuRes, pr80Val); 4707 AssertReleaseFailed(); 5375 uint16_t const fFcw = pFpuState->FCW; 5376 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT); 5377 5378 if (RTFLOAT80U_IS_NORMAL(pr80Val)) 5379 { 5380 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS) 5381 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw); 5382 else 5383 { 5384 /* Special case: 5385 2^+1.0 - 1.0 = 1.0 5386 2^-1.0 - 1.0 = -0.5 */ 5387 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS 5388 && pr80Val->s.uMantissa == RT_BIT_64(63)) 5389 { 5390 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63); 5391 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign; 5392 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign; 5393 } 5394 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */ 5395 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */ 5396 else 5397 pFpuRes->r80Result = *pr80Val; 5398 fFsw |= X86_FSW_PE; 5399 if (!(fFcw & X86_FCW_PM)) 5400 fFsw |= X86_FSW_ES | X86_FSW_B; 5401 } 5402 } 5403 else if ( RTFLOAT80U_IS_ZERO(pr80Val) 5404 || RTFLOAT80U_IS_QUIET_NAN(pr80Val) 5405 || RTFLOAT80U_IS_INDEFINITE(pr80Val)) 5406 pFpuRes->r80Result = *pr80Val; 5407 else if (RTFLOAT80U_IS_INF(pr80Val)) 5408 pFpuRes->r80Result = pr80Val->s.fSign ? 
g_ar80One[1] : *pr80Val; 5409 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val)) 5410 { 5411 fFsw |= X86_FSW_DE; 5412 if (fFcw & X86_FCW_DM) 5413 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw); 5414 else 5415 { 5416 pFpuRes->r80Result = *pr80Val; 5417 fFsw |= X86_FSW_ES | X86_FSW_B; 5418 } 5419 } 5420 else 5421 { 5422 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val) 5423 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val)) 5424 && (fFcw & X86_FCW_IM)) 5425 pFpuRes->r80Result = g_r80Indefinite; 5426 else 5427 { 5428 pFpuRes->r80Result = *pr80Val; 5429 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM)) 5430 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */ 5431 } 5432 fFsw |= X86_FSW_IE; 5433 if (!(fFcw & X86_FCW_IM)) 5434 fFsw |= X86_FSW_ES | X86_FSW_B; 5435 } 5436 pFpuRes->FSW = fFsw; 4708 5437 } 4709 5438 -
trunk/src/VBox/VMM/VMMAll/IEMAllInstructionsOneByte.cpp.h
r94440 r94538 7921 7921 7922 7922 7923 /** Opcode 0xd9 0xf0. */ 7923 /** Opcode 0xd9 0xf0. 7924 * 7925 * The f2xm1 instruction works on values +1.0 thru -1.0, currently (the range on 7926 * 287 & 8087 was +0.5 thru 0.0 according to docs). In addition it does appear 7927 * to produce proper results for +Inf and -Inf. 7928 * 7929 * This is probably useful in the implementation of pow() and similar. 7930 */ 7924 7931 FNIEMOP_DEF(iemOp_f2xm1) 7925 7932 {
Note:
See TracChangeset
for help on using the changeset viewer.

