[96240] | 1 | ; $Id: sincore.asm 98103 2023-01-17 14:15:46Z vboxsync $
|
---|
| 2 | ;; @file
|
---|
| 3 | ; IPRT - No-CRT common sin & cos - AMD64 & X86.
|
---|
| 4 | ;
|
---|
| 5 |
|
---|
| 6 | ;
|
---|
[98103] | 7 | ; Copyright (C) 2022-2023 Oracle and/or its affiliates.
|
---|
[96240] | 8 | ;
|
---|
[96407] | 9 | ; This file is part of VirtualBox base platform packages, as
|
---|
| 10 | ; available from https://www.virtualbox.org.
|
---|
[96240] | 11 | ;
|
---|
[96407] | 12 | ; This program is free software; you can redistribute it and/or
|
---|
| 13 | ; modify it under the terms of the GNU General Public License
|
---|
| 14 | ; as published by the Free Software Foundation, in version 3 of the
|
---|
| 15 | ; License.
|
---|
| 16 | ;
|
---|
| 17 | ; This program is distributed in the hope that it will be useful, but
|
---|
| 18 | ; WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
| 20 | ; General Public License for more details.
|
---|
| 21 | ;
|
---|
| 22 | ; You should have received a copy of the GNU General Public License
|
---|
| 23 | ; along with this program; if not, see <https://www.gnu.org/licenses>.
|
---|
| 24 | ;
|
---|
[96240] | 25 | ; The contents of this file may alternatively be used under the terms
|
---|
| 26 | ; of the Common Development and Distribution License Version 1.0
|
---|
[96407] | 27 | ; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
|
---|
| 28 | ; in the VirtualBox distribution, in which case the provisions of the
|
---|
[96240] | 29 | ; CDDL are applicable instead of those of the GPL.
|
---|
| 30 | ;
|
---|
| 31 | ; You may elect to license modified versions of this file under the
|
---|
| 32 | ; terms and conditions of either the GPL or the CDDL or both.
|
---|
| 33 | ;
|
---|
[96407] | 34 | ; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
|
---|
| 35 | ;
|
---|
[96240] | 36 |
|
---|
| 37 |
|
---|
| 38 | %define RT_ASM_WITH_SEH64
|
---|
| 39 | %include "iprt/asmdefs.mac"
|
---|
| 40 | %include "iprt/x86.mac"
|
---|
| 41 |
|
---|
| 42 |
|
---|
| 43 | BEGINCODE
|
---|
| 44 |
|
---|
| 45 | ;;
|
---|
| 46 | ; Internal sine and cosine worker that calculates the sine of st0 returning
|
---|
| 47 | ; it in st0.
|
---|
| 48 | ;
|
---|
| 49 | ; When called by a sine function, fabs(st0) >= pi/2.
|
---|
| 50 | ; When called by a cosine function, fabs(original input value) >= 3pi/8.
|
---|
| 51 | ;
|
---|
| 52 | ; That the input isn't a tiny number close to zero, means that we can do a bit
|
---|
| 53 | ; cruder rounding when operating close to a pi/2 boundrary. The value in the
|
---|
| 54 | ; ecx register indicates the input precision and controls the crudeness of the
|
---|
| 55 | ; rounding.
|
---|
| 56 | ;
|
---|
| 57 | ; @returns st0 = sine
|
---|
| 58 | ; @param st0 A finite number to calucate sine of.
|
---|
| 59 | ; @param ecx Set to 0 if original input was a 32-bit float.
|
---|
| 60 | ; Set to 1 if original input was a 64-bit double.
|
---|
| 61 | ; set to 2 if original input was a 80-bit long double.
|
---|
| 62 | ;
|
---|
| 63 | BEGINPROC rtNoCrtMathSinCore
|
---|
| 64 | push xBP
|
---|
| 65 | SEH64_PUSH_xBP
|
---|
| 66 | mov xBP, xSP
|
---|
| 67 | SEH64_SET_FRAME_xBP 0
|
---|
| 68 | SEH64_END_PROLOGUE
|
---|
| 69 |
|
---|
| 70 | ;
|
---|
| 71 | ; Load the pointer to the rounding crudeness factor into xDX.
|
---|
| 72 | ;
|
---|
| 73 | lea xDX, [.s_ar64NearZero xWrtRIP]
|
---|
| 74 | lea xDX, [xDX + xCX * xCB]
|
---|
| 75 |
|
---|
| 76 | ;
|
---|
| 77 | ; Finite number. We want it in the range [0,2pi] and will preform
|
---|
| 78 | ; a remainder division if it isn't.
|
---|
| 79 | ;
|
---|
| 80 | fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi
|
---|
| 81 | fnstsw ax
|
---|
| 82 | test ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case);
|
---|
| 83 | jz .reduce_st0 ; Jump if st0 > mem
|
---|
| 84 |
|
---|
| 85 | fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0
|
---|
| 86 | fnstsw ax
|
---|
| 87 | test ax, X86_FSW_C3 | X86_FSW_C0
|
---|
| 88 | jnz .reduce_st0 ; Jump if st0 <= mem
|
---|
| 89 |
|
---|
| 90 | ;
|
---|
| 91 | ; We get here if st0 is in the [0,2pi] range.
|
---|
| 92 | ;
|
---|
| 93 | ; Now, FSIN is documented to be reasonably accurate for the range
|
---|
| 94 | ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
|
---|
| 95 | ; in that range only.
|
---|
| 96 | ;
|
---|
| 97 | .in_range:
|
---|
| 98 | ; if (st0 < pi)
|
---|
| 99 | fldpi
|
---|
| 100 | fcom st1 ; compares st0 (pi) with st1 (the normalized value)
|
---|
| 101 | fnstsw ax
|
---|
| 102 | test ax, X86_FSW_C0 ; st1 > pi
|
---|
| 103 | jnz .larger_than_pi
|
---|
| 104 | test ax, X86_FSW_C3
|
---|
| 105 | jnz .equals_pi
|
---|
| 106 |
|
---|
| 107 | ;
|
---|
| 108 | ; input in the range [0,pi[
|
---|
| 109 | ;
|
---|
| 110 | .smaller_than_pi:
|
---|
| 111 | fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
|
---|
| 112 |
|
---|
| 113 | ; if (st0 < pi/2)
|
---|
| 114 | fcom st1 ; compares st0 (pi/2) with st1
|
---|
| 115 | fnstsw ax
|
---|
| 116 | test ax, X86_FSW_C0 ; st1 > pi
|
---|
| 117 | jnz .between_half_pi_and_pi
|
---|
| 118 | test ax, X86_FSW_C3
|
---|
| 119 | jnz .equals_half_pi
|
---|
| 120 |
|
---|
| 121 | ;
|
---|
| 122 | ; The value is between zero and half pi, including the zero value.
|
---|
| 123 | ;
|
---|
| 124 | ; This is in range where FSIN works reasonably reliably. So drop the
|
---|
| 125 | ; half pi in st0 and do the calculation.
|
---|
| 126 | ;
|
---|
| 127 | .between_zero_and_half_pi:
|
---|
| 128 | ; Check if we're so close to pi/2 that it makes no difference.
|
---|
| 129 | fsub st0, st1 ; st0 = pi/2 - st1
|
---|
| 130 | fcom qword [xDX]
|
---|
| 131 | fnstsw ax
|
---|
| 132 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 133 | jnz .equals_half_pi
|
---|
| 134 | ffreep st0
|
---|
| 135 |
|
---|
| 136 | ; Check if we're so close to zero that it makes no difference given the
|
---|
| 137 | ; internal accuracy of the FPU.
|
---|
| 138 | fcom qword [xDX]
|
---|
| 139 | fnstsw ax
|
---|
| 140 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 141 | jnz .equals_zero_popped_one
|
---|
| 142 |
|
---|
| 143 | ; Ok, calculate sine.
|
---|
| 144 | fsin
|
---|
| 145 | jmp .return
|
---|
| 146 |
|
---|
| 147 | ;
|
---|
| 148 | ; The value is in the range ]pi/2,pi[
|
---|
| 149 | ;
|
---|
| 150 | ; This is outside the comfortable FSIN range, but if we subtract PI and
|
---|
| 151 | ; move to the ]-pi/2,0[ range we just have to change the sign to get
|
---|
| 152 | ; the value we want.
|
---|
| 153 | ;
|
---|
| 154 | .between_half_pi_and_pi:
|
---|
| 155 | ; Check if we're so close to pi/2 that it makes no difference.
|
---|
| 156 | fsubr st0, st1 ; st0 = st1 - st0
|
---|
| 157 | fcom qword [xDX]
|
---|
| 158 | fnstsw ax
|
---|
| 159 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 160 | jnz .equals_half_pi
|
---|
| 161 | ffreep st0
|
---|
| 162 |
|
---|
| 163 | ; Check if we're so close to pi that it makes no difference.
|
---|
| 164 | fldpi
|
---|
| 165 | fsub st0, st1 ; st0 = st0 - st1
|
---|
| 166 | fcom qword [xDX]
|
---|
| 167 | fnstsw ax
|
---|
| 168 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 169 | jnz .equals_pi
|
---|
| 170 | ffreep st0
|
---|
| 171 |
|
---|
| 172 | ; Ok, transform the value and calculate sine.
|
---|
| 173 | fldpi
|
---|
| 174 | fsubp st1, st0
|
---|
| 175 |
|
---|
| 176 | fsin
|
---|
| 177 | fchs
|
---|
| 178 | jmp .return
|
---|
| 179 |
|
---|
| 180 | ;
|
---|
| 181 | ; input in the range ]pi,2pi[
|
---|
| 182 | ;
|
---|
| 183 | .larger_than_pi:
|
---|
| 184 | fsub st1, st0 ; st1 -= pi
|
---|
| 185 | fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
|
---|
| 186 |
|
---|
| 187 | ; if (st0 < pi/2)
|
---|
| 188 | fcom st1 ; compares st0 (pi/2) with reduced st1
|
---|
| 189 | fnstsw ax
|
---|
| 190 | test ax, X86_FSW_C0 ; st1 > pi
|
---|
| 191 | jnz .between_3_half_pi_and_2pi
|
---|
| 192 | test ax, X86_FSW_C3
|
---|
| 193 | jnz .equals_3_half_pi
|
---|
| 194 |
|
---|
| 195 | ;
|
---|
| 196 | ; The value is in the the range: ]pi,3pi/2[
|
---|
| 197 | ;
|
---|
| 198 | ; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay
|
---|
| 199 | ; and we can get the desired result by changing the sign (-FSIN).
|
---|
| 200 | ;
|
---|
| 201 | .between_pi_and_3_half_pi:
|
---|
| 202 | ; Check if we're so close to pi/2 that it makes no difference.
|
---|
| 203 | fsub st0, st1 ; st0 = pi/2 - st1
|
---|
| 204 | fcom qword [xDX]
|
---|
| 205 | fnstsw ax
|
---|
| 206 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 207 | jnz .equals_3_half_pi
|
---|
| 208 | ffreep st0
|
---|
| 209 |
|
---|
| 210 | ; Check if we're so close to zero that it makes no difference given the
|
---|
| 211 | ; internal accuracy of the FPU.
|
---|
| 212 | fcom qword [xDX]
|
---|
| 213 | fnstsw ax
|
---|
| 214 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 215 | jnz .equals_pi_popped
|
---|
| 216 |
|
---|
| 217 | ; Ok, calculate sine and flip the sign.
|
---|
| 218 | fsin
|
---|
| 219 | fchs
|
---|
| 220 | jmp .return
|
---|
| 221 |
|
---|
| 222 | ;
|
---|
| 223 | ; The value is in the last pi/2 of the range: ]3pi/2,2pi[
|
---|
| 224 | ;
|
---|
| 225 | ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
|
---|
| 226 | ; subtract pi again (we subtracted pi at .larger_than_pi above) and
|
---|
| 227 | ; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.)
|
---|
| 228 | ;
|
---|
| 229 | .between_3_half_pi_and_2pi:
|
---|
| 230 | ; Check if we're so close to pi/2 that it makes no difference.
|
---|
| 231 | fsubr st0, st1 ; st0 = st1 - st0
|
---|
| 232 | fcom qword [xDX]
|
---|
| 233 | fnstsw ax
|
---|
| 234 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 235 | jnz .equals_3_half_pi
|
---|
| 236 | ffreep st0
|
---|
| 237 |
|
---|
| 238 | ; Check if we're so close to pi that it makes no difference.
|
---|
| 239 | fldpi
|
---|
| 240 | fsub st0, st1 ; st0 = st0 - st1
|
---|
| 241 | fcom qword [xDX]
|
---|
| 242 | fnstsw ax
|
---|
| 243 | test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
|
---|
| 244 | jnz .equals_2pi
|
---|
| 245 | ffreep st0
|
---|
| 246 |
|
---|
| 247 | ; Ok, adjust input and calculate sine.
|
---|
| 248 | fldpi
|
---|
| 249 | fsubp st1, st0
|
---|
| 250 | fsin
|
---|
| 251 | jmp .return
|
---|
| 252 |
|
---|
| 253 | ;
|
---|
| 254 | ; sin(0) = 0
|
---|
| 255 | ; sin(pi) = 0
|
---|
| 256 | ;
|
---|
| 257 | .equals_zero:
|
---|
| 258 | .equals_pi:
|
---|
| 259 | .equals_2pi:
|
---|
| 260 | ffreep st0
|
---|
| 261 | .equals_zero_popped_one:
|
---|
| 262 | .equals_pi_popped:
|
---|
| 263 | ffreep st0
|
---|
| 264 | fldz
|
---|
| 265 | jmp .return
|
---|
| 266 |
|
---|
| 267 | ;
|
---|
| 268 | ; sin(pi/2) = 1
|
---|
| 269 | ;
|
---|
| 270 | .equals_half_pi:
|
---|
| 271 | ffreep st0
|
---|
| 272 | ffreep st0
|
---|
| 273 | fld1
|
---|
| 274 | jmp .return
|
---|
| 275 |
|
---|
| 276 | ;
|
---|
| 277 | ; sin(3*pi/2) = -1
|
---|
| 278 | ;
|
---|
| 279 | .equals_3_half_pi:
|
---|
| 280 | ffreep st0
|
---|
| 281 | ffreep st0
|
---|
| 282 | fld1
|
---|
| 283 | fchs
|
---|
| 284 | jmp .return
|
---|
| 285 |
|
---|
| 286 | ;
|
---|
| 287 | ; Return.
|
---|
| 288 | ;
|
---|
| 289 | .return:
|
---|
| 290 | leave
|
---|
| 291 | ret
|
---|
| 292 |
|
---|
| 293 | ;
|
---|
| 294 | ; Reduce st0 by reminder division by PI*2. The result should be positive here.
|
---|
| 295 | ;
|
---|
| 296 | ;; @todo this is one of our weak spots (really any calculation involving PI is).
|
---|
| 297 | .reduce_st0:
|
---|
| 298 | fldpi
|
---|
| 299 | fadd st0, st0
|
---|
| 300 | fxch st1 ; st0=input (dividend) st1=2pi (divisor)
|
---|
| 301 | .again:
|
---|
| 302 | fprem1
|
---|
| 303 | fnstsw ax
|
---|
| 304 | test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result.
|
---|
| 305 | jnz .again ; Loop till C2 == 0 and we have a final result.
|
---|
| 306 |
|
---|
| 307 | ;
|
---|
| 308 | ; Make sure the result is positive.
|
---|
| 309 | ;
|
---|
| 310 | fxam
|
---|
| 311 | fnstsw ax
|
---|
| 312 | test ax, X86_FSW_C1 ; The sign bit
|
---|
| 313 | jz .reduced_to_positive
|
---|
| 314 |
|
---|
| 315 | fadd st0, st1 ; st0 += 2pi, which should make it positive
|
---|
| 316 |
|
---|
| 317 | %ifdef RT_STRICT
|
---|
| 318 | fxam
|
---|
| 319 | fnstsw ax
|
---|
| 320 | test ax, X86_FSW_C1
|
---|
| 321 | jz .reduced_to_positive
|
---|
| 322 | int3
|
---|
| 323 | %endif
|
---|
| 324 |
|
---|
| 325 | .reduced_to_positive:
|
---|
| 326 | fstp st1 ; Get rid of the 2pi value.
|
---|
| 327 | jmp .in_range
|
---|
| 328 |
|
---|
| 329 | ALIGNCODE(8)
|
---|
| 330 | .s_r64Max:
|
---|
| 331 | dq +6.28318530717958647692 ; 2*pi
|
---|
| 332 | .s_r64Min:
|
---|
| 333 | dq 0.0
|
---|
| 334 | .s_r64Two:
|
---|
| 335 | dq 2.0
|
---|
| 336 | ;;
|
---|
| 337 | ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
|
---|
| 338 | ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
|
---|
| 339 | ; PI constant used during reduction/whatever, I think we can round to a whole pi/2
|
---|
| 340 | ; step when we get close enough.
|
---|
| 341 | ;
|
---|
| 342 | ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
|
---|
| 343 | ; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa,
|
---|
| 344 | ; we only have to set the exponent to get a valid number.
|
---|
| 345 | ;
|
---|
| 346 | .s_ar64NearZero:
|
---|
| 347 | ;; @todo check how sensible these really are...
|
---|
| 348 | dq (-18 + 1023) << 52 ; float / 32-bit / single precision input
|
---|
| 349 | dq (-40 + 1023) << 52 ; double / 64-bit / double precision input
|
---|
| 350 | dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input
|
---|
| 351 | ENDPROC rtNoCrtMathSinCore
|
---|
| 352 |
|
---|