; $Id: log2f.asm 98103 2023-01-17 14:15:46Z vboxsync $
;; @file
; IPRT - No-CRT log2f - AMD64 & X86.
;

;
; Copyright (C) 2006-2023 Oracle and/or its affiliates.
;
; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
;
; This program is free software; you can redistribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation, in version 3 of the
; License.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, see <https://www.gnu.org/licenses>.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
; in the VirtualBox distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;
; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
;


%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
%include "iprt/x86.mac"


BEGINCODE

extern NAME(RT_NOCRT(feraiseexcept))

;;
; Compute the base-2 logarithm of a single precision value using the x87 FPU.
;
; Special cases, as implemented below:
;   - +1.0:              returns +0.0.
;   - +/-0.0:            raises X86_FSW_ZE via feraiseexcept and returns -Inf.
;   - negative and -Inf: raises X86_FSW_IE via feraiseexcept and returns the
;                        NaN stored at .s_r32NaN.
;   - +Inf, NaN and any other fxam class: the input is returned unchanged.
;
; @returns st(0) / xmm0
; @param    rf      [xSP + xCB*2] / xmm0
RT_NOCRT_BEGINPROC log2f
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
        ; 20h bytes of locals: [xBP - 10h] is the scratch slot used to move the
        ; value between xmm0 and the x87 stack, and on x86 [xSP] receives the
        ; feraiseexcept argument.
        ; NOTE(review): presumably this also doubles as the 32 byte home area
        ; the MSC 64-bit calling convention wants before a call - confirm.
        sub     xSP, 20h
        SEH64_ALLOCATE_STACK 20h
        SEH64_END_PROLOGUE

        ;
        ; Load the input into st0.
        ;
%ifdef RT_ARCH_AMD64
        movss   [xBP - 10h], xmm0           ; xmm0 is left untouched and still holds the input.
        fld     dword [xBP - 10h]
%else
        fld     dword [xBP + xCB*2]
%endif

        ;
        ; Weed out non-normal values.
        ;
        fxam                                ; Classify st0: class in C3/C2/C0, sign in C1.
        fnstsw  ax
        mov     cx, ax                      ; Keep the full status word for the sign (C1) tests below.
        and     ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
        cmp     ax, X86_FSW_C2              ; Normal finite number (excluding zero)
        je      .finite
        cmp     ax, X86_FSW_C3              ; Zero
        je      .zero
        cmp     ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals
        je      .finite
        cmp     ax, X86_FSW_C0 | X86_FSW_C2 ; Infinity.
        je      .inf
        jmp     .nan                        ; Everything else is treated like NaN.

.finite:
        ; Negative number?
        test    cx, X86_FSW_C1
        jnz     .negative

        ; Is it +1.0?
        fld1
        fcomip  st1                         ; Compares 1.0 with the input and pops the 1.0 again.
        jz      .plus_one

        ;
        ; The fyl2xp1 instruction (ST1=ST1*log2(ST0+1.0), popping ST0) has a
        ; valid ST0 range of +/-(1-sqrt(0.5)) (approx 0.29289321881) around
        ; zero.  We use it when the input is close enough to 1.0 for that.
        ;
.above_one:
        ; For both fyl2x and fyl2xp1 we need st1=1.0.
        fld1
        fxch    st0, st1                    ; -> st0=input; st1=1.0

        ; Check if the input is within the fyl2xp1 range (the limits already
        ; include the +1.0, so they are compared against the raw input).
        fld     qword [.s_r64AbsFyL2xP1InputMax xWrtRIP]
        fcomip  st0, st1                    ; max vs input, pops max.
        jbe     .cannot_use_fyl2xp1         ; max <= input

        fld     qword [.s_r64AbsFyL2xP1InputMin xWrtRIP]
        fcomip  st0, st1                    ; min vs input, pops min.
        jae     .cannot_use_fyl2xp1         ; min >= input

        ; Do the calculation.
.use_fyl2xp1:
        fsub    st0, st1                    ; -> st0=input-1; st1=1.0
        fyl2xp1                             ; -> st0=1.0*log2(st0+1.0)
        jmp     .return_val

.cannot_use_fyl2xp1:
        fyl2x                               ; -> st0=1.0*log2(st0)

        ;
        ; Return st0.  On AMD64 the value is moved to xmm0 via the scratch
        ; slot (emptying the x87 stack), on x86 it is returned in st0 as is.
        ;
.return_val:
%ifdef RT_ARCH_AMD64
        fstp    dword [xBP - 10h]
        movss   xmm0, [xBP - 10h]
%endif
.return:
        leave
        ret


        ;
        ; +1.0: Return +0.0.
        ;
.plus_one:
        ffreep  st0                         ; Drop the input.
        fldz
        jmp     .return_val

        ;
        ; Negative numbers: Return NaN and raise invalid operation.
        ;
.negative:
.minus_inf:
        ; Drop the input from the x87 stack first, like the .zero path does,
        ; so the register stack is balanced when we return.
        ffreep  st0

        ; Raise invalid operation
%ifdef RT_ARCH_X86
        mov     dword [xSP], X86_FSW_IE
%elifdef ASM_CALL64_GCC
        mov     edi, X86_FSW_IE
%elifdef ASM_CALL64_MSC
        mov     ecx, X86_FSW_IE
%else
 %error calling conv.
%endif
        call    NAME(RT_NOCRT(feraiseexcept))

        ; Load NaN
%ifdef RT_ARCH_AMD64
        movss   xmm0, [.s_r32NaN xWrtRIP]
%else
        fld     dword [.s_r32NaN xWrtRIP]
%endif
        jmp     .return

        ;
        ; +/-0.0: Return -Inf and raise divide by zero error.
        ;
.zero:
        ffreep  st0                         ; Drop the input.

        ; Raise div/0
%ifdef RT_ARCH_X86
        mov     dword [xSP], X86_FSW_ZE
%elifdef ASM_CALL64_GCC
        mov     edi, X86_FSW_ZE
%elifdef ASM_CALL64_MSC
        mov     ecx, X86_FSW_ZE
%else
 %error calling conv.
%endif
        call    NAME(RT_NOCRT(feraiseexcept))

        ; Load -Inf
%ifdef RT_ARCH_AMD64
        movss   xmm0, [.s_r32MinusInf xWrtRIP]
%else
        fld     dword [.s_r32MinusInf xWrtRIP]
%endif
        jmp     .return

        ;
        ; -Inf: Same as other negative numbers
        ; +Inf: return +Inf.  Join path with NaN.
        ;
.inf:
        test    cx, X86_FSW_C1              ; sign bit
        jnz     .minus_inf

        ;
        ; NaN: Return the input NaN value as is, if we can.
        ;
        ; On AMD64 xmm0 still holds the unmodified input, so only the x87 copy
        ; needs dropping.  On x86 the input is already sitting in st0.
        ;
.nan:
%ifdef RT_ARCH_AMD64
        ffreep  st0
%endif
        jmp     .return

ALIGNCODE(8)
        ;; The fyl2xp1 instruction only works between +/-(1-sqrt(0.5)).
        ; These two variables are that range + 1.0 (rounded inwards), so we
        ; can compare directly with the input w/o any extra fsub and fabs work.
.s_r64AbsFyL2xP1InputMin:
        dq      0.708 ; -0.292 + 1.0
.s_r64AbsFyL2xP1InputMax:
        dq      1.292 ; +0.292 + 1.0
.s_r32MinusInf:
        dd      RTFLOAT32U_INF_MINUS
.s_r32NaN:
        dd      RTFLOAT32U_QNAN_MINUS
ENDPROC   RT_NOCRT(log2f)