; $Id: ASMMultU64ByU32DivByU32.asm 98103 2023-01-17 14:15:46Z vboxsync $
;; @file
; IPRT - Assembly Functions, ASMMultU64ByU32DivByU32.
;

;
; Copyright (C) 2006-2023 Oracle and/or its affiliates.
;
; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
;
; This program is free software; you can redistribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation, in version 3 of the
; License.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, see <https://www.gnu.org/licenses>.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
; in the VirtualBox distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;
; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
;

%include "iprt/asmdefs.mac"


;;
; Multiply a 64-bit integer by a 32-bit integer and divide the result by a
; 32-bit integer, using a 96-bit intermediate result.
;
; The quotient must fit in 64 bits and u32C must be non-zero; otherwise the
; DIV instruction raises a divide error (#DE).
;
; In 64-bit builds the arguments arrive in registers (listed below as
; ASM_CALL64_MSC / other).  In 16-bit and 32-bit builds they are read from the
; stack.
;
; @returns (u64A * u32B) / u32C.
;          64-bit: rax.  32-bit: edx:eax.  16-bit: words in AX:BX:CX:DX with
;          AX holding bits 63:48 and DX holding bits 15:0.
; @param   u64A/rcx/rdi     The 64-bit value.
; @param   u32B/edx/esi     The 32-bit value to multiply A by.
; @param   u32C/r8d/edx     The 32-bit value to divide A*B by.
;
; @cproto  DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
;
RT_BEGINPROC ASMMultU64ByU32DivByU32
%if ARCH_BITS == 64

    ;
    ; Bring the operands into rax, r9 and r8 as full 64-bit values.  A write to
    ; a 32-bit register clears bits 63:32, so the 32-bit moves below guarantee
    ; that no stale upper bits take part in the 64-bit MUL/DIV.
    ;
 %ifdef ASM_CALL64_MSC
    mov     rax, rcx                    ; rax = u64A
    mov     r9d, edx                    ; r9  = u32B, zero-extended
    mov     r8d, r8d                    ; r8  = u32C, zero-extended (same register, clears bits 63:32)
 %else
    mov     rax, rdi                    ; rax = u64A
    mov     r9d, esi                    ; r9  = u32B, zero-extended
    mov     r8d, edx                    ; r8  = u32C, zero-extended (must happen before MUL clobbers rdx)
 %endif
    mul     r9                          ; rdx:rax = u64A * u32B
    div     r8                          ; rax = rdx:rax / u32C; the remainder in rdx is ignored

%else ; 16 or 32 bit
    ;
    ; This implementation is converted from the GCC inline
    ; version of the code. Nothing additional has been done
    ; performance wise.
    ;
 %if ARCH_BITS == 16
    push    bp
    mov     bp, sp
    ; Save all four return registers.  The epilogue patches only the low words
    ; of these slots, so the upper 16 bits of each register survive (paranoia).
    push    eax                         ; saved at [bp - 1*4]
    push    ebx                         ; saved at [bp - 2*4]
    push    ecx                         ; saved at [bp - 3*4]
    push    edx                         ; saved at [bp - 4*4]
 %endif
    push    esi
    push    edi

    ;
    ; Argument locations.  The 32-bit offsets are 04h (past the return address)
    ; plus 08h for the esi/edi pushes above.
    ; NOTE(review): the 16-bit offsets place the first argument at bp + 8;
    ; confirm this against the 16-bit calling convention used by the callers.
    ;
 %if ARCH_BITS == 16
  %define u64A_Lo     [bp + 4 + 04h]
  %define u64A_Hi     [bp + 4 + 08h]
  %define u32B        [bp + 4 + 0ch]
  %define u32C        [bp + 4 + 10h]
 %else
  %define u64A_Lo     [esp + 04h + 08h]
  %define u64A_Hi     [esp + 08h + 08h]
  %define u32B        [esp + 0ch + 08h]
  %define u32C        [esp + 10h + 08h]
 %endif

    ; Load parameters into registers.
    mov     eax, u64A_Lo
    mov     esi, u64A_Hi
    mov     ecx, u32B
    mov     edi, u32C

    ;
    ; The body, following the GCC inline version: two 32x32 multiplies build
    ; the 96-bit product, then two chained 64/32 divides produce the quotient.
    ;
    mul     ecx                         ; eax = u64Lo.lo = (u64A.lo * u32B).lo
                                        ; edx = u64Lo.hi = (u64A.lo * u32B).hi
    xchg    eax, esi                    ; esi = u64Lo.lo
                                        ; eax = u64A.hi
    xchg    edx, edi                    ; edi = u64Lo.hi
                                        ; edx = u32C
    xchg    edx, ecx                    ; ecx = u32C
                                        ; edx = u32B
    mul     edx                         ; eax = u64Hi.lo = (u64A.hi * u32B).lo
                                        ; edx = u64Hi.hi = (u64A.hi * u32B).hi
    add     eax, edi                    ; u64Hi.lo += u64Lo.hi
    adc     edx, 0                      ; u64Hi.hi += carry
    div     ecx                         ; eax = u64Hi / u32C (#DE if this does not fit in 32 bits)
                                        ; edx = u64Hi % u32C
    mov     edi, eax                    ; edi = u64Result.hi = u64Hi / u32C
    mov     eax, esi                    ; eax = u64Lo.lo; edx:eax = remainder:u64Lo.lo
    div     ecx                         ; eax = u64Result.lo (cannot overflow since edx < ecx)
    mov     edx, edi                    ; edx = u64Result.hi

    ; epilogue
    pop     edi
    pop     esi
 %if ARCH_BITS == 16
    ;
    ; The 64-bit result is returned in DX:CX:BX:AX, where DX holds bits 15:0,
    ; CX bits 31:16, BX bits 47:32, and AX bits 63:48.  Store each result word
    ; into the low word of the matching saved-register slot so that the pops
    ; below deliver the result while restoring the original upper halves.
    ;
    mov     [bp - 4*4], ax              ; dx = bits 15:0
    shr     eax, 16
    mov     [bp - 3*4], ax              ; cx = bits 31:16
    mov     [bp - 2*4], dx              ; bx = bits 47:32
    shr     edx, 16
    mov     [bp - 1*4], dx              ; ax = bits 63:48
    pop     edx
    pop     ecx
    pop     ebx
    pop     eax
    leave
 %endif
%endif
    ret
ENDPROC ASMMultU64ByU32DivByU32