1  /** @file


2  * IPRT - Assembly Routines for Optimizing some Integers Math Operations.


3  */


4 


5  /*


6  * Copyright (C) 2006-2023 Oracle and/or its affiliates.


7  *


8  * This file is part of VirtualBox base platform packages, as


9  * available from https://www.virtualbox.org.


10  *


11  * This program is free software; you can redistribute it and/or


12  * modify it under the terms of the GNU General Public License


13  * as published by the Free Software Foundation, in version 3 of the


14  * License.


15  *


16  * This program is distributed in the hope that it will be useful, but


17  * WITHOUT ANY WARRANTY; without even the implied warranty of


18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU


19  * General Public License for more details.


20  *


21  * You should have received a copy of the GNU General Public License


22  * along with this program; if not, see <https://www.gnu.org/licenses>.


23  *


24  * The contents of this file may alternatively be used under the terms


25  * of the Common Development and Distribution License Version 1.0


26  * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included


27  * in the VirtualBox distribution, in which case the provisions of the


28  * CDDL are applicable instead of those of the GPL.


29  *


30  * You may elect to license modified versions of this file under the


31  * terms and conditions of either the GPL or the CDDL or both.


32  *


33  * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0


34  */


35 


36  #ifndef IPRT_INCLUDED_asm_math_h


37  #define IPRT_INCLUDED_asm_math_h


38  #ifndef RT_WITHOUT_PRAGMA_ONCE


39  # pragma once


40  #endif


41 


42  #include <iprt/types.h>


43 


44  #if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN


45  /* Emit the intrinsics at all optimization levels. */


46  # include <iprt/sanitized/intrin.h>


47  # pragma intrinsic(__emul)


48  # pragma intrinsic(__emulu)


49  # ifdef RT_ARCH_AMD64


50  # pragma intrinsic(_mul128)


51  # pragma intrinsic(_umul128)


52  # endif


53  #endif


54 


55 


56  /** @defgroup grp_rt_asm_math Integer Math Optimizations


57  * @ingroup grp_rt_asm


58  * @{ */


59 


/**
 * Multiplies two unsigned 32-bit values returning an unsigned 64-bit result.
 *
 * @returns u32F1 * u32F2.
 * @param   u32F1   The first factor.
 * @param   u32F2   The second factor.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
#else
DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
{
# ifdef RT_ARCH_X86
    uint64_t u64;
#  if RT_INLINE_ASM_GNU_STYLE
    /* MULL multiplies EAX by the operand, leaving the 64-bit product in
       EDX:EAX; the "=A" constraint binds that register pair to u64. */
    __asm__ __volatile__("mull %%edx"
                         : "=A" (u64)
                         : "a" (u32F2), "d" (u32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    u64 = __emulu(u32F1, u32F2);
#  else
    /* MSVC inline assembly: store the EDX:EAX product into u64 by halves. */
    __asm
    {
        mov     edx, [u32F1]
        mov     eax, [u32F2]
        mul     edx
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif
    return u64;
# else  /* generic: */
    /* On 64-bit capable targets the compiler emits the widening multiply. */
    return (uint64_t)u32F1 * u32F2;
# endif
}
#endif


95 


96 


/**
 * Multiplies two signed 32-bit values returning a signed 64-bit result.
 *
 * @returns i32F1 * i32F2.
 * @param   i32F1   The first factor.
 * @param   i32F2   The second factor.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
#else
DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
{
# ifdef RT_ARCH_X86
    int64_t i64;
#  if RT_INLINE_ASM_GNU_STYLE
    /* Signed variant of the 32x32->64 multiply: IMULL leaves the product
       in EDX:EAX, bound to i64 via the "=A" register-pair constraint. */
    __asm__ __volatile__("imull %%edx"
                         : "=A" (i64)
                         : "a" (i32F2), "d" (i32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    i64 = __emul(i32F1, i32F2);
#  else
    /* MSVC inline assembly: store the EDX:EAX product into i64 by halves. */
    __asm
    {
        mov     edx, [i32F1]
        mov     eax, [i32F2]
        imul    edx
        mov     dword ptr [i64], eax
        mov     dword ptr [i64 + 4], edx
    }
#  endif
    return i64;
# else /* generic: */
    return (int64_t)i32F1 * i32F2;
# endif
}
#endif


131 


132 


133  DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)


134  {


135  #if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE  RT_INLINE_ASM_USES_INTRIN)


136  # if RT_INLINE_ASM_GNU_STYLE


137  uint64_t u64Low, u64High;


138  __asm__ __volatile__("mulq %%rdx"


139  : "=a" (u64Low), "=d" (u64High)


140  : "0" (u64F1), "1" (u64F2));


141  *pu64ProdHi = u64High;


142  return u64Low;


143  # elif RT_INLINE_ASM_USES_INTRIN


144  return _umul128(u64F1, u64F2, pu64ProdHi);


145  # else


146  # error "hmm"


147  # endif


148  #else /* generic: */


149  /*


150  * F1 * F2 = Prod


151  *  


152  * ab * cd = b*d + a*d*10 + b*c*10 + a*c*100


153  *


154  * Where a, b, c and d are 'digits', and 10 is max digit + 1.


155  *


156  * Our digits are 32bit wide, so instead of 10 we multiply by 4G.


157  * Prod = F1.s.Lo*F2.s.Lo + F1.s.Hi*F2.s.Lo*4G


158  * + F1.s.Lo*F2.s.Hi*4G + F1.s.Hi*F2.s.Hi*4G*4G


159  */


160  RTUINT128U Prod;


161  RTUINT64U Tmp1;


162  uint64_t u64Tmp;


163  RTUINT64U F1, F2;


164  F1.u = u64F1;


165  F2.u = u64F2;


166 


167  Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);


168 


169  Tmp1.u = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);


170  u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;


171  Prod.DWords.dw1 = (uint32_t)u64Tmp;


172  Prod.s.Hi = Tmp1.s.Hi;


173  Prod.s.Hi += u64Tmp >> 32; /* carry */


174 


175  Tmp1.u = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);


176  u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;


177  Prod.DWords.dw1 = (uint32_t)u64Tmp;


178  u64Tmp >>= 32; /* carry */


179  u64Tmp += Prod.DWords.dw2;


180  u64Tmp += Tmp1.s.Hi;


181  Prod.DWords.dw2 = (uint32_t)u64Tmp;


182  Prod.DWords.dw3 += u64Tmp >> 32; /* carry */


183 


184  Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);


185  *pu64ProdHi = Prod.s.Hi;


186  return Prod.s.Lo;


187  #endif


188  }


189 


190 


191 


/**
 * Divides a 64-bit unsigned by a 32-bit unsigned returning an unsigned 32-bit result.
 *
 * @returns u64 / u32.
 * @param   u64     The dividend.
 * @param   u32     The divisor.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    /* DIVL divides EDX:EAX (the 64-bit dividend, via "A") by the operand;
       quotient lands in EAX, remainder in the clobbered EDX dummy. */
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (u32), "=d"(uDummy)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], eax
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 / u32);
# endif
}
#endif


224 


225 


/**
 * Divides a 64-bit signed by a 32-bit signed returning a signed 32-bit result.
 *
 * @returns i64 / i32.
 * @param   i64     The dividend.
 * @param   i32     The divisor.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    /* IDIVL divides EDX:EAX (the 64-bit dividend, via "A") by the operand;
       quotient lands in EAX, remainder in the clobbered EDX dummy. */
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (i32), "=d"(iDummy)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], eax
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 / i32);
# endif
}
#endif


258 


259 


/**
 * Performs 64-bit unsigned by a 32-bit unsigned division with a 32-bit unsigned result,
 * returning the rest.
 *
 * @returns u64 % u32.
 * @param   u64     The dividend.
 * @param   u32     The divisor.
 *
 * @remarks It is important that the result is <= UINT32_MAX or we'll overflow and crash.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    /* Same DIVL pattern as the division helper, but here the quotient is
       the dummy and the remainder (EDX) is the value we keep. */
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (uDummy), "=d"(u32)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], edx
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 % u32);
# endif
}
#endif


295 


296 


/**
 * Performs 64-bit signed by a 32-bit signed division with a 32-bit signed result,
 * returning the rest.
 *
 * @returns i64 % i32.
 * @param   i64     The dividend.
 * @param   i32     The divisor.
 *
 * @remarks It is important that the result is <= UINT32_MAX or we'll overflow and crash.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    /* Same IDIVL pattern as the division helper, but here the quotient is
       the dummy and the remainder (EDX) is the value we keep. */
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (iDummy), "=d"(i32)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], edx
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 % i32);
# endif
}
#endif


332 


333 


334  /**


335  * Multiple a 32bit by a 32bit integer and divide the result by a 32bit integer


336  * using a 64 bit intermediate result.


337  *


338  * @returns (u32A * u32B) / u32C.


339  * @param u32A The 32bit value (A).


340  * @param u32B The 32bit value to multiple by A.


341  * @param u32C The 32bit value to divide A*B by.


342  *


343  * @remarks Architecture specific.


344  * @remarks Make sure the result won't ever exceed 32bit, because hardware


345  * exception may be raised if it does.


346  * @remarks On x86 this may be used to avoid dragging in 64bit builtin


347  * arithmetics functions.


348  */


349  #if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64)  defined(RT_ARCH_X86))


350  DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);


351  #else


352  DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)


353  {


354  # if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64)  defined(RT_ARCH_X86))


355  uint32_t u32Result, u32Spill;


356  __asm__ __volatile__("mull %2\n\t"


357  "divl %3\n\t"


358  : "=&a" (u32Result),


359  "=&d" (u32Spill)


360  : "r" (u32B),


361  "r" (u32C),


362  "0" (u32A));


363  return u32Result;


364  # else


365  return (uint32_t)(((uint64_t)u32A * u32B) / u32C);


366  # endif


367  }


368  #endif


369 


370 


371  /**


372  * Multiple a 64bit by a 32bit integer and divide the result by a 32bit integer


373  * using a 96 bit intermediate result.


374  *


375  * @returns (u64A * u32B) / u32C.


376  * @param u64A The 64bit value.


377  * @param u32B The 32bit value to multiple by A.


378  * @param u32C The 32bit value to divide A*B by.


379  *


380  * @remarks Architecture specific.


381  * @remarks Make sure the result won't ever exceed 64bit, because hardware


382  * exception may be raised if it does.


383  * @remarks On x86 this may be used to avoid dragging in 64bit builtin


384  * arithmetics function.


385  */


386  #if RT_INLINE_ASM_EXTERNAL  !defined(__GNUC__)  (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))


387  DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);


388  #else


389  DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)


390  {


391  # if RT_INLINE_ASM_GNU_STYLE


392  # ifdef RT_ARCH_AMD64


393  uint64_t u64Result, u64Spill;


394  __asm__ __volatile__("mulq %2\n\t"


395  "divq %3\n\t"


396  : "=&a" (u64Result),


397  "=&d" (u64Spill)


398  : "r" ((uint64_t)u32B),


399  "r" ((uint64_t)u32C),


400  "0" (u64A));


401  return u64Result;


402  # else


403  uint32_t u32Dummy;


404  uint64_t u64Result;


405  __asm__ __volatile__("mull %%ecx \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo


406  edx = u64Lo.hi = (u64A.lo * u32B).hi */


407  "xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo


408  eax = u64A.hi */


409  "xchg %%edx,%%edi \n\t" /* edi = u64Low.hi


410  edx = u32C */


411  "xchg %%edx,%%ecx \n\t" /* ecx = u32C


412  edx = u32B */


413  "mull %%edx \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo


414  edx = u64Hi.hi = (u64A.hi * u32B).hi */


415  "addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */


416  "adcl $0,%%edx \n\t" /* u64Hi.hi += carry */


417  "divl %%ecx \n\t" /* eax = u64Hi / u32C


418  edx = u64Hi % u32C */


419  "movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */


420  "movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */


421  "divl %%ecx \n\t" /* u64Result.lo */


422  "movl %%edi,%%edx \n\t" /* u64Result.hi */


423  : "=A"(u64Result), "=c"(u32Dummy),


424  "=S"(u32Dummy), "=D"(u32Dummy)


425  : "a"((uint32_t)u64A),


426  "S"((uint32_t)(u64A >> 32)),


427  "c"(u32B),


428  "D"(u32C));


429  return u64Result;


430  # endif


431  # else


432  RTUINT64U u;


433  uint64_t u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B;


434  uint64_t u64Hi = (uint64_t)(u64A >> 32) * u32B;


435  u64Hi += (u64Lo >> 32);


436  u.s.Hi = (uint32_t)(u64Hi / u32C);


437  u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C);


438  return u.u;


439  # endif


440  }


441  #endif


442 


443  /** @} */


444  #endif /* !IPRT_INCLUDED_asm_math_h */


445 

