VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/math/bignum-amd64-x86.asm

Last change on this file was 98103, checked in by vboxsync, 16 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 27.0 KB
Line 
1; $Id: bignum-amd64-x86.asm 98103 2023-01-17 14:15:46Z vboxsync $
2;; @file
3; IPRT - Big Integer Numbers, AMD64 and X86 Assembly Workers
4;
5
6;
7; Copyright (C) 2006-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; The contents of this file may alternatively be used under the terms
26; of the Common Development and Distribution License Version 1.0
27; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28; in the VirtualBox distribution, in which case the provisions of the
29; CDDL are applicable instead of those of the GPL.
30;
31; You may elect to license modified versions of this file under the
32; terms and conditions of either the GPL or the CDDL or both.
33;
34; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35;
36
37
;*********************************************************************************************************************************
;*  Header Files                                                                                                                 *
;*********************************************************************************************************************************
; RT_ASM_WITH_SEH64 is defined ahead of the includes (presumably it is what
; switches on the SEH64_* prologue macros in asmdefs.mac - confirm there).
%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
%include "internal/bignum.mac"


;*********************************************************************************************************************************
;*  Defined Constants And Macros                                                                                                 *
;*********************************************************************************************************************************
;
; Poison LAHF and SAHF in 64-bit builds: both mnemonics are replaced by macros
; that raise an assembly-time error, so the code below cannot use them by
; accident on AMD64.
;
%ifdef RT_ARCH_AMD64
 %macro lahf 0
  %error "LAHF not supported on ancient AMD64"
 %endmacro
 %macro sahf 0
  %error "SAHF not supported on ancient AMD64"
 %endmacro
%endif


BEGINCODE
60
;;
; Subtracts a number (pauSubtrahend) from a larger number (pauMinuend) and
; stores the result in pauResult.
;
; All three numbers are zero padded such that a borrow can be carried one (or
; two for 64-bit) elements beyond the end of the largest number.
;
; A cUsed of zero is a no-op: nothing is read or written.
;
; @returns  nothing.
; @param    pauResult       x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    pauMinuend      x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    pauSubtrahend   x86:[ebp + 16]  gcc:rdx  msc:r8
; @param    cUsed           x86:[ebp + 20]  gcc:rcx  msc:r9
;
BEGINPROC rtBigNumMagnitudeSubAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
  %define pauResult       rdi
  %define pauMinuend      rsi
  %define pauSubtrahend   rdx
  %define cUsed           ecx
 %else
  %define pauResult       rcx
  %define pauMinuend      rdx
  %define pauSubtrahend   r8
  %define cUsed           r9d
 %endif
        xor     r11d, r11d              ; r11 = byte offset into all three arrays.

 %if RTBIGNUM_ELEMENT_SIZE == 4
        add     cUsed, 1                ; cUsed = RT_ALIGN(cUsed, 2) / 2
        shr     cUsed, 1
 %endif
        test    cUsed, cUsed            ; Empty input: bail out, or the small loop's
        jz      .done                   ; 'dec cUsed' would wrap the counter around.
        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        mov     r10d, cUsed
        shr     r10d, 3                 ; r10d = number of 8-qword rounds.
        clc
        ; Eight qwords per round; CF carries the borrow from one SBB to the next.
.big_loop:
 %assign offElem 0
 %rep 8
        mov     rax, [pauMinuend + r11 + offElem]
        sbb     rax, [pauSubtrahend + r11 + offElem]
        mov     [pauResult + r11 + offElem], rax
  %assign offElem offElem + 8
 %endrep
        lea     r11, [r11 + 64]         ; LEA leaves CF alone.
        dec     r10d                    ; Does not change CF.
        jnz     .big_loop

 %if 0 ; Ancient AMD CPUs do not have lahf/sahf, thus the mess in the %else.
        lahf                            ; Save CF
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Skip CF=0 (clc).
 %else
        ; 'and' clobbers CF, so branch on the pending borrow first and
        ; re-create it with stc afterwards.
        jnc     .no_carry
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        stc
        jmp     .small_loop             ; Skip CF=0 (clc).
.no_carry:
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
 %endif
.small_job:
        clc
.small_loop:
        mov     rax, [pauMinuend + r11]
        sbb     rax, [pauSubtrahend + r11]
        mov     [pauResult + r11], rax
        lea     r11, [r11 + 8]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done                   ; A borrow out of the last element is unexpected.
        int3
 %endif
.done:

%elifdef RT_ARCH_X86
        push    edi
        push    esi
        push    ebx

        mov     edi, [ebp + 08h]        ; pauResult
 %define pauResult       edi
        mov     ecx, [ebp + 0ch]        ; pauMinuend
 %define pauMinuend      ecx
        mov     edx, [ebp + 10h]        ; pauSubtrahend
 %define pauSubtrahend   edx
        mov     esi, [ebp + 14h]        ; cUsed
 %define cUsed           esi

        xor     ebx, ebx                ; ebx = byte offset into all three arrays.

        test    cUsed, cUsed            ; Empty input: bail out, or the small loop's
        jz      .done                   ; 'dec cUsed' would wrap the counter around.
        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        shr     cUsed, 3                ; cUsed = number of 8-dword rounds.
        clc
        ; Eight dwords per round; CF carries the borrow from one SBB to the next.
.big_loop:
 %assign offElem 0
 %rep 8
        mov     eax, [pauMinuend + ebx + offElem]
        sbb     eax, [pauSubtrahend + ebx + offElem]
        mov     [pauResult + ebx + offElem], eax
  %assign offElem offElem + 4
 %endrep
        lea     ebx, [ebx + 32]         ; LEA leaves CF alone.
        dec     cUsed                   ; Does not change CF.
        jnz     .big_loop

        lahf                            ; Save CF in AH; the 'and' below clobbers it.
        mov     cUsed, [ebp + 14h]      ; Reload the count: up to seven odd rounds.
        and     cUsed, 7
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Skip CF=0 (clc).

.small_job:
        clc
.small_loop:
        mov     eax, [pauMinuend + ebx]
        sbb     eax, [pauSubtrahend + ebx]
        mov     [pauResult + ebx], eax
        lea     ebx, [ebx + 4]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done                   ; A borrow out of the last element is unexpected.
        int3
 %endif
.done:

        pop     ebx
        pop     esi
        pop     edi
%else
 %error "Unsupported arch"
%endif

        leave
        ret
%undef pauResult
%undef pauMinuend
%undef pauSubtrahend
%undef cUsed
%undef offElem
ENDPROC rtBigNumMagnitudeSubAssemblyWorker
251
252
253
;;
; Subtracts a number (pauSubtrahend) from a larger number (pauResultMinuend),
; storing the difference in place in pauResultMinuend.
;
; Both numbers are zero padded such that a borrow can be carried one (or
; two for 64-bit) elements beyond the end of the largest number.
;
; A cUsed of zero is a no-op: nothing is read or written.
;
; @returns  nothing.
; @param    pauResultMinuend    x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    pauSubtrahend       x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    cUsed               x86:[ebp + 16]  gcc:rdx  msc:r8
;
BEGINPROC rtBigNumMagnitudeSubThisAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
  %define pauResultMinuend  rdi
  %define pauSubtrahend     rsi
  %define cUsed             edx
 %else
  %define pauResultMinuend  rcx
  %define pauSubtrahend     rdx
  %define cUsed             r8d
 %endif
        xor     r11d, r11d              ; r11 = byte offset into both arrays.

 %if RTBIGNUM_ELEMENT_SIZE == 4
        add     cUsed, 1                ; cUsed = RT_ALIGN(cUsed, 2) / 2
        shr     cUsed, 1
 %endif
        test    cUsed, cUsed            ; Empty input: bail out, or the small loop's
        jz      .done                   ; 'dec cUsed' would wrap the counter around.
        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        mov     r10d, cUsed
        shr     r10d, 3                 ; r10d = number of 8-qword rounds.
        clc
        ; Eight qwords per round; CF carries the borrow from one SBB to the next.
.big_loop:
 %assign offElem 0
 %rep 8
        mov     rax, [pauSubtrahend + r11 + offElem]
        sbb     [pauResultMinuend + r11 + offElem], rax
  %assign offElem offElem + 8
 %endrep
        lea     r11, [r11 + 64]         ; LEA leaves CF alone.
        dec     r10d                    ; Does not change CF.
        jnz     .big_loop

 %if 0 ; Ancient AMD CPUs do not have lahf/sahf, thus the mess in the %else.
        lahf                            ; Save CF
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Skip CF=0 (clc).
 %else
        ; 'and' clobbers CF, so branch on the pending borrow first and
        ; re-create it with stc afterwards.
        jnc     .no_carry
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
        stc
        jmp     .small_loop             ; Skip CF=0 (clc).
.no_carry:
        and     cUsed, 7                ; Up to seven odd rounds.
        jz      .done
 %endif
.small_job:
        clc
.small_loop:
        mov     rax, [pauSubtrahend + r11]
        sbb     [pauResultMinuend + r11], rax
        lea     r11, [r11 + 8]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done                   ; A borrow out of the last element is unexpected.
        int3
 %endif
.done:

%elifdef RT_ARCH_X86
        push    edi
        push    ebx

        mov     edi, [ebp + 08h]        ; pauResultMinuend
 %define pauResultMinuend  edi
        mov     edx, [ebp + 0ch]        ; pauSubtrahend
 %define pauSubtrahend     edx
        mov     ecx, [ebp + 10h]        ; cUsed
 %define cUsed             ecx

        xor     ebx, ebx                ; ebx = byte offset into both arrays.

        test    cUsed, cUsed            ; Empty input: bail out, or the small loop's
        jz      .done                   ; 'dec cUsed' would wrap the counter around.
        cmp     cUsed, 8                ; Skip the big loop if small number.
        jb      .small_job

        shr     cUsed, 3                ; cUsed = number of 8-dword rounds.
        clc
        ; Eight dwords per round; CF carries the borrow from one SBB to the next.
.big_loop:
 %assign offElem 0
 %rep 8
        mov     eax, [pauSubtrahend + ebx + offElem]
        sbb     [pauResultMinuend + ebx + offElem], eax
  %assign offElem offElem + 4
 %endrep
        lea     ebx, [ebx + 32]         ; LEA leaves CF alone.
        dec     cUsed                   ; Does not change CF.
        jnz     .big_loop

        lahf                            ; Save CF in AH; the 'and' below clobbers it.
        mov     cUsed, [ebp + 10h]      ; Reload the count: up to seven odd rounds.
        and     cUsed, 7
        jz      .done
        sahf                            ; Restore CF.
        jmp     .small_loop             ; Skip CF=0 (clc).

.small_job:
        clc
.small_loop:
        mov     eax, [pauSubtrahend + ebx]
        sbb     [pauResultMinuend + ebx], eax
        lea     ebx, [ebx + 4]
        dec     cUsed                   ; Does not change CF.
        jnz     .small_loop
 %ifdef RT_STRICT
        jnc     .done                   ; A borrow out of the last element is unexpected.
        int3
 %endif
.done:

        pop     ebx
        pop     edi
%else
 %error "Unsupported arch"
%endif

        leave
        ret
%undef pauResultMinuend
%undef pauSubtrahend
%undef cUsed
%undef offElem
ENDPROC rtBigNumMagnitudeSubThisAssemblyWorker
415
416
;;
; Rotates every element of an array one bit to the left through the carry
; flag, i.e. shifts the whole magnitude left by one bit with uCarry entering
; at bit 0 of the first element.
;
; The elements are processed with RTBIGNUM_ELEMENT_SIZE sized operands, eight
; per round while at least eight remain and one at a time for the rest.
;
; @returns  The final carry value: 1 if a bit was shifted out of the last
;           element, 0 if not.  For an empty array uCarry is returned as-is.
; @param    pauElements     x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    cUsed           x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    uCarry          x86:[ebp + 16]  gcc:rdx  msc:r8
;
BEGINPROC rtBigNumMagnitudeShiftLeftOneAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
  %define pauElements   rdi
  %define cUsed         esi
  %define uCarry        edx
 %else
  %define pauElements   rcx
  %define cUsed         edx
  %define uCarry        r8d
 %endif
%elifdef RT_ARCH_X86
 %define pauElements    ecx
        mov     pauElements, [ebp + 08h]
 %define cUsed          edx
        mov     cUsed, [ebp + 0ch]
 %define uCarry         eax
        mov     uCarry, [ebp + 10h]
%else
 %error "Unsupported arch."
%endif
        ; Eight or more elements go through the unrolled loop first.
        cmp     cUsed, 8
        jae     .unrolled_init

        ; Fewer than eight: bail out on an empty array, else go one by one.
        test    cUsed, cUsed
        jz      .empty
        jmp     .single_init

        ;
        ; Unrolled loop - eight elements per round.
        ;
.unrolled_init:
%ifdef RT_ARCH_AMD64
        mov     r11d, cUsed             ; Keep the full count for the tail calculation.
%endif
        shr     cUsed, 3                ; cUsed = number of 8-element rounds.
        test    uCarry, uCarry          ; CF = 0 (TEST always clears it) ...
        jz      .unrolled_loop
        stc                             ; ... unless a carry-in was requested.
.unrolled_loop:
%assign offElem 0
%if RTBIGNUM_ELEMENT_SIZE == 8
 %rep 8
        rcl     qword [pauElements + offElem], 1
  %assign offElem offElem + 8
 %endrep
        lea     pauElements, [pauElements + 64]
%else
 %rep 8
        rcl     dword [pauElements + offElem], 1
  %assign offElem offElem + 4
 %endrep
        lea     pauElements, [pauElements + 32]
%endif
        dec     cUsed                   ; DEC and LEA both leave CF untouched.
        jnz     .unrolled_loop

        ; Work out the number of leftover elements.  The 'and' destroys CF, so
        ; the flags are parked on the stack around it.
        pushf
%ifdef RT_ARCH_AMD64
        mov     cUsed, r11d
%else
        mov     cUsed, [ebp + 0ch]
%endif
        and     cUsed, 7
        jz      .no_tail                ; Nothing left, just produce the result.
        popf                            ; CF is live again.
        jmp     .single_loop            ; Finish the odd elements.
.no_tail:
        popf
        jmp     .set_return

        ;
        ; Single stepping loop - one element per round.
        ;
.single_init:
        test    uCarry, uCarry          ; CF = 0 (TEST always clears it) ...
        jz      .single_loop
        stc                             ; ... unless a carry-in was requested.
.single_loop:
%if RTBIGNUM_ELEMENT_SIZE == 8
        rcl     qword [pauElements], 1
        lea     pauElements, [pauElements + 8]
%else
        rcl     dword [pauElements], 1
        lea     pauElements, [pauElements + 4]
%endif
        dec     cUsed                   ; Leaves CF untouched.
        jnz     .single_loop

        ;
        ; Convert CF into the return value.  MOV is used rather than XOR to
        ; zero eax because XOR would wipe out the CF we are about to test.
        ;
.set_return:
        mov     eax, 0
        jnc     .leave
        inc     eax
.leave:
        leave
        ret

        ; Empty array: the carry-in is handed straight back.
.empty:
        mov     eax, uCarry
        jmp     .leave
%undef offElem
ENDPROC rtBigNumMagnitudeShiftLeftOneAssemblyWorker
542
543
;;
; Performs a 128-bit by 64-bit division on 64-bit hosts and a 64-bit by
; 32-bit division on 32-bit hosts.
;
; The quotient is written as two elements (low element first, high element at
; offset RTBIGNUM_ELEMENT_SIZE), the remainder as a single element.
;
; @returns  nothing.
; @param    puQuotient      x86:[ebp +  8]  gcc:rdi  msc:rcx            Double element.
; @param    puRemainder     x86:[ebp + 12]  gcc:rsi  msc:rdx            Normal element.
; @param    uDividendHi     x86:[ebp + 16]  gcc:rdx  msc:r8
; @param    uDividendLo     x86:[ebp + 20]  gcc:rcx  msc:r9
; @param    uDivisor        x86:[ebp + 24]  gcc:r8   msc:[rbp + 30h]
;
BEGINPROC rtBigNumElement2xDiv2xBy1x
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %if RTBIGNUM_ELEMENT_SIZE == 4
  %error "sorry not implemented yet."
  sorry not implemented yet.
 %endif

        ; DIV wants the dividend in rdx:rax, so everything else is moved out
        ; of the way of those two registers.
 %define uDividendHi    rdx
 %define uDividendLo    rax
 %ifdef ASM_CALL64_GCC
  %define puQuotient    rdi
  %define puRemainder   rsi
  %define uDivisor      r8
        mov     uDividendLo, rcx
 %else
  %define puQuotient    rcx
  %define puRemainder   r11
  %define uDivisor      r10
        mov     puRemainder, rdx        ; Must be saved before rdx is overwritten below.
        mov     uDivisor, [rbp + 30h]   ; Fifth argument lives on the stack.
        mov     uDividendHi, r8
        mov     uDividendLo, r9
 %endif

%elifdef RT_ARCH_X86
        push    edi
        push    ebx

 %define puQuotient     edi
        mov     puQuotient, [ebp + 08h]
 %define puRemainder    ebx
        mov     puRemainder, [ebp + 0ch]
 %define uDividendHi    edx
        mov     uDividendHi, [ebp + 10h]
 %define uDividendLo    eax
        mov     uDividendLo, [ebp + 14h]
 %define uDivisor       ecx
        mov     uDivisor, [ebp + 18h]
%else
 %error "Unsupported arch."
%endif

%ifdef RT_STRICT
        ;
        ; The divisor shall not be zero.
        ;
        test    uDivisor, uDivisor
        jnz     .divisor_ok
        int3
.divisor_ok:
%endif

        ;
        ; Compute the high quotient element.  It is zero when the high dividend
        ; is below the divisor.  Otherwise divide 0:Hi by the divisor first;
        ; that leaves Hi % divisor in xDX, which is below the divisor, so the
        ; DIV further down cannot raise a divide overflow.
        ;
        mov     RTBIGNUM_ELEMENT_PRE [puQuotient + RTBIGNUM_ELEMENT_SIZE], 0
        cmp     uDividendHi, uDivisor
        jb      .low_divide
        push    xAX                     ; Park the low dividend.
        mov     xAX, xDX
        xor     edx, edx
        div     uDivisor
        mov     RTBIGNUM_ELEMENT_PRE [puQuotient + RTBIGNUM_ELEMENT_SIZE], xAX
        pop     xAX

        ;
        ; Main division: xDX:xAX / divisor.  Store the low quotient element and
        ; the remainder.
        ;
.low_divide:
        div     uDivisor
        mov     RTBIGNUM_ELEMENT_PRE [puQuotient], xAX
        mov     RTBIGNUM_ELEMENT_PRE [puRemainder], xDX


%ifdef RT_ARCH_X86
        pop     ebx
        pop     edi
%endif
        leave
        ret
ENDPROC rtBigNumElement2xDiv2xBy1x
642
643
;;
; Performs the core of long multiplication.
;
; For every multiplier element, the element is multiplied with each
; multiplicand element in turn and the double-width product is added into the
; result at the matching position, rippling any carry upwards.
;
; @returns  nothing.
; @param    pauResult       x86:[ebp +  8]  gcc:rdi  msc:rcx            Initialized to zero.
; @param    pauMultiplier   x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    cMultiplier     x86:[ebp + 16]  gcc:rdx  msc:r8
; @param    pauMultiplicand x86:[ebp + 20]  gcc:rcx  msc:r9
; @param    cMultiplicand   x86:[ebp + 24]  gcc:r8   msc:[rbp + 30h]
;
BEGINPROC rtBigNumMagnitudeMultiplyAssemblyWorker
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %if RTBIGNUM_ELEMENT_SIZE == 4
  %error "sorry not implemented yet."
  sorry not implemented yet.
 %endif

        ; MUL clobbers rdx:rax, so whatever arrives in rdx is moved elsewhere.
 %ifdef ASM_CALL64_GCC
  %define pauResult         rdi
  %define pauMultiplier     rsi
  %define cMultiplier       r9
  %define pauMultiplicand   rcx
  %define cMultiplicand     r8
  %define uMultiplier       r10
  %define iMultiplicand     r11
        mov     r9d, edx                ; cMultiplier (32-bit write zero-extends).
        mov     r8d, r8d                ; cMultiplicand - paranoia
 %else
  %define pauResult         rcx
  %define pauMultiplier     r11
  %define cMultiplier       r8
  %define pauMultiplicand   r9
  %define cMultiplicand     r10
  %define uMultiplier       r12
  %define iMultiplicand     r13
        mov     pauMultiplier, rdx
        mov     r10d, dword [rbp + 30h] ; cMultiplicand
        mov     r8d, r8d                ; cMultiplier - paranoia
        ; NOTE(review): r12 and r13 are pushed after SEH64_END_PROLOGUE, so the
        ; unwind info presumably does not describe them - confirm that is okay.
        push    r12
        push    r13
 %endif

%elifdef RT_ARCH_X86
        push    edi
        push    esi
        push    ebx
        sub     esp, 10h                ; Room for the uMultiplier scratch slot.
 %define pauResult          edi
 %define pauMultiplier      dword [ebp + 0ch]
 %define cMultiplier        dword [ebp + 10h]
 %define pauMultiplicand    ecx
 %define cMultiplicand      dword [ebp + 18h]
 %define uMultiplier        dword [ebp - 10h]
 %define iMultiplicand      ebx
        mov     pauResult, [ebp + 08h]
        mov     pauMultiplicand, [ebp + 14h]

%else
 %error "Unsupported arch."
%endif

        ;
        ; An empty multiplicand means there is nothing to do.  Checking it here
        ; lets the inner loop test its condition at the bottom only.
        ;
        cmp     cMultiplicand, 0
        je      .finish

        ;
        ; Outer loop, one round per multiplier element:
        ;       while (cMultiplier-- > 0)
        ;
.outer_loop:
        cmp     cMultiplier, 0
        jz      .finish
        dec     cMultiplier

        ; uMultiplier = *pauMultiplier
%ifdef RT_ARCH_X86
        mov     edx, pauMultiplier
        mov     eax, [edx]
        mov     uMultiplier, eax
%else
        mov     uMultiplier, [pauMultiplier]
%endif

        ;
        ; Inner loop:
        ;       for (iMultiplicand = 0; iMultiplicand < cMultiplicand; iMultiplicand++)
        ;
        xor     iMultiplicand, iMultiplicand
.inner_loop:
        mov     xAX, [pauMultiplicand + iMultiplicand * RTBIGNUM_ELEMENT_SIZE]
        mul     uMultiplier             ; xDX:xAX = element * uMultiplier
        add     [pauResult + iMultiplicand * RTBIGNUM_ELEMENT_SIZE], xAX
        adc     [pauResult + iMultiplicand * RTBIGNUM_ELEMENT_SIZE + RTBIGNUM_ELEMENT_SIZE], xDX
        jnc     .inner_next

        ; Carry out of the high half: keep adding it into the following
        ; elements until it is absorbed.  xDX is free to use as the index now.
        lea     xDX, [iMultiplicand + 2]
.ripple_carry:
        adc     RTBIGNUM_ELEMENT_PRE [pauResult + xDX * RTBIGNUM_ELEMENT_SIZE], 0
        inc     xDX                     ; INC leaves CF untouched.
        jc      .ripple_carry

.inner_next:
        inc     iMultiplicand           ; iMultiplicand++
        cmp     iMultiplicand, cMultiplicand ; iMultiplicand < cMultiplicand
        jb      .inner_loop

        ; Step to the next multiplier element and the next result position.
        add     pauMultiplier, RTBIGNUM_ELEMENT_SIZE
        add     pauResult, RTBIGNUM_ELEMENT_SIZE
        jmp     .outer_loop

.finish:

%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_GCC
 %else
        pop     r13
        pop     r12
 %endif
%elifdef RT_ARCH_X86
        add     esp, 10h
        pop     ebx
        pop     esi
        pop     edi
%endif
        leave
        ret
ENDPROC rtBigNumMagnitudeMultiplyAssemblyWorker
775
;;
; Assembly implementation of the D4 step of Knuth's division algorithm.
;
; This subtracts Divisor * Qhat from the dividend at the current J index,
; working in place on cDivisor + 1 dividend elements.
;
; @returns  true if negative result (unlikely), false if positive.
; @param    pauDividendJ    x86:[ebp +  8]  gcc:rdi  msc:rcx
; @param    pauDivisor      x86:[ebp + 12]  gcc:rsi  msc:rdx
; @param    cDivisor        x86:[ebp + 16]  gcc:edx  msc:r8d
; @param    uQhat           x86:[ebp + 20]  gcc:rcx  msc:r9
;
BEGINPROC rtBigNumKnuthD4_MulSub
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
SEH64_END_PROLOGUE

%ifdef RT_ARCH_AMD64
 %if RTBIGNUM_ELEMENT_SIZE == 4
  %error "sorry not implemented yet."
  sorry not implemented yet.
 %endif

        ; MUL clobbers rdx:rax, so whatever arrives in rdx is moved elsewhere.
 %ifdef ASM_CALL64_GCC
  %define pauDividendJ  rdi
  %define pauDivisor    rsi
  %define cDivisor      r8
  %define uQhat         rcx
  %define uMulCarry     r11
        mov     r8d, edx                ; cDivisor
 %else
  %define pauDividendJ  rcx
  %define pauDivisor    r10
  %define cDivisor      r8
  %define uQhat         r9
  %define uMulCarry     r11
        mov     r10, rdx                ; pauDivisor
        mov     r8d, r8d                ; cDivisor - paranoia
 %endif

%elifdef RT_ARCH_X86
        push    edi
        push    esi
        push    ebx
 %define pauDividendJ   edi
 %define pauDivisor     esi
 %define cDivisor       ecx
 %define uQhat          dword [ebp + 14h]
 %define uMulCarry      ebx
        mov     pauDividendJ, [ebp + 08h]
        mov     pauDivisor, [ebp + 0ch]
        mov     cDivisor, [ebp + 10h]
%else
 %error "Unsupported arch."
%endif

%ifdef RT_STRICT
        ;
        ; Sanity: the loop below is do-while shaped and needs at least one
        ; divisor element.
        ;
        cmp     cDivisor, 0
        jne     .divisor_count_ok
        int3
.divisor_count_ok:
%endif

        ;
        ; No carry going into the first element.
        ;
        xor     uMulCarry, uMulCarry

        ;
        ; do ... while (--cDivisor > 0);
        ;
.mulsub_loop:
        ; xDX:xAX = uQhat * *pauDivisor
        mov     xAX, uQhat
        mul     RTBIGNUM_ELEMENT_PRE [pauDivisor]
        ; xDX:xAX += uMulCarry; the high half becomes the next carry.
        add     xAX, uMulCarry
        adc     xDX, 0
        mov     uMulCarry, xDX
        ; *pauDividendJ -= low half; a borrow is folded into the carry.
        sub     [pauDividendJ], xAX
        adc     uMulCarry, 0
%ifdef RT_STRICT
        jnc     .carry_ok               ; The carry itself must never overflow.
        int3
.carry_ok:
%endif

        ; Next element pair.
        add     pauDividendJ, RTBIGNUM_ELEMENT_SIZE
        add     pauDivisor, RTBIGNUM_ELEMENT_SIZE
        dec     cDivisor
        jnz     .mulsub_loop

        ;
        ; Final dividend element (no corresponding divisor element): subtract
        ; the remaining carry and turn the resulting borrow into 0 or 1 in eax.
        ;
        sub     [pauDividendJ], uMulCarry
        sbb     eax, eax                ; eax = CF ? -1 : 0
        and     eax, 1

%ifdef RT_ARCH_AMD64
%elifdef RT_ARCH_X86
        pop     ebx
        pop     esi
        pop     edi
%endif
        leave
        ret
ENDPROC rtBigNumKnuthD4_MulSub
891
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
ContactPrivacy policyTerms of Use