;==========================================================
;Useful Routines
;==========================================================
;   FloatMul_80
;   fpOP1_Mul_fpOP2
;   DE_Times_C      (if not using Karatsuba multiplication)
;   KaratsubaMul32
;   Karatsuba64
;   BC_Times_DE
;==========================================================
;Special case multiplications
;==========================================================
;inf*inf = inf
;inf*0   = NAN
;inf*NAN = NAN
;inf*fin = inf
;0  *0   = 0
;0  *NAN = NAN
;0  *fin = 0
;NAN*NAN = NAN
;NAN*fin = NAN
;==========================================================
;Layout as used by this file (word at fpOPn+12 = sign in bit 15 plus
;a 15-bit exponent; byte at fpOPn+11 carries the special-value flags
;when the 15-bit exponent is zero):
;   bit 7 clear            -> zero
;   bit 7 set, bit 6 clear -> NAN
;   bit 7 set, bit 6 set   -> inf
;==========================================================

FloatMul_80:
;Input:
;   HL points to one number
;   DE points to another
;Output:
;   fpOP3 = product (bytes fpOP3+0..+11 from the top 96 bits of the
;   128-bit integer product, sign/exponent word at fpOP3+12)
;LoadFPOPs is defined elsewhere; presumably it copies (HL),(DE) into
;fpOP1/fpOP2 -- confirm against its definition.
    call LoadFPOPs
fpOP1_Mul_fpOP2:
;Input: operands already in fpOP1 and fpOP2.
    ld hl,(fpOP1+12)
    ld de,(fpOP2+12)
    ld a,h
    xor d
    push af                 ;bit 7 of A = sign of the product
    res 7,d
    res 7,h
;A zero 15-bit exponent marks zero/inf/NAN. AF is still on the stack
;when the special-case handlers are entered.
;NOTE(review): presumably divoverflow/divunderflow and the instruction
;at SetfpOP3_NAN-1 pop it -- confirm.
    ld a,h \ or a \ jr nz,$+6 \ or l \ jp z,casemul
    ld a,d \ or a \ jr nz,$+6 \ or e \ jp z,casemul2
    pop af
    and 80h                 ;keep only the sign bit
    xor $C0                 ;NOTE(review): looks like bias removal ($C000 = -$4000) -- confirm
    add hl,de               ;sum of the two exponents
    add a,h
    ld h,a
    ld (fpOP3+12),hl
; now we multiply the 64-bit ints at fpOP1+2, fpOP2+2
#IF multiply == Karatsuba
;Karatsuba64 reads (word64_1),(word64_2).
;NOTE(review): nothing here copies the mantissas, so those names are
;presumably equates onto the fpOP1/fpOP2 mantissas -- confirm.
    call Karatsuba64
#ELSE
;Schoolbook multiply: one byte of fpOP1's mantissa times the four
;16-bit words of fpOP2's mantissa, accumulated into outp128.
;DE_Times_C returns AHL = C*DE with B = 0; DE_Times_C+2 skips the
;"ld b,0" and relies on B still being 0.
;
;Row 0. The carry byte of each step is parked in the next (still
;unused) accumulator byte and read back as E with D forced to 0.
;FIX: the second step read (fpOP4) instead of (outp128+2), and the last
;step used offsets +8/+10 instead of +6/+8.
    ld bc,(fpOP1+2)
    ld de,(fpOP2+2)
    call DE_Times_C
    ld (outp128),hl \ ld (outp128+2),a
    ld de,(fpOP2+4)
    call DE_Times_C+2
    ld de,(outp128+2) \ ld d,b \ add hl,de \ adc a,b
    ld (outp128+2),hl \ ld (outp128+4),a
    ld de,(fpOP2+6)
    call DE_Times_C+2
    ld de,(outp128+4) \ ld d,b \ add hl,de \ adc a,b
    ld (outp128+4),hl \ ld (outp128+6),a
    ld de,(fpOP2+8)
    call DE_Times_C+2
    ld de,(outp128+6) \ ld d,b \ add hl,de \ adc a,b
    ld (outp128+6),hl \ ld (outp128+8),a

;Row x (x = 1..7): outp128+x .. outp128+7+x already hold data from the
;previous rows, outp128+8+x is not yet used.
;FIX: the old macro added the product's high byte to an accumulator
;byte with an 8-bit "adc" and dropped the carry out of it. Now the
;running carry byte lives in outp128+8+x; each step computes
;   product (<= $FEFF01) + carry byte (<= $FF) + accumulator word (<= $FFFF)
;which is at most $FFFFFF, so "adc a,b" can never overflow.
;The 16-bit read of (outp128+8+x) fetches one byte past the slot; that
;byte is discarded by "ld d,b".
#macro MulByte(x)
    ld bc,(fpOP1+2+x)
    ld de,(fpOP2+2)
    call DE_Times_C
    ld de,(outp128+x) \ add hl,de \ adc a,b
    ld (outp128+x),hl \ ld (outp128+8+x),a
    ld de,(fpOP2+4)
    call DE_Times_C+2
    ld de,(outp128+8+x) \ ld d,b \ add hl,de \ adc a,b
    ld de,(outp128+2+x) \ add hl,de \ adc a,b
    ld (outp128+2+x),hl \ ld (outp128+8+x),a
    ld de,(fpOP2+6)
    call DE_Times_C+2
    ld de,(outp128+8+x) \ ld d,b \ add hl,de \ adc a,b
    ld de,(outp128+4+x) \ add hl,de \ adc a,b
    ld (outp128+4+x),hl \ ld (outp128+8+x),a
    ld de,(fpOP2+8)
    call DE_Times_C+2
    ld de,(outp128+8+x) \ ld d,b \ add hl,de \ adc a,b
    ld de,(outp128+6+x) \ add hl,de \ adc a,b
    ld (outp128+6+x),hl \ ld (outp128+8+x),a
#endmacro
    MulByte(1)
    MulByte(2)
    MulByte(3)
    MulByte(4)
    MulByte(5)
    MulByte(6)
    MulByte(7)
#ENDIF

;Normalise: copy the top 12 bytes of outp128 to fpOP3+0..+11.
;If bit 127 is set the bytes are copied as they are and the exponent is
;incremented; otherwise everything is shifted left one bit on the way.
    ld de,fpOP3
    ld a,(outp128+15) \ rla     ;carry = bit 127 of the product
    ld hl,outp128+3
    jr c,NoMulAdjust
;Shift outp128+3..+14 left in place; LDI leaves the carry flag alone,
;so the bit chain survives each copy.
    sla (hl) \ inc hl
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
    rl (hl) \ ldi
;FIX: the top byte used to be stored from the A left over by the "rla"
;above, whose bit 0 was a stale carry rather than the bit shifted out
;of outp128+14. Rebuild it from memory plus the live carry.
    ld a,(hl) \ adc a,a
    ld (de),a
    ret
NoMulAdjust:
    inc hl                      ;HL = outp128+4
    ldi \ ldi
    ldi \ ldi
    ldi \ ldi
    ldi \ ldi
    ldi \ ldi
    ldi \ ldi
    ld hl,(fpOP3+12) \ inc hl \ ld (fpOP3+12),hl
    ret

#IF multiply != Karatsuba
DE_Times_C:
;Input:  C, DE
;Output: AHL = C*DE, B = 0; C and DE are not modified
;Entering at DE_Times_C+2 skips "ld b,0" (caller must have B = 0).
    ld b,0
    ld h,b \ ld l,b \ ld a,c
    add a,a \ jr nc,$+4 \ ld h,d \ ld l,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ ret nc \ add hl,de \ adc a,b \ ret
#ENDIF

;----------------------------------------------------------
;Special cases. On entry DE = fpOP2's exponent word with the sign bit
;cleared, and the sign of the product is still pushed on the stack.
;----------------------------------------------------------
casemul:
;fpOP1 is inf/nan/0
    ld a,(fpOP1+11)
    add a,a \ jp nc,zeromul     ;bit 7 clear: fpOP1 is zero
    jp p,SetfpOP3_NAN-1         ;bit 6 clear: fpOP1 is NAN
;infmul
    ld a,d \ or a \ jp nz,divoverflow   ;inf*fin = inf
    or e \ jp nz,divoverflow
    ld a,(fpOP2+11)
    add a,a \ jp nc,SetfpOP3_NAN-1      ;inf*0   = NAN
    jp p,SetfpOP3_NAN-1                 ;inf*NAN = NAN
    jp divoverflow                      ;inf*inf = inf
zeromul:
    ld a,d \ or a \ jp nz,divunderflow  ;0*fin = 0
    or e \ jp nz,divunderflow
    ld a,(fpOP2+11)
    add a,a \ jp nc,divunderflow        ;0*0 = 0
    jp SetfpOP3_NAN-1                   ;0*inf, 0*NAN = NAN
casemul2:
;finite times inf/nan/0
    ld a,(fpOP2+11)
    add a,a \ jp nc,divunderflow        ;fin*0   = 0
    jp p,SetfpOP3_NAN-1                 ;fin*NAN = NAN
    jp divoverflow                      ;fin*inf = inf
.echo "FloatMul Size:",$-FloatMul_80

#IF multiply == Karatsuba
Karatsuba64:
;Input: (word64_1),(word64_2)
;Output: (outp128)
;11039 t-states worst
;Three 32x32 products through KaratsubaMul32:
;   low*low   -> outp128+0..+7
;   high*high -> outp128+8..+15
;   (low+high)*(low+high), minus the two above, added in at outp128+4.
;Overwrites (word32_1),(word32_2) and (outp64).
    ld hl,(word64_1) \ ld (word32_1),hl
    ld hl,(word64_1+2) \ ld (word32_1+2),hl
    ld hl,(word64_2) \ ld (word32_2),hl
    ld hl,(word64_2+2) \ ld (word32_2+2),hl
    call KaratsubaMul32
    ld hl,(outp64) \ ld (outp128),hl
    ld hl,(outp64+2) \ ld (outp128+2),hl
    ld hl,(outp64+4) \ ld (outp128+4),hl
    ld hl,(outp64+6) \ ld (outp128+6),hl
    ld hl,(word64_1+4) \ ld (word32_1),hl
    ld hl,(word64_1+6) \ ld (word32_1+2),hl
    ld hl,(word64_2+4) \ ld (word32_2),hl
    ld hl,(word64_2+6) \ ld (word32_2+2),hl
    call KaratsubaMul32
    ld hl,(outp64) \ ld (outp128+8),hl
    ld hl,(outp64+2) \ ld (outp128+10),hl
    ld hl,(outp64+4) \ ld (outp128+12),hl
    ld hl,(outp64+6) \ ld (outp128+14),hl
;512+2*Karatsuba
;Form the two 32-bit sums low+high. The carry of the first sum is kept
;in A (0 or 1), the carry of the second in the carry flag.
    xor a
    ld hl,(word64_1)
    ld de,(word64_1+4)
    add hl,de
    ld (word32_1),hl
    ld hl,(word64_1+2)
    ld de,(word64_1+6)
    adc hl,de
    ld (word32_1+2),hl
    rla
    ld hl,(word64_2)
    ld de,(word64_2+4)
    add hl,de
    ld (word32_2),hl
    ld hl,(word64_2+2)
    ld de,(word64_2+6)
    adc hl,de
    ld (word32_2+2),hl
    push af
    call KaratsubaMul32
;790+3*Karatsuba
    pop af
    ld c,a                  ;C = carry of the first sum
    ld a,0                  ;A collects bits 64 and up of the middle product
    jr nc,Addmore_2
;Second sum overflowed: add the (truncated) first sum to the upper half.
    ld a,c
    ld bc,(word64_1)
    ld hl,(word64_1+4)
    add hl,bc
    ex de,hl
    ld bc,(word64_1+2)
    ld hl,(word64_1+6)
    adc hl,bc
    ex de,hl
    ld bc,(outp64+4)
    add hl,bc
    ld (outp64+4),hl
    ex de,hl
    ld bc,(outp64+6)
    adc hl,bc
    ld (outp64+6),hl
    ld c,a
    ld a,0
    adc a,c
Addmore_2:
    rr c                    ;carry = carry of the first sum
    jr nc,label_2
;First sum overflowed: add the (truncated) second sum to the upper half.
    ld bc,(word64_2)
    ld hl,(word64_2+4)
    add hl,bc
    ex de,hl
    ld bc,(word64_2+2)
    ld hl,(word64_2+6)
    adc hl,bc
    ex de,hl
    ld bc,(outp64+4)
    add hl,bc
    ld (outp64+4),hl
    ex de,hl
    ld bc,(outp64+6)
    adc hl,bc
    ld (outp64+6),hl
label_2:
    ld d,0 \ adc a,d
;(outp64) - (outp128) - (outp128+8)
    ld hl,(outp64)
    ld bc,(outp128)
    sbc hl,bc
    ld (outp64),hl
    ld hl,(outp64+2)
    ld bc,(outp128+2)
    sbc hl,bc
    ld (outp64+2),hl
    ld hl,(outp64+4)
    ld bc,(outp128+4)
    sbc hl,bc
    ld (outp64+4),hl
    ld hl,(outp64+6)
    ld bc,(outp128+6)
    sbc hl,bc
    ld (outp64+6),hl
    sbc a,d
    ld hl,(outp64)
    ld bc,(outp128+8)
    sbc hl,bc
    ld (outp64),hl
    ld hl,(outp64+2)
    ld bc,(outp128+10)
    sbc hl,bc
    ld (outp64+2),hl
    ld hl,(outp64+4)
    ld bc,(outp128+12)
    sbc hl,bc
    ld (outp64+4),hl
    ld hl,(outp64+6)
    ld bc,(outp128+14)
    sbc hl,bc
    ld (outp64+6),hl
    sbc a,d
;(outp64) + (outp128+4)
    ld hl,(outp64)
    ld bc,(outp128+4)
    add hl,bc
    ld (outp128+4),hl
    ld hl,(outp64+2)
    ld bc,(outp128+6)
    adc hl,bc
    ld (outp128+6),hl
    ld hl,(outp64+4)
    ld bc,(outp128+8)
    adc hl,bc
    ld (outp128+8),hl
    ld hl,(outp64+6)
    ld bc,(outp128+10)
    adc hl,bc
    ld (outp128+10),hl
    ld hl,(outp128+12)
    ld e,a                  ;DE = overflow byte (D is still 0)
    adc hl,de
    ld (outp128+12),hl
    ret nc
    ld hl,(outp128+14)
    inc hl
    ld (outp128+14),hl
    ret
.echo " Karatsuba64:",$-Karatsuba64

KaratsubaMul32:
;Input: (word32_1), (word32_2)
;Output: (outp64)
;2931 t-states worst case, 2345 best
;Previous best optimized: 3666 t-states worst case, 2880 lower bound
;has a bug. For example, in pi*e, multiplying the upper 32 bits of each ends in A189, but should be a288
;NOTE(review): it is not clear whether the bug note above refers to this
;version or to the "previous best" one -- confirm before relying on it.
;NOTE(review): results are written through "outp" while the header and
;Karatsuba64 use "outp64"; presumably the two names are the same
;address -- confirm.
    ld bc,(word32_1)
    ld de,(word32_2)
    call BC_Times_DE        ;BHLA = low*low
    ld c,h
    ld h,l
    ld l,a
    ld (outp),hl
    ld (outp+2),bc
    ld bc,(word32_1+2)
    ld de,(word32_2+2)
    call BC_Times_DE        ;BHLA = high*high
    ld c,h
    ld h,l
    ld l,a
    ld (outp+4),hl
    ld (outp+6),bc
;Sums low+high: carry of the first in A, carry of the second in the
;carry flag, both saved across the multiply.
    xor a
    ld hl,(word32_1)
    ld bc,(word32_1+2)
    add hl,bc
    rla
    ex de,hl
    ld hl,(word32_2)
    ld bc,(word32_2+2)
    add hl,bc
    ld b,h
    ld c,l
    push af
    call BC_Times_DE
    ld e,h
    ld d,b
    ld h,l
    ld l,a
;DEHL
    pop af
    push hl
    ld c,a
    ld a,0
    jr nc,Addmore_1
;(ax+b)(cx+d) = acx^2+axd+bcx+bd
;c flag is c
;
;x=2^16
;a,c are 0 or 1
; If a = 1, add c to A (A is the overflow thing), add (word32_2)+(word32_2+2) to DE
; If c = 1, add b to DE
    ld a,c
    ld bc,(word32_1)
    ld hl,(word32_1+2)
    add hl,bc
    add hl,de
    ex de,hl
    ld c,a
    ld a,0
    adc a,c
Addmore_1:
    rr c
    jr nc,label_1
;if bit 7 is set, A =2, else A=0
    ld bc,(word32_2)
    ld hl,(word32_2+2)
    add hl,bc
    add hl,de
    ex de,hl
    adc a,0
label_1:
    pop hl
;ADEHL - (outp) - (outp+4)
    ld bc,(outp)
;    or a
    sbc hl,bc
    ex de,hl                ;HLDE
    ld bc,(outp+2)
    sbc hl,bc \ sbc a,0
    ex de,hl                ;DEHL
    ld bc,(outp+4)
    sbc hl,bc
    ex de,hl                ;HLDE
    ld bc,(outp+6)
    sbc hl,bc \ sbc a,0
    ex de,hl
;DEHL + (outp+2)
    ld bc,(outp+2)
    add hl,bc
    ld (outp+2),hl
    ex de,hl                ;HLDE
    ld bc,(outp+4)
    ld de,(outp+6)
    adc hl,bc
    ld (outp+4),hl
    ld h,0 \ ld l,a
    adc hl,de
    ld (outp+6),hl
    ret
.echo " Karatsuba32:",$-KaratsubaMul32

BCM:
BC_Times_DE:
;BHLA is the result
;<700 t-states worst case
;508 t-states best case
;Dispatches to shorter routines when a byte of either operand is zero.
;The "jr nc,$+5" on the first bit of the 8x16 loops also jumps over the
;following "add hl,hl": HL is still 0 and the carry is clear there.
    ld a,d
    or a
    ld a,e
    ld hl,0
    jp z,BC_Times_E         ;D = 0
    or a
    jr z,BC_Times_D         ;E = 0
    ld a,b
    or a \ jr z,C_Times_DE
    ld b,h                  ;B = 0 for the "adc a,b" carries
;AHL = B*DE
    add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,b
C_Times_DE:
;B:HL (after "ld b,a") = B*DE from above; add C*DE one byte lower.
    ld b,a
    ld a,c
    or a \ ret z
    push hl
    ld hl,0
    ld c,h
    add a,a \ jr nc,$+5 \ ld h,d \ ld l,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,de \ adc a,c
    pop de
    ld c,a \ ld a,l \ ld l,h \ ld h,c
    add hl,de \ ret nc
    inc b \ ret

BC_Times_D:
;E = 0 (and A = 0) on entry
    or b \ jr z,C_Times_D
    ld a,c \ or a \ jr z,B_Times_D
    ld a,d
    add a,a \ jr nc,$+5 \ ld h,b \ ld l,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,e
    ld b,a \ xor a \ ret

B_Times_D:
;C = 0, E = 0 and A = 0 on entry
    ld h,b \ ld l,c \ ld e,d \ ld d,l
    sla h \ jr nc,$+3 \ ld l,e
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    ld b,h \ ld h,l \ ld l,a \ ret

C_Times_D:
;B = 0, E = 0 and A = 0 on entry
    or c \ ret z
    ld a,d
    ld e,c
    ld d,b
    jp B_Times_E

BC_Times_E:
;D = 0 and A = E on entry
    or a \ jr nz,$+4 \ ld b,h \ ret
    ld a,c \ or a \ ld a,b \ jr z,B_Times_E
    or a \ jr z,C_Times_E
;Actually BC_Times_E
    ld a,e
    add a,a \ jr nc,$+5 \ ld h,b \ ld l,c
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    add hl,hl \ rla \ jr nc,$+4 \ add hl,bc \ adc a,d
    ld b,a \ ld a,l \ ld l,h \ ld h,b \ ld b,d \ ret

B_Times_E:
;HL = A*E with D = 0 and HL = 0 on entry; returns A = 0, B = 0
    add a,a \ ld h,a \ jr nc,$+3 \ ld l,e
    xor a \ ld b,a
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ ret nc \ add hl,de \ ret

C_Times_E:
;B = 0, D = 0 and HL = 0 on entry
    ld h,c \ sla h \ jr nc,$+3 \ ld l,e
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    add hl,hl \ jr nc,$+3 \ add hl,de
    ld a,l \ ld l,h \ ld h,b \ ret
.echo " Base Mult :",$-BCM
#ENDIF