diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c index 1e9421bee2..29293bad55 100644 --- a/crypto/bn/bn_ppc.c +++ b/crypto/bn/bn_ppc.c @@ -41,12 +41,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, */ #if defined(_ARCH_PPC64) && !defined(__ILP32__) + /* Minerva side-channel fix danny */ +# if defined(USE_FIXED_N6) if (num == 6) { if (OPENSSL_ppccap_P & PPC_MADD300) return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); else return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); } +# endif #endif return bn_mul_mont_int(rp, ap, bp, np, n0, num); diff --git a/crypto/ec/asm/ecp_nistp384-ppc64.pl b/crypto/ec/asm/ecp_nistp384-ppc64.pl index 28f4168e52..b663bddfc6 100755 --- a/crypto/ec/asm/ecp_nistp384-ppc64.pl +++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl @@ -7,13 +7,15 @@ # https://www.openssl.org/source/license.html # # ==================================================================== -# Written by Rohan McLure for the OpenSSL -# project. +# Written by Danny Tsen # for the OpenSSL project. +# +# Copyright 2025- IBM Corp. # ==================================================================== # -# p384 lower-level primitives for PPC64 using vector instructions. +# p384 lower-level primitives for PPC64. # + use strict; use warnings; @@ -21,7 +23,7 @@ my $flavour = shift; my $output = ""; while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} if (!$output) { - $output = "-"; + $output = "-"; } my ($xlate, $dir); @@ -35,272 +37,1496 @@ open OUT,"| \"$^X\" $xlate $flavour $output"; my $code = ""; -my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); - -my $vzero = "v32"; - -sub startproc($) -{ - my ($name) = @_; - - $code.=<<___; - .globl ${name} - .align 5 -${name}: - -___ -} - -sub endproc($) -{ - my ($name) = @_; - - $code.=<<___; - blr - .size ${name},.-${name} - -___ -} - -sub load_vrs($$) -{ - my ($pointer, $reg_list) = @_; - - for (my $i = 0; $i <= 6; $i++) { - my $offset = $i * 8; - $code.=<<___; - lxsd $reg_list->[$i],$offset($pointer) -___ - } - - $code.=<<___; - -___ -} - -sub store_vrs($$) -{ - my ($pointer, $reg_list) = @_; - - for (my $i = 0; $i <= 12; $i++) { - my $offset = $i * 16; - $code.=<<___; - stxv $reg_list->[$i],$offset($pointer) -___ - } - - $code.=<<___; - -___ -} - $code.=<<___; -.machine "any" +.machine "any" .text +.globl p384_felem_mul +.type p384_felem_mul,\@function +.align 4 +p384_felem_mul: + + stdu 1, -176(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + + bl _p384_felem_mul_core + + mtlr 0 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + addi 1, 1, 176 + blr +.size p384_felem_mul,.-p384_felem_mul + +.globl p384_felem_square +.type p384_felem_square,\@function +.align 4 +p384_felem_square: + + stdu 1, -176(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + + bl _p384_felem_square_core + + mtlr 0 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + addi 1, 1, 176 + blr +.size p384_felem_square,.-p384_felem_square + +# +# Felem mul core function - +# r3, r4 and r5 need to pre-loaded. 
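+# Computes the 13-limb wide product (128 bits per limb) of two 7x56-bit
+# felems: out[k] = sum of in1[i]*in2[j] over i+j == k.  Each vmsumudm
+# folds two 64x64->128 partial products into a 128-bit accumulator.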
+# +.type _p384_felem_mul_core,\@function +.align 4 +_p384_felem_mul_core: + + ld 6,0(4) + ld 14,0(5) + ld 7,8(4) + ld 15,8(5) + ld 8,16(4) + ld 16,16(5) + ld 9,24(4) + ld 17,24(5) + ld 10,32(4) + ld 18,32(5) + ld 11,40(4) + ld 19,40(5) + ld 12,48(4) + ld 20,48(5) + + # out0 + mulld 21, 14, 6 + mulhdu 22, 14, 6 + std 21, 0(3) + std 22, 8(3) + + vxor 0, 0, 0 + + # out1 + mtvsrdd 32+13, 14, 6 + mtvsrdd 32+14, 7, 15 + vmsumudm 1, 13, 14, 0 + + # out2 + mtvsrdd 32+15, 15, 6 + mtvsrdd 32+16, 7, 16 + mtvsrdd 32+17, 0, 8 + mtvsrdd 32+18, 0, 14 + vmsumudm 19, 15, 16, 0 + vmsumudm 2, 17, 18, 19 + + # out3 + mtvsrdd 32+13, 16, 6 + mtvsrdd 32+14, 7, 17 + mtvsrdd 32+15, 14, 8 + mtvsrdd 32+16, 9, 15 + vmsumudm 19, 13, 14, 0 + vmsumudm 3, 15, 16, 19 + + # out4 + mtvsrdd 32+13, 17, 6 + mtvsrdd 32+14, 7, 18 + mtvsrdd 32+15, 15, 8 + mtvsrdd 32+16, 9, 16 + mtvsrdd 32+17, 0, 10 + mtvsrdd 32+18, 0, 14 + vmsumudm 19, 13, 14, 0 + vmsumudm 4, 15, 16, 19 + vmsumudm 4, 17, 18, 4 + + # out5 + mtvsrdd 32+13, 18, 6 + mtvsrdd 32+14, 7, 19 + mtvsrdd 32+15, 16, 8 + mtvsrdd 32+16, 9, 17 + mtvsrdd 32+17, 14, 10 + mtvsrdd 32+18, 11, 15 + vmsumudm 19, 13, 14, 0 + vmsumudm 5, 15, 16, 19 + vmsumudm 5, 17, 18, 5 + + stxv 32+1, 16(3) + stxv 32+2, 32(3) + stxv 32+3, 48(3) + stxv 32+4, 64(3) + stxv 32+5, 80(3) + + # out6 + mtvsrdd 32+13, 19, 6 + mtvsrdd 32+14, 7, 20 + mtvsrdd 32+15, 17, 8 + mtvsrdd 32+16, 9, 18 + mtvsrdd 32+17, 15, 10 + mtvsrdd 32+18, 11, 16 + vmsumudm 19, 13, 14, 0 + vmsumudm 6, 15, 16, 19 + mtvsrdd 32+13, 0, 12 + mtvsrdd 32+14, 0, 14 + vmsumudm 19, 17, 18, 6 + vmsumudm 6, 13, 14, 19 + + # out7 + mtvsrdd 32+13, 19, 7 + mtvsrdd 32+14, 8, 20 + mtvsrdd 32+15, 17, 9 + mtvsrdd 32+16, 10, 18 + mtvsrdd 32+17, 15, 11 + mtvsrdd 32+18, 12, 16 + vmsumudm 19, 13, 14, 0 + vmsumudm 7, 15, 16, 19 + vmsumudm 7, 17, 18, 7 + + # out8 + mtvsrdd 32+13, 19, 8 + mtvsrdd 32+14, 9, 20 + mtvsrdd 32+15, 17, 10 + mtvsrdd 32+16, 11, 18 + mtvsrdd 32+17, 0, 12 + mtvsrdd 32+18, 0, 16 + vmsumudm 19, 13, 14, 0 + vmsumudm 8, 15, 16, 19 + vmsumudm 8, 17, 18, 8 + + # out9 + mtvsrdd 32+13, 19, 9 + mtvsrdd 32+14, 10, 20 + mtvsrdd 32+15, 17, 11 + mtvsrdd 32+16, 12, 18 + vmsumudm 19, 13, 14, 0 + vmsumudm 9, 15, 16, 19 + + # out10 + mtvsrdd 32+13, 19, 10 + mtvsrdd 32+14, 11, 20 + mtvsrdd 32+15, 0, 12 + mtvsrdd 32+16, 0, 18 + vmsumudm 19, 13, 14, 0 + vmsumudm 10, 15, 16, 19 + + # out11 + mtvsrdd 32+17, 19, 11 + mtvsrdd 32+18, 12, 20 + vmsumudm 11, 17, 18, 0 + + stxv 32+6, 96(3) + stxv 32+7, 112(3) + stxv 32+8, 128(3) + stxv 32+9, 144(3) + stxv 32+10, 160(3) + stxv 32+11, 176(3) + + # out12 + mulld 21, 20, 12 + mulhdu 22, 20, 12 # out12 + + std 21, 192(3) + std 22, 200(3) + + blr +.size _p384_felem_mul_core,.-_p384_felem_mul_core + +# +# Felem square core function - +# r3 and r4 need to pre-loaded. 
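+# Same limb schedule as the mul core, specialised for squaring: cross
+# terms in[i]*in[j] (i != j) appear twice, so one operand is doubled up
+# front (e.g. "add 14, 6, 6") before being fed to vmsumudm.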
+# +.type _p384_felem_square_core,\@function +.align 4 +_p384_felem_square_core: + + ld 6, 0(4) + ld 7, 8(4) + ld 8, 16(4) + ld 9, 24(4) + ld 10, 32(4) + ld 11, 40(4) + ld 12, 48(4) + + vxor 0, 0, 0 + + # out0 + mulld 14, 6, 6 + mulhdu 15, 6, 6 + std 14, 0(3) + std 15, 8(3) + + # out1 + add 14, 6, 6 + mtvsrdd 32+13, 0, 14 + mtvsrdd 32+14, 0, 7 + vmsumudm 1, 13, 14, 0 + + # out2 + mtvsrdd 32+15, 7, 14 + mtvsrdd 32+16, 7, 8 + vmsumudm 2, 15, 16, 0 + + # out3 + add 15, 7, 7 + mtvsrdd 32+13, 8, 14 + mtvsrdd 32+14, 15, 9 + vmsumudm 3, 13, 14, 0 + + # out4 + mtvsrdd 32+13, 9, 14 + mtvsrdd 32+14, 15, 10 + mtvsrdd 32+15, 0, 8 + vmsumudm 4, 13, 14, 0 + vmsumudm 4, 15, 15, 4 + + # out5 + mtvsrdd 32+13, 10, 14 + mtvsrdd 32+14, 15, 11 + add 16, 8, 8 + mtvsrdd 32+15, 0, 16 + mtvsrdd 32+16, 0, 9 + vmsumudm 5, 13, 14, 0 + vmsumudm 5, 15, 16, 5 + + stxv 32+1, 16(3) + stxv 32+2, 32(3) + stxv 32+3, 48(3) + stxv 32+4, 64(3) + + # out6 + mtvsrdd 32+13, 11, 14 + mtvsrdd 32+14, 15, 12 + mtvsrdd 32+15, 9, 16 + mtvsrdd 32+16, 9, 10 + stxv 32+5, 80(3) + vmsumudm 19, 13, 14, 0 + vmsumudm 6, 15, 16, 19 + + # out7 + add 17, 9, 9 + mtvsrdd 32+13, 11, 15 + mtvsrdd 32+14, 16, 12 + mtvsrdd 32+15, 0, 17 + mtvsrdd 32+16, 0, 10 + vmsumudm 19, 13, 14, 0 + vmsumudm 7, 15, 16, 19 + + # out8 + mtvsrdd 32+13, 11, 16 + mtvsrdd 32+14, 17, 12 + mtvsrdd 32+15, 0, 10 + vmsumudm 19, 13, 14, 0 + vmsumudm 8, 15, 15, 19 + + # out9 + add 14, 10, 10 + mtvsrdd 32+13, 11, 17 + mtvsrdd 32+14, 14, 12 + vmsumudm 9, 13, 14, 0 + + # out10 + mtvsrdd 32+13, 11, 14 + mtvsrdd 32+14, 11, 12 + vmsumudm 10, 13, 14, 0 + + stxv 32+6, 96(3) + stxv 32+7, 112(3) + + # out11 + #add 14, 11, 11 + #mtvsrdd 32+13, 0, 14 + #mtvsrdd 32+14, 0, 12 + #vmsumudm 11, 13, 14, 0 + + mulld 6, 12, 11 + mulhdu 7, 12, 11 + addc 8, 6, 6 + adde 9, 7, 7 + + stxv 32+8, 128(3) + stxv 32+9, 144(3) + stxv 32+10, 160(3) + #stxv 32+11, 176(3) + + # out12 + mulld 14, 12, 12 + mulhdu 15, 12, 12 + + std 8, 176(3) + std 9, 184(3) + std 14, 192(3) + std 15, 200(3) + + blr +.size _p384_felem_square_core,.-_p384_felem_square_core + +# +# widefelem (128 bits) * 8 +# +.macro F128_X_8 _off1 _off2 + ld 9,\\_off1(3) + ld 8,\\_off2(3) + srdi 10,9,61 + rldimi 10,8,3,0 + sldi 9,9,3 + std 9,\\_off1(3) + std 10,\\_off2(3) +.endm + +.globl p384_felem128_mul_by_8 +.type p384_felem128_mul_by_8, \@function +.align 4 +p384_felem128_mul_by_8: + + F128_X_8 0, 8 + + F128_X_8 16, 24 + + F128_X_8 32, 40 + + F128_X_8 48, 56 + + F128_X_8 64, 72 + + F128_X_8 80, 88 + + F128_X_8 96, 104 + + F128_X_8 112, 120 + + F128_X_8 128, 136 + + F128_X_8 144, 152 + + F128_X_8 160, 168 + + F128_X_8 176, 184 + + F128_X_8 192, 200 + + blr +.size p384_felem128_mul_by_8,.-p384_felem128_mul_by_8 + +# +# widefelem (128 bits) * 2 +# +.macro F128_X_2 _off1 _off2 + ld 9,\\_off1(3) + ld 8,\\_off2(3) + srdi 10,9,63 + rldimi 10,8,1,0 + sldi 9,9,1 + std 9,\\_off1(3) + std 10,\\_off2(3) +.endm + +.globl p384_felem128_mul_by_2 +.type p384_felem128_mul_by_2, \@function +.align 4 +p384_felem128_mul_by_2: + + F128_X_2 0, 8 + + F128_X_2 16, 24 + + F128_X_2 32, 40 + + F128_X_2 48, 56 + + F128_X_2 64, 72 + + F128_X_2 80, 88 + + F128_X_2 96, 104 + + F128_X_2 112, 120 + + F128_X_2 128, 136 + + F128_X_2 144, 152 + + F128_X_2 160, 168 + + F128_X_2 176, 184 + + F128_X_2 192, 200 + + blr +.size p384_felem128_mul_by_2,.-p384_felem128_mul_by_2 + +.globl p384_felem_diff128 +.type p384_felem_diff128, \@function +.align 4 +p384_felem_diff128: + + addis 5, 2, .LConst_two127\@toc\@ha + addi 5, 5, .LConst_two127\@toc\@l + + ld 10, 0(3) + ld 8, 8(3) + li 9, 0 + 
addc 10, 10, 9 + li 7, -1 + rldicr 7, 7, 0, 0 # two127 + adde 8, 8, 7 + ld 11, 0(4) + ld 12, 8(4) + subfc 11, 11, 10 + subfe 12, 12, 8 + std 11, 0(3) # out0 + std 12, 8(3) + + # two127m71 = (r10, r9) + ld 8, 16(3) + ld 7, 24(3) + ld 10, 24(5) # two127m71 + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 16(4) + ld 12, 24(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 16(3) # out1 + std 12, 24(3) + + ld 8, 32(3) + ld 7, 40(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 32(4) + ld 12, 40(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 32(3) # out2 + std 12, 40(3) + + ld 8, 48(3) + ld 7, 56(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 48(4) + ld 12, 56(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 48(3) # out3 + std 12, 56(3) + + ld 8, 64(3) + ld 7, 72(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 64(4) + ld 12, 72(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 64(3) # out4 + std 12, 72(3) + + ld 8, 80(3) + ld 7, 88(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 80(4) + ld 12, 88(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 80(3) # out5 + std 12, 88(3) + + ld 8, 96(3) + ld 7, 104(3) + ld 6, 40(5) # two127p111m79m71 + addc 8, 8, 9 + adde 7, 7, 6 + ld 11, 96(4) + ld 12, 104(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 96(3) # out6 + std 12, 104(3) + + ld 8, 112(3) + ld 7, 120(3) + ld 6, 56(5) # two127m119m71 + addc 8, 8, 9 + adde 7, 7, 6 + ld 11, 112(4) + ld 12, 120(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 112(3) # out7 + std 12, 120(3) + + ld 8, 128(3) + ld 7, 136(3) + ld 6, 72(5) # two127m95m71 + addc 8, 8, 9 + adde 7, 7, 6 + ld 11, 128(4) + ld 12, 136(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 128(3) # out8 + std 12, 136(3) + + ld 8, 144(3) + ld 7, 152(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 144(4) + ld 12, 152(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 144(3) # out9 + std 12, 152(3) + + ld 8, 160(3) + ld 7, 168(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 160(4) + ld 12, 168(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 160(3) # out10 + std 12, 168(3) + + ld 8, 176(3) + ld 7, 184(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 176(4) + ld 12, 184(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 176(3) # out11 + std 12, 184(3) + + ld 8, 192(3) + ld 7, 200(3) + addc 8, 8, 9 + adde 7, 7, 10 + ld 11, 192(4) + ld 12, 200(4) + subfc 11, 11, 8 + subfe 12, 12, 7 + std 11, 192(3) # out12 + std 12, 200(3) + + blr +.size p384_felem_diff128,.-p384_felem_diff128 + +.data +.align 4 +.LConst_two127: +#two127 +.long 0x00000000, 0x00000000, 0x00000000, 0x80000000 +#two127m71 +.long 0x00000000, 0x00000000, 0xffffff80, 0x7fffffff +#two127p111m79m71 +.long 0x00000000, 0x00000000, 0xffff7f80, 0x80007fff +#two127m119m71 +.long 0x00000000, 0x00000000, 0xffffff80, 0x7f7fffff +#two127m95m71 +.long 0x00000000, 0x00000000, 0x7fffff80, 0x7fffffff + +.text + +.globl p384_felem_diff_128_64 +.type p384_felem_diff_128_64, \@function +.align 4 +p384_felem_diff_128_64: + addis 5, 2, .LConst_128_two64\@toc\@ha + addi 5, 5, .LConst_128_two64\@toc\@l + + ld 9, 0(3) + ld 10, 8(3) + ld 8, 48(5) # two64p48m16 + li 7, 0 + addc 9, 9, 8 + li 6, 1 + adde 10, 10, 6 + ld 11, 0(4) + subfc 8, 11, 9 + subfe 12, 7, 10 + std 8, 0(3) # out0 + std 12, 8(3) + + ld 9, 16(3) + ld 10, 24(3) + ld 8, 0(5) # two64m56m8 + addc 9, 9, 8 + addze 10, 10 + ld 11, 8(4) + subfc 11, 11, 9 + subfe 12, 7, 10 + std 11, 16(3) # out1 + std 12, 24(3) + + ld 9, 32(3) + ld 10, 40(3) + ld 8, 16(5) # two64m32m8 + addc 9, 9, 8 + addze 10, 10 + ld 11, 16(4) + subfc 11, 11, 9 + subfe 12, 7, 10 + std 11, 32(3) # out2 + std 12, 40(3) + + ld 10, 48(3) + ld 8, 
56(3) + #ld 9, 32(5) # two64m8 + li 9, -256 # two64m8 + addc 10, 10, 9 + addze 8, 8 + ld 11, 24(4) + subfc 11, 11, 10 + subfe 12, 7, 8 + std 11, 48(3) # out3 + std 12, 56(3) + + ld 10, 64(3) + ld 8, 72(3) + addc 10, 10, 9 + addze 8, 8 + ld 11, 32(4) + subfc 11, 11, 10 + subfe 12, 7, 8 + std 11, 64(3) # out4 + std 12, 72(3) + + ld 10, 80(3) + ld 8, 88(3) + addc 10, 10, 9 + addze 8, 8 + ld 11, 40(4) + subfc 11, 11, 10 + subfe 12, 7, 8 + std 11, 80(3) # out5 + std 12, 88(3) + + ld 10, 96(3) + ld 8, 104(3) + addc 10, 10, 9 + addze 9, 8 + ld 11, 48(4) + subfc 11, 11, 10 + subfe 12, 7, 9 + std 11, 96(3) # out6 + std 12, 104(3) + + blr +.size p384_felem_diff_128_64,.-p384_felem_diff_128_64 + +.data +.align 4 +.LConst_128_two64: +#two64m56m8 +.long 0xffffff00, 0xfeffffff, 0x00000000, 0x00000000 +#two64m32m8 +.long 0xffffff00, 0xfffffffe, 0x00000000, 0x00000000 +#two64m8 +.long 0xffffff00, 0xffffffff, 0x00000000, 0x00000000 +#two64p48m16 +.long 0xffff0000, 0x0000ffff, 0x00000001, 0x00000000 + +.LConst_two60: +#two60m52m4 +.long 0xfffffff0, 0x0fefffff, 0x0, 0x0 +#two60p44m12 +.long 0xfffff000, 0x10000fff, 0x0, 0x0 +#two60m28m4 +.long 0xeffffff0, 0x0fffffff, 0x0, 0x0 +#two60m4 +.long 0xfffffff0, 0x0fffffff, 0x0, 0x0 + +.text +# +# static void felem_diff64(felem out, const felem in) +# +.globl p384_felem_diff64 +.type p384_felem_diff64, \@function +.align 4 +p384_felem_diff64: + addis 5, 2, .LConst_two60\@toc\@ha + addi 5, 5, .LConst_two60\@toc\@l + + ld 9, 0(3) + ld 8, 16(5) # two60p44m12 + li 7, 0 + add 9, 9, 8 + ld 11, 0(4) + subf 8, 11, 9 + std 8, 0(3) # out0 + + ld 9, 8(3) + ld 8, 0(5) # two60m52m4 + add 9, 9, 8 + ld 11, 8(4) + subf 11, 11, 9 + std 11, 8(3) # out1 + + ld 9, 16(3) + ld 8, 32(5) # two60m28m4 + add 9, 9, 8 + ld 11, 16(4) + subf 11, 11, 9 + std 11, 16(3) # out2 + + ld 10, 24(3) + ld 9, 48(5) # two60m4 + add 10, 10, 9 + ld 12, 24(4) + subf 12, 12, 10 + std 12, 24(3) # out3 + + ld 10, 32(3) + add 10, 10, 9 + ld 11, 32(4) + subf 11, 11, 10 + std 11, 32(3) # out4 + + ld 10, 40(3) + add 10, 10, 9 + ld 12, 40(4) + subf 12, 12, 10 + std 12, 40(3) # out5 + + ld 10, 48(3) + add 10, 10, 9 + ld 11, 48(4) + subf 11, 11, 10 + std 11, 48(3) # out6 + + blr +.size p384_felem_diff64,.-p384_felem_diff64 + +.text +# +# Shift 128 bits right +# +.macro SHR o_h o_l in_h in_l nbits + srdi \\o_l, \\in_l, \\nbits # shift lower right + rldimi \\o_l, \\in_h, 64-\\nbits, 0 # insert <64-nbits> from hi + srdi \\o_h, \\in_h, \\nbits # shift higher right +.endm + +# +# static void felem_reduce(felem out, const widefelem in) +# +.global p384_felem_reduce +.type p384_felem_reduce,\@function +.align 4 +p384_felem_reduce: + + stdu 1, -208(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + std 23, 128(1) + std 24, 136(1) + std 25, 144(1) + std 26, 152(1) + std 27, 160(1) + std 28, 168(1) + std 29, 176(1) + std 30, 184(1) + std 31, 192(1) + + bl _p384_felem_reduce_core + + mtlr 0 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + ld 23, 128(1) + ld 24, 136(1) + ld 25, 144(1) + ld 26, 152(1) + ld 27, 160(1) + ld 28, 168(1) + ld 29, 176(1) + ld 30, 184(1) + ld 31, 192(1) + addi 1, 1, 208 + blr +.size p384_felem_reduce,.-p384_felem_reduce + +# +# Felem reduction core function - +# r3 and r4 need to pre-loaded. 
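+# Folds a 13-limb widefelem back to 7 limbs modulo
+# p = 2^384 - 2^128 - 2^96 + 2^32 - 1, following the same steps as the C
+# felem_reduce_ref(): multiples of p (the .LConst values) are added first
+# so the subtractions that eliminate the high limbs cannot underflow.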
+# +.type _p384_felem_reduce_core,\@function +.align 4 +_p384_felem_reduce_core: + addis 12, 2, .LConst\@toc\@ha + addi 12, 12, .LConst\@toc\@l + + # load constat p + ld 11, 8(12) # hi - two124m68 + + # acc[6] = in[6] + two124m68; + ld 26, 96(4) # in[6].l + ld 27, 96+8(4) # in[6].h + add 27, 27, 11 + + # acc[5] = in[5] + two124m68; + ld 24, 80(4) # in[5].l + ld 25, 80+8(4) # in[5].h + add 25, 25, 11 + + # acc[4] = in[4] + two124m68; + ld 22, 64(4) # in[4].l + ld 23, 64+8(4) # in[4].h + add 23, 23, 11 + + # acc[3] = in[3] + two124m68; + ld 20, 48(4) # in[3].l + ld 21, 48+8(4) # in[3].h + add 21, 21, 11 + + ld 11, 48+8(12) # hi - two124m92m68 + + # acc[2] = in[2] + two124m92m68; + ld 18, 32(4) # in[2].l + ld 19, 32+8(4) # in[2].h + add 19, 19, 11 + + ld 11, 16+8(12) # high - two124m116m68 + + # acc[1] = in[1] + two124m116m68; + ld 16, 16(4) # in[1].l + ld 17, 16+8(4) # in[1].h + add 17, 17, 11 + + ld 11, 32+8(12) # high - two124p108m76 + + # acc[0] = in[0] + two124p108m76; + ld 14, 0(4) # in[0].l + ld 15, 0+8(4) # in[0].h + add 15, 15, 11 + + # compute mask + li 7, -1 + + # Eliminate in[12] + + # acc[8] += in[12] >> 32; + ld 5, 192(4) # in[12].l + ld 6, 192+8(4) # in[12].h + SHR 9, 10, 6, 5, 32 + ld 30, 128(4) # in[8].l + ld 31, 136(4) # in[8].h + addc 30, 30, 10 + adde 31, 31, 9 + + # acc[7] += (in[12] & 0xffffffff) << 24; + srdi 11, 7, 32 # 0xffffffff + and 11, 11, 5 + sldi 11, 11, 24 # << 24 + ld 28, 112(4) # in[7].l + ld 29, 120(4) # in[7].h + addc 28, 28, 11 + addze 29, 29 + + # acc[7] += in[12] >> 8; + SHR 9, 10, 6, 5, 8 + addc 28, 28, 10 + adde 29, 29, 9 + + # acc[6] += (in[12] & 0xff) << 48; + andi. 11, 5, 0xff + sldi 11, 11, 48 + addc 26, 26, 11 + addze 27, 27 + + # acc[6] -= in[12] >> 16; + SHR 9, 10, 6, 5, 16 + subfc 26, 10, 26 + subfe 27, 9, 27 + + # acc[5] -= (in[12] & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 11, 11, 5 + sldi 11, 11, 40 # << 40 + li 9, 0 + subfc 24, 11, 24 + subfe 25, 9, 25 + + # acc[6] += in[12] >> 48; + SHR 9, 10, 6, 5, 48 + addc 26, 26, 10 + adde 27, 27, 9 + + # acc[5] += (in[12] & 0xffffffffffff) << 8; + srdi 11, 7, 16 # 0xffffffffffff + and 11, 11, 5 + sldi 11, 11, 8 # << 8 + addc 24, 24, 11 + addze 25, 25 + + # Eliminate in[11] + + # acc[7] += in[11] >> 32; + ld 5, 176(4) # in[11].l + ld 6, 176+8(4) # in[11].h + SHR 9, 10, 6, 5, 32 + addc 28, 28, 10 + adde 29, 29, 9 + + # acc[6] += (in[11] & 0xffffffff) << 24; + srdi 11, 7, 32 # 0xffffffff + and 11, 11, 5 + sldi 11, 11, 24 # << 24 + addc 26, 26, 11 + addze 27, 27 + + # acc[6] += in[11] >> 8; + SHR 9, 10, 6, 5, 8 + addc 26, 26, 10 + adde 27, 27, 9 + + # acc[5] += (in[11] & 0xff) << 48; + andi. 
11, 5, 0xff + sldi 11, 11, 48 + addc 24, 24, 11 + addze 25, 25 + + # acc[5] -= in[11] >> 16; + SHR 9, 10, 6, 5, 16 + subfc 24, 10, 24 + subfe 25, 9, 25 + + # acc[4] -= (in[11] & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 11, 11, 5 + sldi 11, 11, 40 # << 40 + li 9, 0 + subfc 22, 11, 22 + subfe 23, 9, 23 + + # acc[5] += in[11] >> 48; + SHR 9, 10, 6, 5, 48 + addc 24, 24, 10 + adde 25, 25, 9 + + # acc[4] += (in[11] & 0xffffffffffff) << 8; + srdi 11, 7, 16 # 0xffffffffffff + and 11, 11, 5 + sldi 11, 11, 8 # << 8 + addc 22, 22, 11 + addze 23, 23 + + # Eliminate in[10] + + # acc[6] += in[10] >> 32; + ld 5, 160(4) # in[10].l + ld 6, 160+8(4) # in[10].h + SHR 9, 10, 6, 5, 32 + addc 26, 26, 10 + adde 27, 27, 9 + + # acc[5] += (in[10] & 0xffffffff) << 24; + srdi 11, 7, 32 # 0xffffffff + and 11, 11, 5 + sldi 11, 11, 24 # << 24 + addc 24, 24, 11 + addze 25, 25 + + # acc[5] += in[10] >> 8; + SHR 9, 10, 6, 5, 8 + addc 24, 24, 10 + adde 25, 25, 9 + + # acc[4] += (in[10] & 0xff) << 48; + andi. 11, 5, 0xff + sldi 11, 11, 48 + addc 22, 22, 11 + addze 23, 23 + + # acc[4] -= in[10] >> 16; + SHR 9, 10, 6, 5, 16 + subfc 22, 10, 22 + subfe 23, 9, 23 + + # acc[3] -= (in[10] & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 11, 11, 5 + sldi 11, 11, 40 # << 40 + li 9, 0 + subfc 20, 11, 20 + subfe 21, 9, 21 + + # acc[4] += in[10] >> 48; + SHR 9, 10, 6, 5, 48 + addc 22, 22, 10 + adde 23, 23, 9 + + # acc[3] += (in[10] & 0xffffffffffff) << 8; + srdi 11, 7, 16 # 0xffffffffffff + and 11, 11, 5 + sldi 11, 11, 8 # << 8 + addc 20, 20, 11 + addze 21, 21 + + # Eliminate in[9] + + # acc[5] += in[9] >> 32; + ld 5, 144(4) # in[9].l + ld 6, 144+8(4) # in[9].h + SHR 9, 10, 6, 5, 32 + addc 24, 24, 10 + adde 25, 25, 9 + + # acc[4] += (in[9] & 0xffffffff) << 24; + srdi 11, 7, 32 # 0xffffffff + and 11, 11, 5 + sldi 11, 11, 24 # << 24 + addc 22, 22, 11 + addze 23, 23 + + # acc[4] += in[9] >> 8; + SHR 9, 10, 6, 5, 8 + addc 22, 22, 10 + adde 23, 23, 9 + + # acc[3] += (in[9] & 0xff) << 48; + andi. 11, 5, 0xff + sldi 11, 11, 48 + addc 20, 20, 11 + addze 21, 21 + + # acc[3] -= in[9] >> 16; + SHR 9, 10, 6, 5, 16 + subfc 20, 10, 20 + subfe 21, 9, 21 + + # acc[2] -= (in[9] & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 11, 11, 5 + sldi 11, 11, 40 # << 40 + li 9, 0 + subfc 18, 11, 18 + subfe 19, 9, 19 + + # acc[3] += in[9] >> 48; + SHR 9, 10, 6, 5, 48 + addc 20, 20, 10 + adde 21, 21, 9 + + # acc[2] += (in[9] & 0xffffffffffff) << 8; + srdi 11, 7, 16 # 0xffffffffffff + and 11, 11, 5 + sldi 11, 11, 8 # << 8 + addc 18, 18, 11 + addze 19, 19 + + # Eliminate acc[8] + + # acc[4] += acc[8] >> 32; + mr 5, 30 # acc[8].l + mr 6, 31 # acc[8].h + SHR 9, 10, 6, 5, 32 + addc 22, 22, 10 + adde 23, 23, 9 + + # acc[3] += (acc[8] & 0xffffffff) << 24; + srdi 11, 7, 32 # 0xffffffff + and 11, 11, 5 + sldi 11, 11, 24 # << 24 + addc 20, 20, 11 + addze 21, 21 + + # acc[3] += acc[8] >> 8; + SHR 9, 10, 6, 5, 8 + addc 20, 20, 10 + adde 21, 21, 9 + + # acc[2] += (acc[8] & 0xff) << 48; + andi. 
11, 5, 0xff + sldi 11, 11, 48 + addc 18, 18, 11 + addze 19, 19 + + # acc[2] -= acc[8] >> 16; + SHR 9, 10, 6, 5, 16 + subfc 18, 10, 18 + subfe 19, 9, 19 + + # acc[1] -= (acc[8] & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 11, 11, 5 + sldi 11, 11, 40 # << 40 + li 9, 0 + subfc 16, 11, 16 + subfe 17, 9, 17 + + #acc[2] += acc[8] >> 48; + SHR 9, 10, 6, 5, 48 + addc 18, 18, 10 + adde 19, 19, 9 + + # acc[1] += (acc[8] & 0xffffffffffff) << 8; + srdi 11, 7, 16 # 0xffffffffffff + and 11, 11, 5 + sldi 11, 11, 8 # << 8 + addc 16, 16, 11 + addze 17, 17 + + # Eliminate acc[7] + + # acc[3] += acc[7] >> 32; + mr 5, 28 # acc[7].l + mr 6, 29 # acc[7].h + SHR 9, 10, 6, 5, 32 + addc 20, 20, 10 + adde 21, 21, 9 + + # acc[2] += (acc[7] & 0xffffffff) << 24; + srdi 11, 7, 32 # 0xffffffff + and 11, 11, 5 + sldi 11, 11, 24 # << 24 + addc 18, 18, 11 + addze 19, 19 + + # acc[2] += acc[7] >> 8; + SHR 9, 10, 6, 5, 8 + addc 18, 18, 10 + adde 19, 19, 9 + + # acc[1] += (acc[7] & 0xff) << 48; + andi. 11, 5, 0xff + sldi 11, 11, 48 + addc 16, 16, 11 + addze 17, 17 + + # acc[1] -= acc[7] >> 16; + SHR 9, 10, 6, 5, 16 + subfc 16, 10, 16 + subfe 17, 9, 17 + + # acc[0] -= (acc[7] & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 11, 11, 5 + sldi 11, 11, 40 # << 40 + li 9, 0 + subfc 14, 11, 14 + subfe 15, 9, 15 + + # acc[1] += acc[7] >> 48; + SHR 9, 10, 6, 5, 48 + addc 16, 16, 10 + adde 17, 17, 9 + + # acc[0] += (acc[7] & 0xffffffffffff) << 8; + srdi 11, 7, 16 # 0xffffffffffff + and 11, 11, 5 + sldi 11, 11, 8 # << 8 + addc 14, 14, 11 + addze 15, 15 + + # + # Carry 4 -> 5 -> 6 + # + # acc[5] += acc[4] >> 56; + # acc[4] &= 0x00ffffffffffffff; + SHR 9, 10, 23, 22, 56 + addc 24, 24, 10 + adde 25, 25, 9 + srdi 11, 7, 8 # 0x00ffffffffffffff + and 22, 22, 11 + li 23, 0 + + # acc[6] += acc[5] >> 56; + # acc[5] &= 0x00ffffffffffffff; + SHR 9, 10, 25, 24, 56 + addc 26, 26, 10 + adde 27, 27, 9 + and 24, 24, 11 + li 25, 0 + + # [3]: Eliminate high bits of acc[6] */ + # temp = acc[6] >> 48; + # acc[6] &= 0x0000ffffffffffff; + SHR 31, 30, 27, 26, 48 # temp = acc[6] >> 48 + srdi 11, 7, 16 # 0x0000ffffffffffff + and 26, 26, 11 + li 27, 0 + + # temp < 2^80 + # acc[3] += temp >> 40; + SHR 9, 10, 31, 30, 40 + addc 20, 20, 10 + adde 21, 21, 9 + + # acc[2] += (temp & 0xffffffffff) << 16; + srdi 11, 7, 24 # 0xffffffffff + and 10, 30, 11 + sldi 10, 10, 16 + addc 18, 18, 10 + addze 19, 19 + + # acc[2] += temp >> 16; + SHR 9, 10, 31, 30, 16 + addc 18, 18, 10 + adde 19, 19, 9 + + # acc[1] += (temp & 0xffff) << 40; + srdi 11, 7, 48 # 0xffff + and 10, 30, 11 + sldi 10, 10, 40 + addc 16, 16, 10 + addze 17, 17 + + # acc[1] -= temp >> 24; + SHR 9, 10, 31, 30, 24 + subfc 16, 10, 16 + subfe 17, 9, 17 + + # acc[0] -= (temp & 0xffffff) << 32; + srdi 11, 7, 40 # 0xffffff + and 10, 30, 11 + sldi 10, 10, 32 + li 9, 0 + subfc 14, 10, 14 + subfe 15, 9, 15 + + # acc[0] += temp; + addc 14, 14, 30 + adde 15, 15, 31 + + # Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 + # + # acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */ + SHR 9, 10, 15, 14, 56 + addc 16, 16, 10 + adde 17, 17, 9 + + # acc[0] &= 0x00ffffffffffffff; + srdi 11, 7, 8 # 0x00ffffffffffffff + and 14, 14, 11 + li 15, 0 + + # acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */ + SHR 9, 10, 17, 16, 56 + addc 18, 18, 10 + adde 19, 19, 9 + + # acc[1] &= 0x00ffffffffffffff; + and 16, 16, 11 + li 17, 0 + + # acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */ + SHR 9, 10, 19, 18, 56 + addc 20, 20, 10 + adde 21, 21, 9 + + # acc[2] &= 0x00ffffffffffffff; + and 18, 18, 11 + li 19, 0 + + # acc[4] 
+= acc[3] >> 56; + SHR 9, 10, 21, 20, 56 + addc 22, 22, 10 + adde 23, 23, 9 + + # acc[3] &= 0x00ffffffffffffff; + and 20, 20, 11 + li 21, 0 + + # acc[5] += acc[4] >> 56; + SHR 9, 10, 23, 22, 56 + addc 24, 24, 10 + adde 25, 25, 9 + + # acc[4] &= 0x00ffffffffffffff; + and 22, 22, 11 + + # acc[6] += acc[5] >> 56; + SHR 9, 10, 25, 24, 56 + addc 26, 26, 10 + adde 27, 27, 9 + + # acc[5] &= 0x00ffffffffffffff; + and 24, 24, 11 + + std 14, 0(3) + std 16, 8(3) + std 18, 16(3) + std 20, 24(3) + std 22, 32(3) + std 24, 40(3) + std 26, 48(3) + blr +.size _p384_felem_reduce_core,.-_p384_felem_reduce_core + +.data +.align 4 +.LConst: +# two124m68: +.long 0x0, 0x0, 0xfffffff0, 0xfffffff +# two124m116m68: +.long 0x0, 0x0, 0xfffffff0, 0xfefffff +#two124p108m76: +.long 0x0, 0x0, 0xfffff000, 0x10000fff +#two124m92m68: +.long 0x0, 0x0, 0xeffffff0, 0xfffffff + +.text + +# +# void p384_felem_square_reduce(felem out, const felem in) +# +.global p384_felem_square_reduce +.type p384_felem_square_reduce,\@function +.align 4 +p384_felem_square_reduce: + stdu 1, -512(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + std 23, 128(1) + std 24, 136(1) + std 25, 144(1) + std 26, 152(1) + std 27, 160(1) + std 28, 168(1) + std 29, 176(1) + std 30, 184(1) + std 31, 192(1) + + std 3, 496(1) + addi 3, 1, 208 + bl _p384_felem_square_core + + mr 4, 3 + ld 3, 496(1) + bl _p384_felem_reduce_core + + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + ld 23, 128(1) + ld 24, 136(1) + ld 25, 144(1) + ld 26, 152(1) + ld 27, 160(1) + ld 28, 168(1) + ld 29, 176(1) + ld 30, 184(1) + ld 31, 192(1) + addi 1, 1, 512 + mtlr 0 + blr +.size p384_felem_square_reduce,.-p384_felem_square_reduce + +# +# void p384_felem_mul_reduce(felem out, const felem in1, const felem in2) +# +.global p384_felem_mul_reduce +.type p384_felem_mul_reduce,\@function +.align 5 +p384_felem_mul_reduce: + stdu 1, -512(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + std 23, 128(1) + std 24, 136(1) + std 25, 144(1) + std 26, 152(1) + std 27, 160(1) + std 28, 168(1) + std 29, 176(1) + std 30, 184(1) + std 31, 192(1) + + std 3, 496(1) + addi 3, 1, 208 + bl _p384_felem_mul_core + + mr 4, 3 + ld 3, 496(1) + bl _p384_felem_reduce_core + + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + ld 23, 128(1) + ld 24, 136(1) + ld 25, 144(1) + ld 26, 152(1) + ld 27, 160(1) + ld 28, 168(1) + ld 29, 176(1) + ld 30, 184(1) + ld 31, 192(1) + addi 1, 1, 512 + mtlr 0 + blr +.size p384_felem_mul_reduce,.-p384_felem_mul_reduce ___ -{ - # mul/square common - my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v42", "v43"); - my ($zero, $one) = ("r8", "r9"); - my $out = "v51"; - - { - # - # p384_felem_mul - # - - my ($in1p, $in2p) = ("r4", "r5"); - my @in1 = map("v$_",(44..50)); - my @in2 = map("v$_",(35..41)); - - startproc("p384_felem_mul"); - - $code.=<<___; - vspltisw $vzero,0 - -___ - - load_vrs($in1p, \@in1); - load_vrs($in2p, \@in2); - - $code.=<<___; - vmsumudm $out,$in1[0],$in2[0],$vzero - stxv $out,0($outp) - - xxpermdi $t1,$in1[0],$in1[1],0b00 - xxpermdi $t2,$in2[1],$in2[0],0b00 - vmsumudm $out,$t1,$t2,$vzero - stxv $out,16($outp) - - xxpermdi $t2,$in2[2],$in2[1],0b00 - vmsumudm $out,$t1,$t2,$vzero - 
vmsumudm $out,$in1[2],$in2[0],$out - stxv $out,32($outp) - - xxpermdi $t2,$in2[1],$in2[0],0b00 - xxpermdi $t3,$in1[2],$in1[3],0b00 - xxpermdi $t4,$in2[3],$in2[2],0b00 - vmsumudm $out,$t1,$t4,$vzero - vmsumudm $out,$t3,$t2,$out - stxv $out,48($outp) - - xxpermdi $t2,$in2[4],$in2[3],0b00 - xxpermdi $t4,$in2[2],$in2[1],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$t3,$t4,$out - vmsumudm $out,$in1[4],$in2[0],$out - stxv $out,64($outp) - - xxpermdi $t2,$in2[5],$in2[4],0b00 - xxpermdi $t4,$in2[3],$in2[2],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$t3,$t4,$out - xxpermdi $t4,$in2[1],$in2[0],0b00 - xxpermdi $t1,$in1[4],$in1[5],0b00 - vmsumudm $out,$t1,$t4,$out - stxv $out,80($outp) - - xxpermdi $t1,$in1[0],$in1[1],0b00 - xxpermdi $t2,$in2[6],$in2[5],0b00 - xxpermdi $t4,$in2[4],$in2[3],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$t3,$t4,$out - xxpermdi $t2,$in2[2],$in2[1],0b00 - xxpermdi $t1,$in1[4],$in1[5],0b00 - vmsumudm $out,$t1,$t2,$out - vmsumudm $out,$in1[6],$in2[0],$out - stxv $out,96($outp) - - xxpermdi $t1,$in1[1],$in1[2],0b00 - xxpermdi $t2,$in2[6],$in2[5],0b00 - xxpermdi $t3,$in1[3],$in1[4],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$t3,$t4,$out - xxpermdi $t3,$in2[2],$in2[1],0b00 - xxpermdi $t1,$in1[5],$in1[6],0b00 - vmsumudm $out,$t1,$t3,$out - stxv $out,112($outp) - - xxpermdi $t1,$in1[2],$in1[3],0b00 - xxpermdi $t3,$in1[4],$in1[5],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$t3,$t4,$out - vmsumudm $out,$in1[6],$in2[2],$out - stxv $out,128($outp) - - xxpermdi $t1,$in1[3],$in1[4],0b00 - vmsumudm $out,$t1,$t2,$vzero - xxpermdi $t1,$in1[5],$in1[6],0b00 - vmsumudm $out,$t1,$t4,$out - stxv $out,144($outp) - - vmsumudm $out,$t3,$t2,$vzero - vmsumudm $out,$in1[6],$in2[4],$out - stxv $out,160($outp) - - vmsumudm $out,$t1,$t2,$vzero - stxv $out,176($outp) - - vmsumudm $out,$in1[6],$in2[6],$vzero - stxv $out,192($outp) -___ - - endproc("p384_felem_mul"); - } - - { - # - # p384_felem_square - # - - my ($inp) = ("r4"); - my @in = map("v$_",(44..50)); - my @inx2 = map("v$_",(35..41)); - - startproc("p384_felem_square"); - - $code.=<<___; - vspltisw $vzero,0 - -___ - - load_vrs($inp, \@in); - - $code.=<<___; - li $zero,0 - li $one,1 - mtvsrdd $t1,$one,$zero -___ - - for (my $i = 0; $i <= 6; $i++) { - $code.=<<___; - vsld $inx2[$i],$in[$i],$t1 -___ - } - - $code.=<<___; - vmsumudm $out,$in[0],$in[0],$vzero - stxv $out,0($outp) - - vmsumudm $out,$in[0],$inx2[1],$vzero - stxv $out,16($outp) - - vmsumudm $out,$in[0],$inx2[2],$vzero - vmsumudm $out,$in[1],$in[1],$out - stxv $out,32($outp) - - xxpermdi $t1,$in[0],$in[1],0b00 - xxpermdi $t2,$inx2[3],$inx2[2],0b00 - vmsumudm $out,$t1,$t2,$vzero - stxv $out,48($outp) - - xxpermdi $t4,$inx2[4],$inx2[3],0b00 - vmsumudm $out,$t1,$t4,$vzero - vmsumudm $out,$in[2],$in[2],$out - stxv $out,64($outp) - - xxpermdi $t2,$inx2[5],$inx2[4],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$in[2],$inx2[3],$out - stxv $out,80($outp) - - xxpermdi $t2,$inx2[6],$inx2[5],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$in[2],$inx2[4],$out - vmsumudm $out,$in[3],$in[3],$out - stxv $out,96($outp) - - xxpermdi $t3,$in[1],$in[2],0b00 - vmsumudm $out,$t3,$t2,$vzero - vmsumudm $out,$in[3],$inx2[4],$out - stxv $out,112($outp) - - xxpermdi $t1,$in[2],$in[3],0b00 - vmsumudm $out,$t1,$t2,$vzero - vmsumudm $out,$in[4],$in[4],$out - stxv $out,128($outp) - - xxpermdi $t1,$in[3],$in[4],0b00 - vmsumudm $out,$t1,$t2,$vzero - stxv $out,144($outp) - - vmsumudm $out,$in[4],$inx2[6],$vzero - vmsumudm $out,$in[5],$in[5],$out - stxv $out,160($outp) - 
- vmsumudm $out,$in[5],$inx2[6],$vzero - stxv $out,176($outp) - - vmsumudm $out,$in[6],$in[6],$vzero - stxv $out,192($outp) -___ - - endproc("p384_felem_square"); - } -} - $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c index 44ac1cea3d..1bdf0e7427 100644 --- a/crypto/ec/ecp_nistp384.c +++ b/crypto/ec/ecp_nistp384.c @@ -252,6 +252,16 @@ static void felem_neg(felem out, const felem in) out[6] = two60m4 - in[6]; } +#if defined(ECP_NISTP384_ASM) +void p384_felem_diff64(felem out, const felem in); +void p384_felem_diff128(widefelem out, const widefelem in); +void p384_felem_diff_128_64(widefelem out, const felem in); + +# define felem_diff64 p384_felem_diff64 +# define felem_diff128 p384_felem_diff128 +# define felem_diff_128_64 p384_felem_diff_128_64 + +#else /*- * felem_diff64 subtracts |in| from |out| * On entry: @@ -369,6 +379,7 @@ static void felem_diff128(widefelem out, const widefelem in) for (i = 0; i < 2*NLIMBS-1; i++) out[i] -= in[i]; } +#endif /* ECP_NISTP384_ASM */ static void felem_square_ref(widefelem out, const felem in) { @@ -503,7 +514,7 @@ static void felem_mul_ref(widefelem out, const felem in1, const felem in2) * [3]: Y = 2^48 (acc[6] >> 48) * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d) */ -static void felem_reduce(felem out, const widefelem in) +static void felem_reduce_ref(felem out, const widefelem in) { /* * In order to prevent underflow, we add a multiple of p before subtracting. @@ -682,8 +693,11 @@ static void (*felem_square_p)(widefelem out, const felem in) = static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = felem_mul_wrapper; +static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref; + void p384_felem_square(widefelem out, const felem in); void p384_felem_mul(widefelem out, const felem in1, const felem in2); +void p384_felem_reduce(felem out, const widefelem in); # if defined(_ARCH_PPC64) # include "crypto/ppc_arch.h" @@ -695,6 +709,7 @@ static void felem_select(void) if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { felem_square_p = p384_felem_square; felem_mul_p = p384_felem_mul; + felem_reduce_p = p384_felem_reduce; return; } @@ -703,6 +718,7 @@ static void felem_select(void) /* Default */ felem_square_p = felem_square_ref; felem_mul_p = felem_mul_ref; + felem_reduce_p = p384_felem_reduce; } static void felem_square_wrapper(widefelem out, const felem in) @@ -719,10 +735,17 @@ static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2) # define felem_square felem_square_p # define felem_mul felem_mul_p +# define felem_reduce felem_reduce_p + +void p384_felem_square_reduce(felem out, const felem in); +void p384_felem_mul_reduce(felem out, const felem in1, const felem in2); + +# define felem_square_reduce p384_felem_square_reduce +# define felem_mul_reduce p384_felem_mul_reduce #else # define felem_square felem_square_ref # define felem_mul felem_mul_ref -#endif +# define felem_reduce felem_reduce_ref static ossl_inline void felem_square_reduce(felem out, const felem in) { @@ -739,6 +762,7 @@ static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem felem_mul(tmp, in1, in2); felem_reduce(out, tmp); } +#endif /*- * felem_inv calculates |out| = |in|^{-1}
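For reference, the new p384_felem_mul_reduce/p384_felem_square_reduce entry points compute the same thing as the inline C pair kept in the #else branch above; a minimal sketch in terms of the portable reference routines already in ecp_nistp384.c (the *_reduce_ref combined names below are illustrative only, not part of the patch):

    /* Sketch only: the combined operations expressed with the reference
     * primitives; this is what the assembly entry points are expected to
     * produce. */
    static void felem_mul_reduce_ref(felem out, const felem in1, const felem in2)
    {
        widefelem tmp;                  /* 13 x 128-bit intermediate limbs */

        felem_mul_ref(tmp, in1, in2);   /* 7x7-limb schoolbook product */
        felem_reduce_ref(out, tmp);     /* back to 7 x 56-bit limbs mod p */
    }

    static void felem_square_reduce_ref(felem out, const felem in)
    {
        widefelem tmp;

        felem_square_ref(tmp, in);
        felem_reduce_ref(out, tmp);
    }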