diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..ad8728ec2a --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +# Object files +*.o + +# Top level excludes +/Makefile.bak +/Makefile +/*.a +/include +/*.pc +/rehash.time + +# Most *.c files under test/ are symlinks +/test/*.c +# Apart from these +!/test/asn1test.c +!/test/methtest.c +!/test/dummytest.c +!/test/igetest.c +!/test/r160test.c +!/test/fips_algvs.c + +# Certificate symbolic links +*.0 + +# Links under apps +/apps/CA.pl +/apps/md4.c + + +# Auto generated headers +/crypto/buildinf.h +/crypto/opensslconf.h + +# Auto generated assembly language source files +*.s +!/crypto/bn/asm/pa-risc2.s +!/crypto/bn/asm/pa-risc2W.s + +# Executables +/apps/openssl +/test/sha256t +/test/sha512t +/test/*test +/test/fips_aesavs +/test/fips_desmovs +/test/fips_dhvs +/test/fips_drbgvs +/test/fips_dssvs +/test/fips_ecdhvs +/test/fips_ecdsavs +/test/fips_rngvs +/test/fips_test_suite +*.so* +*.dylib* +*.dll* +# Exceptions +!/test/bctest +!/crypto/des/times/486-50.sol + +# Misc auto generated files +/tools/c_rehash +/test/evptests.txt +lib +Makefile.save +*.bak +# FIPS module specific files. +/fips/fips_auth.h +/fips/fips_standalone_sha1 +/fips/fipscanister.o.sha1 + diff --git a/CHANGES b/CHANGES index 0d70e034da..8c8a09ce2b 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,47 @@ Changes between 1.0.1 and 1.1.0 [xx XXX xxxx] + *) Add perl scripts to calculate FIPS signatures for Windows + executables including WinCE. + [Andy Polyakov] + + *) Don't attempt to insert current time into AES/3DES tests, we should + be just copying input line across and this breaks some systems lacking + ctime. + [Steve Henson] + + *) Update Windows build system for FIPS. Don't compile algorithm test + utilities by default: the target build_tests is needed for that. Add + support for building fips_algvs with the build_algvs target. + [Steve Henson] + + *) Add initial cross compilation support for Windows build. 
The following + environment variables should be set: + + FIPS_SHA1_PATH: path to fips_standalone_sha1 executable which will + be used explicitly and not built. + FIPS_SIG: similar to other builds: path to a "get signature" script + which is used to obtain the signature of the target instead of + executing it on the host. + [Steve Henson] + + *) Add flag to EC_KEY to use cofactor ECDH if set. + [Steve Henson] + + *) Update fips_test_suite to support multiple command line options. New + test to induce all self test errors in sequence and check expected + failures. + [Steve Henson] + + *) Add FIPS_{rsa,dsa,ecdsa}_{sign,verify} functions which digest and + sign or verify all in one operation. + [Steve Henson] + + *) Add fips_algvs: a multicall fips utility incorporating all the algorithm + test programs and fips_test_suite. Includes functionality to parse + the minimal script output of fipsalgest.pl directly. + [Steve Henson] + *) Add authorisation parameter to FIPS_module_mode_set(). [Steve Henson] diff --git a/Configure b/Configure index 10ef8cd115..679252e415 100755 --- a/Configure +++ b/Configure @@ -132,14 +132,17 @@ my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; -my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; +# EXTREME: original asm spec was missing colon and final term. 
+#my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; +my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::::void"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void"; +my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; -my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::"; -my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::"; +my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; +my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:"; my $no_asm=":::::::::::::::void"; # As for $BSDthreads. 
Idea is to maintain "collective" set of flags, @@ -341,6 +344,8 @@ my %table=( # *-generic* is endian-neutral target, but ./config is free to # throw in -D[BL]_ENDIAN, whichever appropriate... "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +#### Extreme add linux-mips32be +"linux-mips32be","gcc:-DB_ENDIAN -DTERMIO -O3 -march=mips32 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips32_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc32_asm}:linux32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # It's believed that majority of ARM toolchains predefine appropriate -march. # If you compiler does not, do complement config command line with one! @@ -401,7 +406,8 @@ my %table=( # Android: linux-* but without -DTERMIO and pointers to headers and libs. 
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "android-x86","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:".eval{my $asm=${x86_elf_asm};$asm=~s/:elf/:android/;$asm}.":dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-pie%-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"android64-aarch64","gcc:-mandroid -fPIC -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -Wall::-D_REENTRANT::-pie%-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", #### *BSD [do see comment about ${BSDthreads} above!] 
"BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -409,6 +415,8 @@ my %table=( "BSD-x86-elf", "gcc:-DL_ENDIAN -DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "debug-BSD-x86-elf", "gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall -g::${BSDthreads}:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "BSD-sparcv8", "gcc:-DB_ENDIAN -DTERMIOS -O3 -mv8 -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${sparcv8_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"BSD-ppc85xx","gcc:-DTERMIOS -O3 -fomit-frame-pointer -msoft-float -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"debug-BSD-ppc85xx","gcc:-DTERMIOS -O0 -fomit-frame-pointer -msoft-float -Wall -g::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "BSD-generic64","gcc:-DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # -DMD32_REG_T=int doesn't actually belong in sparc64 target, it @@ -461,8 +469,8 @@ my %table=( "aix64-gcc","gcc:-maix64 -O -DB_ENDIAN::-pthread:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-maix64 -shared -Wl,-G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X64", # Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE # at build time. $OBJECT_MODE is respected at ./config stage! 
-"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded:AIX::BN_LLONG RC4_CHAR:${ppc32_asm}:aix32:dlfcn:aix-shared::-q32 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32", -"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-q64 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64", +"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded -D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR:${ppc32_asm}:aix32:dlfcn:aix-shared::-q32 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32", +"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded -D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-q64 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64", # # Cray T90 and similar (SDSC) @@ -578,6 +586,24 @@ my %table=( "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin64-x86_64-cc","cc:-arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +# iPhoneOS/iOS +# +# It takes three prior-set environment variables to make it work: +# +# CROSS_COMPILE=/where/toolchain/is/usr/bin/ [note ending slash] +# CROSS_TOP=/where/SDKs/are +# CROSS_SDK=iPhoneOSx.y.sdk +# +# Exact paths vary with Xcode releases, but for couple of last ones +# they would look 
like this: +# +# CROSS_COMPILE=`xcode-select --print-path`/Toolchains/XcodeDefault.xctoolchain/usr/bin/ +# CROSS_TOP=`xcode-select --print-path`/Platforms/iPhoneOS.platform/Developer +# CROSS_SDK=iPhoneOS7.0.sdk +# +"iphoneos-cross","cc:-O3 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fomit-frame-pointer -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"ios-cross","cc:-O3 -arch armv7 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:armcap.o armv4cpuid_ios.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::ios32:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", +"ios64-cross","cc:-O3 -arch arm64 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR -RC4_CHUNK DES_INT DES_UNROLL -BF_PTR:${aarch64_asm}:ios64:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", ##### A/UX "aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::", @@ -594,6 +620,7 @@ my %table=( ##### VxWorks for various targets "vxworks-ppc60x","ccppc:-D_REENTRANT -mrtp -mhard-float -mstrict-align -fno-implicit-fp -DPPC32_fp60x -O2 -fstrength-reduce -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/usr/h/wrn/coreip:::VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/ppc/PPC32/common:::::", "vxworks-ppcgen","ccppc:-D_REENTRANT -mrtp -msoft-float -mstrict-align -O1 -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu 
-DTOOL=gnu -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/usr/h/wrn/coreip:::VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/ppc/PPC32/sfcommon:::::", +"vxworks-ppcgen-kernel","ccppc:-D_REENTRANT -msoft-float -mstrict-align -O1 -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/h -I\$(WIND_BASE)/target/h/wrn/coreip:::VXWORKS::::::", "vxworks-ppc405","ccppc:-g -msoft-float -mlongcall -DCPU=PPC405 -I\$(WIND_BASE)/target/h:::VXWORKS:-r:::::", "vxworks-ppc750","ccppc:-ansi -nostdinc -DPPC750 -D_REENTRANT -fvolatile -fno-builtin -fno-for-scope -fsigned-char -Wall -msoft-float -mlongcall -DCPU=PPC604 -I\$(WIND_BASE)/target/h \$(DEBUG_FLAG):::VXWORKS:-r:::::", "vxworks-ppc750-debug","ccppc:-ansi -nostdinc -DPPC750 -D_REENTRANT -fvolatile -fno-builtin -fno-for-scope -fsigned-char -Wall -msoft-float -mlongcall -DCPU=PPC604 -I\$(WIND_BASE)/target/h -DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -DDEBUG -g:::VXWORKS:-r:::::", @@ -608,12 +635,15 @@ my %table=( "uClinux-dist","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):BN_LLONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::", "uClinux-dist64","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):SIXTY_FOUR_BIT_LONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::", +"c64xplus","cl6x:-mv6400+ -o2 -ox -ms -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:::DSPBIOS::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:", +"c64x","cl6x:-mv6400 -o2 -ox -ms -as -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:::DSPBIOS:::c64xcpuid.o:::aes-c64x.o 
aes_cbc.o aes_ctr.o:::sha1-c64x.o sha256-c64x.o sha512-c64x.o:::::::::void:", + ); my @MK1MF_Builds=qw(VC-WIN64I VC-WIN64A debug-VC-WIN64I debug-VC-WIN64A VC-NT VC-CE VC-WIN32 debug-VC-WIN32 - BC-32 + BC-32 c64xplus c64x netware-clib netware-clib-bsdsock netware-libc netware-libc-bsdsock); @@ -906,6 +936,7 @@ EOF } elsif (/^-[^-]/ or /^\+/) { + $_ =~ s/%([0-9a-f]{1,2})/chr(hex($1))/gei; $flags.=$_." "; } elsif (/^--prefix=(.*)$/) @@ -1553,7 +1584,7 @@ if ($rmd160_obj =~ /\.o$/) } if ($aes_obj =~ /\.o$/) { - $cflags.=" -DAES_ASM"; + $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/); # aes_ctr.o is not a real file, only indication that assembler # module implements AES_ctr32_encrypt... $cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes_ctr\.o//); @@ -1574,7 +1605,7 @@ else { $wp_obj="wp_block.o"; } $cmll_obj=$cmll_enc unless ($cmll_obj =~ /.o$/); -if ($modes_obj =~ /ghash/) +if ($modes_obj =~ /ghash\-/) { $cflags.=" -DGHASH_ASM"; } diff --git a/Makefile.fips b/Makefile.fips index 703c9f9228..74db06574b 100644 --- a/Makefile.fips +++ b/Makefile.fips @@ -186,7 +186,7 @@ SHARED_LDFLAGS= GENERAL= Makefile BASENAME= openssl NAME= $(BASENAME)-$(VERSION) -TARFILE= openssl-fips-2.0-test.tar +TARFILE= openssl-fips-2.0.tar WTARFILE= $(NAME)-win.tar EXHEADER= e_os2.h HEADER= e_os.h @@ -387,6 +387,8 @@ build_apps: @dir=apps; target=all; $(BUILD_ONE_CMD) build_tests: @dir=test; target=fipsexe; $(BUILD_ONE_CMD) +build_algvs: + @dir=test; target=fipsalgvs; $(BUILD_ONE_CMD) build_tools: @dir=tools; target=all; $(BUILD_ONE_CMD) @@ -522,8 +524,8 @@ files: links: @$(PERL) $(TOP)/util/mkdir-p.pl include/openssl @$(PERL) $(TOP)/util/mklink.pl include/openssl $(EXHEADER) - @set -e; dir=fips target=links; $(RECURSIVE_BUILD_CMD) - @(cd crypto ; SDIRS='$(LINKDIRS)' $(MAKE) -e links) + @set -e; dir=fips target=links; $(BUILD_ONE_CMD) + @(cd crypto ; TEST='' SDIRS='$(LINKDIRS)' $(MAKE) -e links) gentests: @(cd test && echo "generating dummy tests (if needed)..." 
&& \ @@ -536,9 +538,7 @@ dclean: test: tests tests: - @(cd test && echo "testing..." && \ - $(CLEARENV) && $(MAKE) -e $(BUILDENV) TOP=.. TESTS='$(TESTS)' OPENSSL_DEBUG_MEMORY=on OPENSSL_CONF=../apps/openssl.cnf tests ); - OPENSSL_CONF=apps/openssl.cnf util/opensslwrap.sh version -a + @echo "Not implemented in FIPS build" ; false report: @$(PERL) util/selftest.pl diff --git a/README.FIPS b/README.FIPS index c41bab9930..87253f6bfb 100644 --- a/README.FIPS +++ b/README.FIPS @@ -1,4 +1,4 @@ -Preliminary status and build information for FIPS module v2.0 +Preliminary status and build information for FIPS module v2.0 NB: if you are cross compiling you now need to use the latest "incore" script this can be found at util/incore in the tarballs. diff --git a/README.wishlist b/README.wishlist new file mode 100644 index 0000000000..111ee3ce75 --- /dev/null +++ b/README.wishlist @@ -0,0 +1,31 @@ +A "wish list" of changes we'd like to make to the FIPS module if we could. +Note the CMVP requires retesting of all previously tested platforms +("Operational Environments") to implement any changes considered "cryptographically +significant". Since the OpenSSL FIPS module v2.0 has some 250 such formally +tested platforms (and counting), retesting just isn't logistically or economically +feasible. 
+ +-------- +https://github.com/openssl/openssl/pull/4157 +From 2017-08-14, Fix GCM MAC computation for AES-GCM by srahul123 +cryptographically significant, not fixable + +-------- +Andy Polyakov: harmonize with __thumb__ clause in FIPS_ref_point() (#3354), +https://patch-diff.githubusercontent.com/raw/openssl/openssl/pull/3354.patch +https://github.com/openssl/openssl/pull/3354#pullrequestreview-36086406 +May be possible to introduce in future change letter + +-------- +CVE-2016-0701 +cryptographically significant, not fixable + +-------- +CVE-2014-0076 +cryptographically significant, not fixable + +-------- +"Lucky 13", CVE-2013-0169 +cryptographically significant, not fixable + +-------- diff --git a/TABLE b/TABLE index c15ac01cb4..8bdf72045d 100644 --- a/TABLE +++ b/TABLE @@ -862,7 +862,7 @@ $multilib = $cc = cc $cflags = -q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst $unistd = -$thread_cflag = -qthreaded +$thread_cflag = -qthreaded -D_THREAD_SAFE $sys_id = AIX $lflags = $bn_ops = BN_LLONG RC4_CHAR @@ -961,7 +961,7 @@ $multilib = $cc = cc $cflags = -q64 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst $unistd = -$thread_cflag = -qthreaded +$thread_cflag = -qthreaded -D_THREAD_SAFE $sys_id = AIX $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR @@ -3465,6 +3465,73 @@ $ranlib = $arflags = $multilib = +*** ios64-cross +$cc = cc +$cflags = -O3 -arch arm64 -mios-version-min=7.0.0 -isysroot $(CROSS_TOP)/SDKs/$(CROSS_SDK) -fno-common +$unistd = +$thread_cflag = -D_REENTRANT +$sys_id = iOS +$lflags = -Wl,-search_paths_first% +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR -RC4_CHUNK DES_INT DES_UNROLL -BF_PTR +$cpuid_obj = +$bn_obj = +$ec_obj = +$des_obj = +$aes_obj = +$bf_obj = +$md5_obj = +$sha1_obj = +$cast_obj = +$rc4_obj = +$rmd160_obj = +$rc5_obj = +$wp_obj = +$cmll_obj = +$modes_obj = +$engines_obj = +$perlasm_scheme = void +$dso_scheme = dlfcn +$shared_target= darwin-shared +$shared_cflag = -fPIC -fno-common +$shared_ldflag = -dynamiclib +$shared_extension = 
.$(SHLIB_MAJOR).$(SHLIB_MINOR).dylib +$ranlib = +$arflags = +$multilib = + +*** iphoneos-cross +$cc = cc +$cflags = -O3 -isysroot $(CROSS_TOP)/SDKs/$(CROSS_SDK) -fomit-frame-pointer -fno-common +$unistd = +$thread_cflag = -D_REENTRANT +$sys_id = iOS +$lflags = -Wl,-search_paths_first% +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR +$cpuid_obj = +$bn_obj = +$des_obj = +$aes_obj = +$bf_obj = +$md5_obj = +$sha1_obj = +$cast_obj = +$rc4_obj = +$rmd160_obj = +$rc5_obj = +$wp_obj = +$cmll_obj = +$modes_obj = +$engines_obj = +$perlasm_scheme = void +$dso_scheme = dlfcn +$shared_target= darwin-shared +$shared_cflag = -fPIC -fno-common +$shared_ldflag = -dynamiclib +$shared_extension = .$(SHLIB_MAJOR).$(SHLIB_MINOR).dylib +$ranlib = +$arflags = +$multilib = + *** irix-cc $cc = cc $cflags = -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN diff --git a/c6x/do_fips b/c6x/do_fips new file mode 100755 index 0000000000..4045e605ce --- /dev/null +++ b/c6x/do_fips @@ -0,0 +1,12 @@ +#!/bin/sh + +if ! which cl6x > /dev/null 2>&1; then + echo 'fatal: cl6x is not on $PATH' + exit 1 +fi + +perl Configure ${C6XPLATFORM:-c64xplus} fipscanisteronly no-engine +perl util/mkfiles.pl > MINFO +perl util/mk1mf.pl auto > c6x/fips.mak +make -f c6x/fips.mak +make -f c6x/fips_algvs.mak diff --git a/c6x/env b/c6x/env new file mode 100644 index 0000000000..543d33081e --- /dev/null +++ b/c6x/env @@ -0,0 +1,7 @@ +# MSYS-style PATH +export PATH=/c/CCStudio_v3.3/c6000/cgtools/bin:/c/Program\ Files/ActivePerl58/bin:$PATH + +# Windows-style variables +export C6X_C_DIR='C:\CCStudio_v3.3\c6000\cgtools\include;C:\CCStudio_v3.3\c6000\cgtools\lib' + +export PERL5LIB=C:/CCStudio_v3.3/bin/utilities/ccs_scripting diff --git a/c6x/fips_algvs.mak b/c6x/fips_algvs.mak new file mode 100644 index 0000000000..7f67927fbd --- /dev/null +++ b/c6x/fips_algvs.mak @@ -0,0 +1,14 @@ +CC=cl6x +CFLAGS=-mv$${C6XSILICON:-6400+} -o2 -I. 
-Ic6x/inc -Ifips -DNO_SYS_TYPES_H +OBJ_D=c6x/tmp +OUT_D=c6x + +all: $(OUT_D)/fips_algvs.out + +$(OBJ_D)/fips_algvs.obj: test/fips_algvs.c + $(CC) --obj_directory=$(OBJ_D) $(CFLAGS) -c $< + +$(OUT_D)/fips_algvs.out: $(OBJ_D)/fips_algvs.obj $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd + $(OUT_D)/fips_standalone_sha1 -verify $(OUT_D)/fipscanister.obj + $(CC) -z -o $@ -m $(OUT_D)/fips_algvs.map $< $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd + $(OUT_D)/incore6x $@ || rm $@ diff --git a/c6x/fips_standalone_sha1 b/c6x/fips_standalone_sha1 new file mode 100755 index 0000000000..ea2268cb4e --- /dev/null +++ b/c6x/fips_standalone_sha1 @@ -0,0 +1,32 @@ +#!/usr/bin/env perl +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +unshift(@INC,$dir); +require "hmac_sha1.pl"; + +(!@ARV[0] && -f @ARGV[$#ARGV]) || die "usage: $0 [-verify] file"; + +$verify=shift if (@ARGV[0] eq "-verify"); + +sysopen(FD,@ARGV[0],0) || die "$!"; +binmode(FD); + +my $ctx = HMAC->Init("etaonrishdlcupfm"); + +while (read(FD,$blob,4*1024)) { $ctx->Update($blob); } + +close(FD); + +my $signature = unpack("H*",$ctx->Final()); + +print "HMAC-SHA1(@ARGV[0])= $signature\n"; + +if ($verify) { + open(FD,"<@ARGV[0].sha1") || die "$!"; + $line = <FD>; + close(FD); + exit(0) if ($line =~ /HMAC\-SHA1\([^\)]*\)=\s*([0-9a-f]+)/i && + $1 eq $signature); + die "signature mismatch"; +} diff --git a/c6x/fipscanister.cmd b/c6x/fipscanister.cmd new file mode 100644 index 0000000000..a06ee15cb3 --- /dev/null +++ b/c6x/fipscanister.cmd @@ -0,0 +1,19 @@ +SECTIONS +{ + .text: + { + *(.fips_text:start) + *(.text) + *(.const:aes_asm) + *(.const:sha_asm) + *(.const:des_sptrans) + *(.switch) + *(.fips_text:end) + } + .const: + { + *(.fips_const:start) + *(.const) + *(.fips_const:end) + } +} diff --git a/c6x/hmac_sha1.pl b/c6x/hmac_sha1.pl new file mode 100644 index 0000000000..494f7e8569 --- /dev/null +++ b/c6x/hmac_sha1.pl @@ -0,0 +1,196 @@ +#!/usr/bin/env perl +# +# Copyright (c) 2011 The OpenSSL Project. 
+# +###################################################################### +# +# SHA1 and HMAC in Perl by . +# +{ package SHA1; + use integer; + + { + ################################### SHA1 block code generator + my @V = ('$A','$B','$C','$D','$E'); + my $i; + + sub XUpdate { + my $ret; + $ret="(\$T=\$W[($i-16)%16]^\$W[($i-14)%16]^\$W[($i-8)%16]^\$W[($i-3)%16],\n\t"; + if ((1<<31)<<1) { + $ret.=" \$W[$i%16]=((\$T<<1)|(\$T>>31))&0xffffffff)\n\t "; + } else { + $ret.=" \$W[$i%16]=(\$T<<1)|((\$T>>31)&1))\n\t "; + } + } + sub tail { + my ($a,$b,$c,$d,$e)=@V; + my $ret; + if ((1<<31)<<1) { + $ret.="(($a<<5)|($a>>27));\n\t"; + $ret.="$b=($b<<30)|($b>>2); $e&=0xffffffff; #$b&=0xffffffff;\n\t"; + } else { + $ret.="(($a<<5)|($a>>27)&0x1f);\n\t"; + $ret.="$b=($b<<30)|($b>>2)&0x3fffffff;\n\t"; + } + $ret; + } + sub BODY_00_15 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=\$W[$i]+0x5a827999+((($c^$d)&$b)^$d)+".tail(); + } + sub BODY_16_19 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0x5a827999+((($c^$d)&$b)^$d)+".tail(); + } + sub BODY_20_39 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0x6ed9eba1+($b^$c^$d)+".tail(); + } + sub BODY_40_59 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0x8f1bbcdc+(($b&$c)|(($b|$c)&$d))+".tail(); + } + sub BODY_60_79 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0xca62c1d6+($b^$c^$d)+".tail(); + } + + my $sha1_impl = + 'sub block { + my $self = @_[0]; + my @W = unpack("N16",@_[1]); + my ($A,$B,$C,$D,$E,$T) = @{$self->{H}}; + '; + + $sha1_impl.=' + $A &= 0xffffffff; + $B &= 0xffffffff; + ' if ((1<<31)<<1); + + for($i=0;$i<16;$i++){ $sha1_impl.=BODY_00_15(); unshift(@V,pop(@V)); } + for(;$i<20;$i++) { $sha1_impl.=BODY_16_19(); unshift(@V,pop(@V)); } + for(;$i<40;$i++) { $sha1_impl.=BODY_20_39(); unshift(@V,pop(@V)); } + for(;$i<60;$i++) { $sha1_impl.=BODY_40_59(); unshift(@V,pop(@V)); } + for(;$i<80;$i++) { $sha1_impl.=BODY_60_79(); unshift(@V,pop(@V)); } + + $sha1_impl.=' + $self->{H}[0]+=$A; $self->{H}[1]+=$B; $self->{H}[2]+=$C; + 
$self->{H}[3]+=$D; $self->{H}[4]+=$E; }'; + + #print $sha1_impl,"\n"; + eval($sha1_impl); # generate code + } + + sub Init { + my $class = shift; # multiple instances... + my $self = {}; + + bless $self,$class; + $self->{H} = [0x67452301,0xefcdab89,0x98badcfe,0x10325476,0xc3d2e1f0]; + $self->{N} = 0; + return $self; + } + + sub Update { + my $self = shift; + my $msg; + + foreach $msg (@_) { + my $len = length($msg); + my $num = length($self->{buf}); + my $off = 0; + + $self->{N} += $len; + + if (($num+$len)<64) + { $self->{buf} .= $msg; next; } + elsif ($num) + { $self->{buf} .= substr($msg,0,($off=64-$num)); + $self->block($self->{buf}); + } + + while(($off+64) <= $len) + { $self->block(substr($msg,$off,64)); + $off += 64; + } + + $self->{buf} = substr($msg,$off); + } + return $self; + } + + sub Final { + my $self = shift; + my $num = length($self->{buf}); + + $self->{buf} .= chr(0x80); $num++; + if ($num>56) + { $self->{buf} .= chr(0)x(64-$num); + $self->block($self->{buf}); + $self->{buf}=undef; + $num=0; + } + $self->{buf} .= chr(0)x(56-$num); + $self->{buf} .= pack("N2",($self->{N}>>29)&0x7,$self->{N}<<3); + $self->block($self->{buf}); + + return pack("N*",@{$self->{H}}); + } + + sub Selftest { + my $hash; + + $hash=SHA1->Init()->Update('abc')->Final(); + die "SHA1 test#1" if (unpack("H*",$hash) ne 'a9993e364706816aba3e25717850c26c9cd0d89d'); + + $hash=SHA1->Init()->Update('abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq')->Final(); + die "SHA1 test#2" if (unpack("H*",$hash) ne '84983e441c3bd26ebaae4aa1f95129e5e54670f1'); + + #$hash=SHA1->Init()->Update('a'x1000000)->Final(); + #die "SHA1 test#3" if (unpack("H*",$hash) ne '34aa973cd4c4daa4f61eeb2bdbad27316534016f'); + } +} + +{ package HMAC; + + sub Init { + my $class = shift; + my $key = shift; + my $self = {}; + + bless $self,$class; + + if (length($key)>64) { + $key = SHA1->Init()->Update($key)->Final(); + } + $key .= chr(0x00)x(64-length($key)); + + my @ikey = map($_^=0x36,unpack("C*",$key)); + 
{ package COFF;
    use FileHandle;

    # Copy the key/value list passed in into a fresh hash and return a
    # reference to it (shallow copy).
    sub dup { my %copy=@_; return \%copy; }

    # COFF->Load(path)
    #
    # Open |path| read-only, parse the TI-COFF file header, the string
    # table, the section headers and the symbol table, and return a blessed
    # object whose {symbols} hash maps symbol name to a hash of its fields.
    # Only symbols that live in a section with file data and whose name
    # matches /^_[a-z]+/i are recorded; each of those gets two synthesized
    # fields: st_offset (position of the symbol within the file) and
    # st_section (name of the containing section).
    #
    # Dies on any I/O failure. If the file is not TI-COFF, dies with $! set
    # to 42 so that the process exit code tells fipsld to fall back.
    sub Load {
        my $class = shift;
        my $self  = {};
        my $FD    = FileHandle->new();  # autoclose

        bless $self,$class;

        sysopen($FD,shift,0) or die "$!";
        binmode($FD);

        #################################################
        # read and parse COFF header...
        #
        read($FD,my $coff,22) or die "$!";

        my %coff_header;
        @coff_header{qw(version nsects date syms_off nsyms opt flags magic)}=
            unpack("v2V3v3",$coff);

        $!=42;          # signal fipsld to revert to two-step link
        die "not TI-COFF file" if ($coff_header{version} != 0xC2);

        # NOTE(review): computed but never consulted below; every unpack()
        # template in this routine is little-endian. Confirm whether
        # big-endian images need to be supported.
        my $big_endian = ($coff_header{flags}>>9)&1;    # 0 or 1

        my $strings;
        my $symsize;

        #################################################
        # load strings table
        #
        # The table follows the symbol entries (18 bytes each). Its first
        # four bytes are a length field, which is kept at the front of
        # $strings so that name offsets index $strings directly.
        seek($FD,$coff_header{syms_off}+18*$coff_header{nsyms},0) or die "$!";
        read($FD,$strings,4) or die "$!";
        $symsize = unpack("V",$strings);
        # An empty table has nothing beyond the length field; read() would
        # return 0 there, which must not be mistaken for a failure.
        if ($symsize>4) {
            read($FD,$strings,$symsize,4) or die "$!";
        }

        #################################################
        # read sections
        #
        my @sections;

        # seek to section headers, which follow the optional header
        seek($FD,22+$coff_header{opt},0) or die "$!";
        foreach (1..$coff_header{nsects}) {
            my %coff_shdr;
            my $name;

            read($FD,my $section,48) or die "$!";

            @coff_shdr{qw(sh_name sh_phaddr sh_vaddr
                          sh_size sh_offset sh_relocs sh_reserved
                          sh_relocoff sh_lines sh_flags)} =
                unpack("a8V9",$section);

            $name = $coff_shdr{sh_name};
            # see if sh_name is an offset in $strings: first word zero,
            # second word within the table
            my ($hi,$lo) = unpack("V2",$name);
            if ($hi==0 && $lo<$symsize) {
                $name = substr($strings,$lo,64);
            }
            # keep only what precedes the first NUL
            $coff_shdr{sh_name} = (split(chr(0),$name))[0];

            push(@sections,dup(%coff_shdr));
        }

        #################################################
        # load symbols table
        #
        my $i;

        seek($FD,$coff_header{syms_off},0) or die "$!";
        for ($i=0;$i<$coff_header{nsyms};$i++) {
            my %coff_sym;
            my $name;

            read($FD,my $blob,18) or die "$!";

            @coff_sym{qw(st_name st_value st_shndx reserved class aux)} =
                unpack("a8Vv2C2",$blob);

            # skip aux entries; they occupy symbol slots, so the loop
            # counter has to advance past them as well
            if ($coff_sym{aux}) {
                seek($FD,18*$coff_sym{aux},1) or die "$!";
                $i+=$coff_sym{aux};
            }

            $name = $coff_sym{st_name};
            # see if st_name is an offset in $strings
            my ($hi,$lo) = unpack("V2",$name);
            if ($hi==0 && $lo<$symsize) {
                $name = substr($strings,$lo,64);
            }
            $coff_sym{st_name} = $name = (split(chr(0),$name))[0];

            # st_shndx is 1-based
            my $st_secn = $coff_sym{st_shndx}-1;
            if ($st_secn>=0 && $st_secn<=$#sections
                && $sections[$st_secn]{sh_offset}
                && $name =~ m/^_[a-z]+/i) {
                # synthesize st_offset, ...
                $coff_sym{st_offset} = $coff_sym{st_value}
                                     - $sections[$st_secn]{sh_vaddr}
                                     + $sections[$st_secn]{sh_offset};
                $coff_sym{st_section} = $sections[$st_secn]{sh_name};
                # ... and add to lookup table
                $self->{symbols}{$name} = dup(%coff_sym);
            }
        }

        return $self;
    }

    # $coff->Lookup(name)
    #
    # Return the symbol hash recorded for |name| (the leading underscore is
    # added here), or undef if no such symbol was recorded by Load.
    sub Lookup {
        my $self = shift;
        my $name = shift;
        return $self->{symbols}{"_$name"};
    }

    # $coff->Traverse(code)
    #
    # Call |code| once per recorded symbol, passing the symbol hash
    # reference. Does nothing unless |code| is a plain CODE reference.
    sub Traverse {
        my $self = shift;
        my $code = shift;

        if (ref($code) eq 'CODE') {
            for (keys(%{$self->{symbols}})) { $code->($self->{symbols}{$_}); }
        }
    }
}
# Feed |$len| bytes of the executable, starting at file offset |$off|, into
# the HMAC context referenced by |$hmac|. The data is read through the
# global handle FD, which the caller must already have opened.
#
# A zero-length region is a no-op: read() returns 0 for it, which must not
# be mistaken for an error. Dies if the seek or read fails, or if fewer
# than |$len| bytes could be read, since hashing a truncated region would
# silently produce a wrong fingerprint.
sub HMAC_Update {
    my ($hmac,$off,$len) = @_;
    my $blob;

    return if ($len == 0);

    seek(FD,$off,0) or die "$!";
    my $got = read(FD,$blob,$len);
    die "$!" if (!defined($got));
    die "short read at offset $off: wanted $len bytes, got $got"
        if ($got != $len);
    $$hmac->Update($blob);
}

# fips/fips.c:FIPS_incore_fingerprint's Perl twin
#
# Compute the HMAC over the text and rodata regions of the executable,
# using the file offsets stored in the globals $FIPS_text_start,
# $FIPS_text_end, $FIPS_rodata_start and $FIPS_rodata_end. The 20 bytes at
# $FIPS_signature are left out when they fall inside the rodata region.
# Returns whatever HMAC's Final() returns.
sub FIPS_incore_fingerprint {
    my $p1  = $FIPS_text_start->{st_offset};
    my $p2  = $FIPS_text_end->{st_offset};
    my $p3  = $FIPS_rodata_start->{st_offset};
    my $p4  = $FIPS_rodata_end->{st_offset};
    my $sig = $FIPS_signature->{st_offset};
    my $ctx = HMAC->Init("etaonrishdlcupfm");

    # detect overlapping regions and merge them into [$p3,$p4), zeroing
    # $p1/$p2 so that the separate text pass below is skipped
    if ($p1<=$p3 && $p2>=$p3) {
        $p3 = $p1; $p4 = $p2>$p4?$p2:$p4; $p1 = 0; $p2 = 0;
    } elsif ($p3<=$p1 && $p4>=$p1) {
        # $p3 is already the lower bound here
        $p4 = $p2>$p4?$p2:$p4; $p1 = 0; $p2 = 0;
    }

    if ($p1) {
        HMAC_Update(\$ctx,$p1,$p2-$p1);
    }

    if ($sig>=$p3 && $sig<$p4) {
        # "punch" hole: hash up to the signature, then resume 20 bytes
        # later; the first piece is empty when $sig == $p3
        HMAC_Update(\$ctx,$p3,$sig-$p3);
        $p3 = $sig+20;
        HMAC_Update(\$ctx,$p3,$p4-$p3);
    } else {
        HMAC_Update(\$ctx,$p3,$p4-$p3);
    }

    return $ctx->Final();
}
use CCS_SCRIPTING_PERL;

my $studio=new CCS_SCRIPTING_PERL::CCS_Scripting();

$studio->CCSOpenNamed("*","*",1);       # connect to board
$studio->TargetReset();

# $exe is the executable path worked out from the first command-line
# argument by the statements that precede this section.
print "loading $exe\n";
$studio->ProgramLoad($exe);

# Write |$str| byte by byte into target data memory starting at |$addr|,
# followed by a terminating zero byte. Returns the number of bytes
# written, terminator included.
sub write_string {
    my ($studio,$addr,$str) = @_;
    my $len = length($str);
    my $i;

    for ($i=0; $i<$len; $i++) {
        $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+$i,8,vec($str,$i,8));
    }
    $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+$i,8,0);

    return $i+1;
}

# Lay out the command line at the target's __c_args symbol:
#
#   $addr               32-bit argument count
#   $addr+4*(n+1)       32-bit pointer to argument n
#   after last pointer  32-bit zero terminating the pointer array
#   after that          the NUL-terminated argument strings
#
$addr= $studio->SymbolGetAddress("__c_args");
printf "setting up __c_args at 0x%X\n",$addr;

$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr,32,$#ARGV+1);

for ($i=0,$strings=$addr+($#ARGV+3)*4; $i<=$#ARGV; $i++) {
    $off = write_string($studio,$strings,$ARGV[$i]);
    $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+4*($i+1),32,$strings);
    $strings += $off;
}
# terminate the pointer array; this goes to the data page like every
# other write above
$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+4*($i+1),32,0);

print "running...\n";
$studio->TargetRun();
importPackage(Packages.com.ti.debug.engine.scripting);
importPackage(Packages.com.ti.ccstudio.scripting.environment);
importPackage(Packages.java.lang);

if (arguments.length == 0) {
    // Extract script name from eclipse
    var regex = new RegExp("-dss\\.rhinoArgs\n(.*)");
    var matches = regex.exec(environment["eclipse.commands"]);
    // exec() yields null when the pattern is absent; fall back to a fixed
    // name so that the usage message is printed and quit(1) is reached
    // instead of the script dying on a null dereference.
    var self = (matches && matches[1]) ? matches[1] : "run6x.js";

    System.err.println("Usage: " + self + " executable [args]");
    System.err.println();
    System.err.println("You're also required to set CCSTARGETCONFIG " +
                       "environment variable to appoint");
    System.err.println("proper .ccxml file, customarily one of " +
                       "$HOME/ti/CCSTargetConfigurations/*.ccxml");
    quit(1);
}

try {
    var prog = arguments[0];
    var script = ScriptingEnvironment.instance();

    var debugServer = script.getServer("DebugServer.1");

    // CCSTARGETCONFIG environment variable should point at proper .ccxml,
    // customarily one of $HOME/ti/CCSTargetConfigurations/*.ccxml.
    debugServer.setConfig(System.getenv("CCSTARGETCONFIG"));

    var debugSession = debugServer.openSession("*", "*");

    // Redirect GEL output to |prog|.gel file, so that it doesn't clobber
    // standard output from the program. If |prog| has an extension, it is
    // replaced by ".gel"; otherwise ".gel" is appended.
    var dot = prog.lastIndexOf(".");
    var gel_out = prog + ".gel";
    if (dot > 0) {
        gel_out = prog.substr(0,dot) + ".gel";
    }
    debugSession.expression.evaluate('GEL_EnableFileOutput("'
                                      + gel_out + '", 0, 0)');

    debugSession.target.connect();

    // It should be noted that "current working directory" for program
    // executed on the target system is one where |prog| resides, and
    // not where script executed [as one would expect]...
    debugSession.memory.loadProgram(prog, arguments);

    // Pull exit()'s address and set breakpoint, then just execute till
    // it's reached...
    var exitAddr = debugSession.symbol.getAddress("exit");
    debugSession.breakpoint.add(exitAddr);

    while (1) {
        debugSession.target.run();

        // run() may return for other reasons; keep going until the
        // program counter actually sits at exit()
        var PC = debugSession.expression.evaluate("PC");
        if (PC == exitAddr) {
            break;
        }
    }

    // Snatch value passed to exit(), so that it can be passed down to
    // shell as exit code from this script...
    var exitCode = debugSession.expression.evaluate("A4");

    // Last run to termination...
    debugSession.target.run();
    // Clean up...
    debugSession.terminate();
    debugServer.stop();

    // It should be noted that there is kind of a bug in C6x run-time.
    // Return value from main() is not passed to last implicit exit()
    // call [as it would on other systems], but instead constant 1 is
    // passed, which conventionally indicates an error. So that if one
    // wants to pass specific exit code, or even 0 indicating "success",
    // one has to call exit() explicitly instead of relying on value
    // returned by main()...
    quit(exitCode);

} catch (e) {
    // We catch everything, because default handler terminates script with
    // "success" exit code upon exception...
    System.err.println(e.rhinoException);
    quit(139);
}
if [ "$TEST" = "false" -a -t 1 ]; then @@ -557,7 +573,21 @@ case "$GUESSOS" in (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1 fi fi - OUT="darwin-i386-cc" ;; + if [ "$ISA64" = "1" -a "$KERNEL_BITS" = "64" ]; then + OUT="darwin64-x86_64-cc" + else + OUT="darwin-i386-cc" + fi ;; + armv6+7-*-iphoneos) + options="$options -arch%20armv6 -arch%20armv7" + OUT="iphoneos-cross" ;; + *-*-iphoneos) + options="$options -arch%20${MACHINE}" + OUT="iphoneos-cross" ;; + armv7-*-ios) + OUT="ios-cross" ;; + arm64-*-ios*) + OUT="ios64-cross" ;; alpha-*-linux2) ISA=`awk '/cpu model/{print$4;exit(0);}' /proc/cpuinfo` case ${ISA:-generic} in @@ -583,6 +613,7 @@ case "$GUESSOS" in ;; ppc-*-linux2) OUT="linux-ppc" ;; ppc60x-*-vxworks*) OUT="vxworks-ppc60x" ;; + ppcgen-kernel-vxworks*) OUT="vxworks-ppcgen-kernel" ;; ppcgen-*-vxworks*) OUT="vxworks-ppcgen" ;; pentium-*-vxworks*) OUT="vxworks-pentium" ;; simlinux-*-vxworks*) OUT="vxworks-simlinux" ;; @@ -664,7 +695,7 @@ case "$GUESSOS" in sun4[uv]*-*-solaris2) OUT="solaris-sparcv9-$CC" ISA64=`(isalist) 2>/dev/null | grep sparcv9` - if [ "$ISA64" != "" ]; then + if [ "$ISA64" != "" -a "$KERNEL_BITS" = "" ]; then if [ "$CC" = "cc" -a $CCVER -ge 50 ]; then echo "WARNING! If you wish to build 64-bit library, then you have to" echo " invoke './Configure solaris64-sparcv9-cc' *manually*." 
@@ -694,13 +725,16 @@ case "$GUESSOS" in fi fi fi + if [ "$ISA64" != "" -a "$KERNEL_BITS" = "64" ]; then + OUT="solaris64-sparcv9-$CC" + fi ;; sun4m-*-solaris2) OUT="solaris-sparcv8-$CC" ;; sun4d-*-solaris2) OUT="solaris-sparcv8-$CC" ;; sun4*-*-solaris2) OUT="solaris-sparcv7-$CC" ;; *86*-*-solaris2) ISA64=`(isalist) 2>/dev/null | grep amd64` - if [ "$ISA64" != "" ]; then + if [ "$ISA64" != "" -a ${KERNEL_BITS:-64} -eq 64 ]; then OUT="solaris64-x86_64-$CC" else OUT="solaris-x86-$CC" @@ -717,17 +751,23 @@ case "$GUESSOS" in sparc64-*-*bsd*) OUT="BSD-sparc64" ;; ia64-*-*bsd*) OUT="BSD-ia64" ;; amd64-*-*bsd*) OUT="BSD-x86_64" ;; - *86*-*-*bsd*) # mimic ld behaviour when it's looking for libc... - if [ -L /usr/lib/libc.so ]; then # [Free|Net]BSD - libc=/usr/lib/libc.so - else # OpenBSD - # ld searches for highest libc.so.* and so do we - libc=`(ls /usr/lib/libc.so.* | tail -1) 2>/dev/null` - fi - case "`(file -L $libc) 2>/dev/null`" in - *ELF*) OUT="BSD-x86-elf" ;; - *) OUT="BSD-x86"; options="$options no-sse2" ;; - esac ;; + *86*-*-*bsd*) if [ -z ${CROSS_COMPILE} ]; then + # mimic ld behaviour when it's looking for libc... 
+ if [ -L /usr/lib/libc.so ]; then # [Free|Net]BSD + libc=/usr/lib/libc.so + else # OpenBSD + # ld searches for highest libc.so.* and so do we + libc=`(ls /usr/lib/libc.so.* | tail -1) 2>/dev/null` + fi + echo "libc = $libc" + case "`(file -L $libc) 2>/dev/null`" in + *ELF*) OUT="BSD-x86-elf" ;; + *) OUT="BSD-x86"; options="$options no-sse2" ;; + esac + else + OUT="BSD-x86-elf" + fi;; + ppc85xx-*-*bsd*) OUT="BSD-ppc85xx" ;; # MPC85XX has no hardware FP accelerator *-*-*bsd*) OUT="BSD-generic32" ;; *-*-osf) OUT="osf1-alpha-cc" ;; @@ -825,6 +865,7 @@ case "$GUESSOS" in *-*-qnx6) OUT="QNX6" ;; x86-*-android|i?86-*-android) OUT="android-x86" ;; armv[7-9]*-*-android) OUT="android-armv7" ;; + aarch64-*-android) OUT="android64-aarch64" ;; *) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;; esac diff --git a/crypto/Makefile b/crypto/Makefile index 22cb2a5013..7304684f76 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -87,6 +87,7 @@ ppccpuid.s: ppccpuid.pl; $(PERL) ppccpuid.pl $(PERLASM_SCHEME) $@ pariscid.s: pariscid.pl; $(PERL) pariscid.pl $(PERLASM_SCHEME) $@ alphacpuid.s: alphacpuid.pl $(PERL) $< | $(CC) -E - | tee $@ > /dev/null +arm64cpuid.S: arm64cpuid.pl; $(PERL) arm64cpuid.pl $(PERLASM_SCHEME) > $@ subdirs: @target=all; $(RECURSIVE_MAKE) diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile index 8edd358bd3..34760d174a 100644 --- a/crypto/aes/Makefile +++ b/crypto/aes/Makefile @@ -71,6 +71,8 @@ aes-sparcv9.s: asm/aes-sparcv9.pl aes-ppc.s: asm/aes-ppc.pl $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@ +aesp8-ppc.s: asm/aesp8-ppc.pl + $(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@ aes-parisc.s: asm/aes-parisc.pl $(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@ @@ -78,6 +80,10 @@ aes-parisc.s: asm/aes-parisc.pl aes-mips.S: asm/aes-mips.pl $(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@ +aesv8-armx.S: asm/aesv8-armx.pl + $(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@ +aesv8-armx.o: aesv8-armx.S + # GNU make "catch all" aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ 
aes-armv4.o: aes-armv4.S diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl index 55b6e04b67..ed5125827b 100644 --- a/crypto/aes/asm/aes-armv4.pl +++ b/crypto/aes/asm/aes-armv4.pl @@ -32,8 +32,20 @@ # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~21.5 cycles per byte. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $s0="r0"; $s1="r1"; @@ -171,7 +183,12 @@ AES_encrypt: stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 +#ifdef __APPLE__ + mov $tbl,#AES_encrypt-AES_Te + sub $tbl,r3,$tbl @ Te +#else sub $tbl,r3,#AES_encrypt-AES_Te @ Te +#endif #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... @@ -425,7 +442,12 @@ AES_set_encrypt_key: bne .Labrt .Lok: stmdb sp!,{r4-r12,lr} +#ifdef __APPLE__ + mov $tbl,#AES_set_encrypt_key-AES_Te-1024 + sub $tbl,r3,$tbl @ Te4 +#else sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 +#endif mov $rounds,r0 @ inp mov lr,r1 @ bits @@ -886,7 +908,12 @@ AES_decrypt: stmdb sp!,{r1,r4-r12,lr} mov $rounds,r0 @ inp mov $key,r2 +#ifdef __APPLE__ + mov $tbl,#AES_decrypt-AES_Td + sub $tbl,r3,$tbl @ Td +#else sub $tbl,r3,#AES_decrypt-AES_Td @ Td +#endif #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... 
diff --git a/crypto/aes/asm/aes-c64x.pl b/crypto/aes/asm/aes-c64x.pl new file mode 100644 index 0000000000..0817128c1b --- /dev/null +++ b/crypto/aes/asm/aes-c64x.pl @@ -0,0 +1,1375 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# [Endian-neutral] AES for C64x. +# +# Even though loops are scheduled for 13 cycles, and thus expected +# performance is ~8.5 cycles per byte processed with 128-bit key, +# measured performance turned to be ~10 cycles per byte. Discrepancy +# must be caused by limitations of L1D memory banking(*), see SPRU871 +# TI publication for further details. If any consolation it's still +# ~20% faster than TI's linear assembly module anyway... Compared to +# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this +# code is 3.75x faster and almost 3x smaller (tables included). +# +# (*) This means that there might be subtle correlation between data +# and timing and one can wonder if it can be ... attacked:-( +# On the other hand this also means that *if* one chooses to +# implement *4* T-tables variant [instead of 1 T-table as in +# this implementation, or in addition to], then one ought to +# *interleave* them. Even though it complicates addressing, +# references to interleaved tables would be guaranteed not to +# clash. I reckon that it should be possible to break 8 cycles +# per byte "barrier," i.e. improve by ~20%, naturally at the +# cost of 8x increased pressure on L1D. 8x because you'd have +# to interleave both Te and Td tables... 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($TEA,$TEB)=("A5","B5"); +($KPA,$KPB)=("A3","B1"); +@K=("A6","B6","A7","B7"); +@s=("A8","B8","A9","B9"); +@Te0=@Td0=("A16","B16","A17","B17"); +@Te1=@Td1=("A18","B18","A19","B19"); +@Te2=@Td2=("A20","B20","A21","B21"); +@Te3=@Td3=("A22","B22","A23","B23"); + +$code=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg AES_encrypt,_AES_encrypt + .asg AES_decrypt,_AES_decrypt + .asg AES_set_encrypt_key,_AES_set_encrypt_key + .asg AES_set_decrypt_key,_AES_set_decrypt_key + .asg AES_ctr32_encrypt,_AES_ctr32_encrypt + .endif + + .asg B3,RA + .asg A4,INP + .asg B4,OUT + .asg A6,KEY + .asg A4,RET + .asg B15,SP + + .eval 24,EXT0 + .eval 16,EXT1 + .eval 8,EXT2 + .eval 0,EXT3 + .eval 8,TBL1 + .eval 16,TBL2 + .eval 24,TBL3 + + .if .BIG_ENDIAN + .eval 24-EXT0,EXT0 + .eval 24-EXT1,EXT1 + .eval 24-EXT2,EXT2 + .eval 24-EXT3,EXT3 + .eval 32-TBL1,TBL1 + .eval 32-TBL2,TBL2 + .eval 32-TBL3,TBL3 + .endif + + .global _AES_encrypt +_AES_encrypt: + .asmfunc + MVK 1,B2 +__encrypt: + .if __TI_EABI__ + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL \$PCR_OFFSET(AES_Te,__encrypt),$TEA +|| ADDKPC __encrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH \$PCR_OFFSET(AES_Te,__encrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .else + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Te-__encrypt),$TEA +|| ADDKPC __encrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Te-__encrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .endif + LDW *$KPA++[2],$Te0[0] ; zero round key +|| LDW *$KPB++[2],$Te0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Te + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Te) + LDW *$KPA++[2],$Te0[2] +|| LDW *$KPB++[2],$Te0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Te0[0],$s[0],$s[0] 
+|| XOR $Te0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + + LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +|| EXTU $s[1],EXT1,24,$Te1[1] +|| EXTU $s[0],EXT3,24,$Te3[0] +|| SUB B0,1,B0 +;;==================================================================== +enc_loop?: + LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0 +|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1 +|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT3,24,$Te3[1] +|| EXTU $s[0],EXT1,24,$Te1[0] + LDW *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2 +|| LDW *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3 +|| EXTU $s[2],EXT2,24,$Te2[2] +|| EXTU $s[3],EXT2,24,$Te2[3] + LDW *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0 +|| LDW *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1 +|| EXTU $s[3],EXT3,24,$Te3[3] +|| EXTU $s[2],EXT1,24,$Te1[2] + LDW *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0 +|| LDW *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1 +|| EXTU $s[0],EXT2,24,$Te2[0] +|| EXTU $s[1],EXT2,24,$Te2[1] + LDW *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2 +|| LDW *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3 +|| EXTU $s[3],EXT1,24,$Te1[3] +|| EXTU $s[2],EXT3,24,$Te3[2] + LDW *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2 +|| LDW *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3 +|| ROTL $Te1[1],TBL1,$Te3[0] ; t0 +|| ROTL $Te3[0],TBL3,$Te1[1] ; t1 +|| EXTU $s[0],EXT0,24,$Te0[0] +|| EXTU $s[1],EXT0,24,$Te0[1] + LDW *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0 +|| LDW *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1 +|| ROTL $Te3[1],TBL3,$Te1[0] ; t2 +|| ROTL $Te1[0],TBL1,$Te3[1] ; t3 +|| EXTU $s[2],EXT0,24,$Te0[2] +|| EXTU $s[3],EXT0,24,$Te0[3] +|| [B0] SUB B0,1,B0 + LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2 +|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3 +|| ROTL $Te2[2],TBL2,$Te2[2] ; t0 +|| ROTL $Te2[3],TBL2,$Te2[3] ; t1 +|| XOR $K[0],$Te3[0],$s[0] +|| XOR $K[1],$Te1[1],$s[1] +|| [B0] BNOP enc_loop? 
+ ROTL $Te3[3],TBL3,$Te1[2] ; t0 +|| ROTL $Te1[2],TBL1,$Te3[3] ; t1 +|| XOR $K[2],$Te1[0],$s[2] +|| XOR $K[3],$Te3[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Te2[0],TBL2,$Te2[0] ; t2 +|| ROTL $Te2[1],TBL2,$Te2[1] ; t3 +|| XOR $s[0],$Te2[2],$s[0] +|| XOR $s[1],$Te2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Te1[3],TBL1,$Te3[2] ; t2 +|| ROTL $Te3[2],TBL3,$Te1[3] ; t3 +|| XOR $s[0],$Te1[2],$s[0] +|| XOR $s[1],$Te3[3],$s[1] + XOR $s[2],$Te2[0],$s[2] +|| XOR $s[3],$Te2[1],$s[3] +|| XOR $s[0],$Te0[0],$s[0] +|| XOR $s[1],$Te0[1],$s[1] + XOR $s[2],$Te3[2],$s[2] +|| XOR $s[3],$Te1[3],$s[3] +|| EXTU $s[1],EXT1,24,$Te1[1] +|| EXTU $s[0],EXT3,24,$Te3[0] +||[!B0] ADD ${TEA},A0,${TEA} ; point to Te4 +||[!B0] ADD ${TEB},A0,${TEB} +;;==================================================================== + LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0 +|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1 +|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Te0[0] +|| EXTU $s[1],EXT0,24,$Te0[1] + LDBU *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0 +|| LDBU *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1 +|| EXTU $s[3],EXT3,24,$Te3[3] +|| EXTU $s[2],EXT1,24,$Te1[2] + LDBU *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0 +|| LDBU *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1 +|| EXTU $s[2],EXT2,24,$Te2[2] +|| EXTU $s[3],EXT2,24,$Te2[3] + LDBU *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0 +|| LDBU *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1 +|| EXTU $s[1],EXT3,24,$Te3[1] +|| EXTU $s[0],EXT1,24,$Te1[0] + LDBU *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2 +|| LDBU *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3 +|| EXTU $s[3],EXT1,24,$Te1[3] +|| EXTU $s[2],EXT3,24,$Te3[2] + LDBU *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2 +|| LDBU *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3 +|| EXTU $s[2],EXT0,24,$Te0[2] +|| EXTU $s[3],EXT0,24,$Te0[3] + LDBU *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2 +|| LDBU 
*${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3 +|| EXTU $s[0],EXT2,24,$Te2[0] +|| EXTU $s[1],EXT2,24,$Te2[1] + LDBU *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2 +|| LDBU *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3 + + .if .BIG_ENDIAN + PACK2 $Te0[0],$Te1[1],$Te0[0] +|| PACK2 $Te0[1],$Te1[2],$Te0[1] + PACK2 $Te2[2],$Te3[3],$Te2[2] +|| PACK2 $Te2[3],$Te3[0],$Te2[3] + PACKL4 $Te0[0],$Te2[2],$Te0[0] +|| PACKL4 $Te0[1],$Te2[3],$Te0[1] + XOR $K[0],$Te0[0],$Te0[0] ; s[0] +|| XOR $K[1],$Te0[1],$Te0[1] ; s[1] + + PACK2 $Te0[2],$Te1[3],$Te0[2] +|| PACK2 $Te0[3],$Te1[0],$Te0[3] + PACK2 $Te2[0],$Te3[1],$Te2[0] +|| PACK2 $Te2[1],$Te3[2],$Te2[1] +|| BNOP RA + PACKL4 $Te0[2],$Te2[0],$Te0[2] +|| PACKL4 $Te0[3],$Te2[1],$Te0[3] + XOR $K[2],$Te0[2],$Te0[2] ; s[2] +|| XOR $K[3],$Te0[3],$Te0[3] ; s[3] + + MV $Te0[0],A9 +|| MV $Te0[1],A8 + MV $Te0[2],B9 +|| MV $Te0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Te1[1],$Te0[0],$Te1[1] +|| PACK2 $Te1[2],$Te0[1],$Te1[2] + PACK2 $Te3[3],$Te2[2],$Te3[3] +|| PACK2 $Te3[0],$Te2[3],$Te3[0] + PACKL4 $Te3[3],$Te1[1],$Te1[1] +|| PACKL4 $Te3[0],$Te1[2],$Te1[2] + XOR $K[0],$Te1[1],$Te1[1] ; s[0] +|| XOR $K[1],$Te1[2],$Te1[2] ; s[1] + + PACK2 $Te1[3],$Te0[2],$Te1[3] +|| PACK2 $Te1[0],$Te0[3],$Te1[0] + PACK2 $Te3[1],$Te2[0],$Te3[1] +|| PACK2 $Te3[2],$Te2[1],$Te3[2] +|| BNOP RA + PACKL4 $Te3[1],$Te1[3],$Te1[3] +|| PACKL4 $Te3[2],$Te1[0],$Te1[0] + XOR $K[2],$Te1[3],$Te1[3] ; s[2] +|| XOR $K[3],$Te1[0],$Te1[0] ; s[3] + + MV $Te1[1],A8 +|| MV $Te1[2],A9 + MV $Te1[3],B8 +|| MV $Te1[0],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc + + .global _AES_decrypt +_AES_decrypt: + .asmfunc + MVK 1,B2 +__decrypt: + .if __TI_EABI__ + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL \$PCR_OFFSET(AES_Td,__decrypt),$TEA +|| ADDKPC __decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH \$PCR_OFFSET(AES_Td,__decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .else + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL 
(AES_Td-__decrypt),$TEA +|| ADDKPC __decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Td-__decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .endif + LDW *$KPA++[2],$Td0[0] ; zero round key +|| LDW *$KPB++[2],$Td0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Td + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Td) + LDW *$KPA++[2],$Td0[2] +|| LDW *$KPB++[2],$Td0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Td0[0],$s[0],$s[0] +|| XOR $Td0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + + LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +|| EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] +|| SUB B0,1,B0 +;;==================================================================== +dec_loop?: + LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| ROTL $Td3[1],TBL3,$Td1[0] ; t0 +|| ROTL $Td1[0],TBL1,$Td3[1] ; t1 +|| EXTU $s[0],EXT0,24,$Td0[0] +|| 
EXTU $s[1],EXT0,24,$Td0[1] + LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| ROTL $Td1[1],TBL1,$Td3[0] ; t2 +|| ROTL $Td3[0],TBL3,$Td1[1] ; t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] +|| [B0] SUB B0,1,B0 + LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 +|| ROTL $Td2[2],TBL2,$Td2[2] ; t0 +|| ROTL $Td2[3],TBL2,$Td2[3] ; t1 +|| XOR $K[0],$Td1[0],$s[0] +|| XOR $K[1],$Td3[1],$s[1] +|| [B0] BNOP dec_loop? + ROTL $Td1[3],TBL1,$Td3[2] ; t0 +|| ROTL $Td3[2],TBL3,$Td1[3] ; t1 +|| XOR $K[2],$Td3[0],$s[2] +|| XOR $K[3],$Td1[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Td2[0],TBL2,$Td2[0] ; t2 +|| ROTL $Td2[1],TBL2,$Td2[1] ; t3 +|| XOR $s[0],$Td2[2],$s[0] +|| XOR $s[1],$Td2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Td3[3],TBL3,$Td1[2] ; t2 +|| ROTL $Td1[2],TBL1,$Td3[3] ; t3 +|| XOR $s[0],$Td3[2],$s[0] +|| XOR $s[1],$Td1[3],$s[1] + XOR $s[2],$Td2[0],$s[2] +|| XOR $s[3],$Td2[1],$s[3] +|| XOR $s[0],$Td0[0],$s[0] +|| XOR $s[1],$Td0[1],$s[1] + XOR $s[2],$Td1[2],$s[2] +|| XOR $s[3],$Td3[3],$s[3] +|| EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] +||[!B0] ADD ${TEA},A0,${TEA} ; point to Td4 +||[!B0] ADD ${TEB},A0,${TEB} +;;==================================================================== + LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], 
t0 +|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 + + .if .BIG_ENDIAN + PACK2 $Td0[0],$Td1[3],$Td0[0] +|| PACK2 $Td0[1],$Td1[0],$Td0[1] + PACK2 $Td2[2],$Td3[1],$Td2[2] +|| PACK2 $Td2[3],$Td3[2],$Td2[3] + PACKL4 $Td0[0],$Td2[2],$Td0[0] +|| PACKL4 $Td0[1],$Td2[3],$Td0[1] + XOR $K[0],$Td0[0],$Td0[0] ; s[0] +|| XOR $K[1],$Td0[1],$Td0[1] ; s[1] + + PACK2 $Td0[2],$Td1[1],$Td0[2] +|| PACK2 $Td0[3],$Td1[2],$Td0[3] + PACK2 $Td2[0],$Td3[3],$Td2[0] +|| PACK2 $Td2[1],$Td3[0],$Td2[1] +|| BNOP RA + PACKL4 $Td0[2],$Td2[0],$Td0[2] +|| PACKL4 $Td0[3],$Td2[1],$Td0[3] + XOR $K[2],$Td0[2],$Td0[2] ; s[2] +|| XOR $K[3],$Td0[3],$Td0[3] ; s[3] + + MV $Td0[0],A9 +|| MV $Td0[1],A8 + MV $Td0[2],B9 +|| MV $Td0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Td1[3],$Td0[0],$Td1[3] +|| PACK2 $Td1[0],$Td0[1],$Td1[0] + PACK2 $Td3[1],$Td2[2],$Td3[1] +|| PACK2 $Td3[2],$Td2[3],$Td3[2] + PACKL4 $Td3[1],$Td1[3],$Td1[3] +|| PACKL4 $Td3[2],$Td1[0],$Td1[0] + XOR $K[0],$Td1[3],$Td1[3] ; s[0] +|| XOR $K[1],$Td1[0],$Td1[0] ; s[1] + + PACK2 $Td1[1],$Td0[2],$Td1[1] +|| PACK2 $Td1[2],$Td0[3],$Td1[2] + PACK2 $Td3[3],$Td2[0],$Td3[3] +|| PACK2 $Td3[0],$Td2[1],$Td3[0] +|| BNOP RA + PACKL4 $Td3[3],$Td1[1],$Td1[1] +|| PACKL4 $Td3[0],$Td1[2],$Td1[2] + XOR $K[2],$Td1[1],$Td1[1] ; s[2] +|| XOR $K[3],$Td1[2],$Td1[2] ; s[3] + + MV $Td1[3],A8 +|| MV 
$Td1[0],A9 + MV $Td1[1],B8 +|| MV $Td1[2],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc +___ +{ +my @K=(@K,@s); # extended key +my @Te4=map("B$_",(16..19)); + +my @Kx9=@Te0; # used in AES_set_decrypt_key +my @KxB=@Te1; +my @KxD=@Te2; +my @KxE=@Te3; + +$code.=<<___; + .asg OUT,BITS + + .global _AES_set_encrypt_key +_AES_set_encrypt_key: +__set_encrypt_key: + .asmfunc + MV INP,A0 +|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8 +|| MV KEY,A1 + [!A0] B RA +||[!A0] MVK -1,RET +||[!A0] MVK 1,A1 ; only one B RA + [!A1] B RA +||[!A1] MVK -1,RET +||[!A1] MVK 0,A0 +|| MVK 0,B0 +|| MVK 0,A1 + [A0] LDNDW *INP++,A9:A8 +|| [A0] CMPEQ 4,BITS,B0 +|| [A0] CMPLT 3,BITS,A1 + [B0] B key128? +|| [A1] LDNDW *INP++,B9:B8 +|| [A0] CMPEQ 6,BITS,B0 +|| [A0] CMPLT 5,BITS,A1 + [B0] B key192? +|| [A1] LDNDW *INP++,B17:B16 +|| [A0] CMPEQ 8,BITS,B0 +|| [A0] CMPLT 7,BITS,A1 + [B0] B key256? +|| [A1] LDNDW *INP++,B19:B18 + + .if __TI_EABI__ + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA +|| [A0] ADDKPC __set_encrypt_key,B6 + [A0] MVKH \$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .else + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL (AES_Te4-__set_encrypt_key),$TEA +|| [A0] ADDKPC __set_encrypt_key,B6 + [A0] MVKH (AES_Te4-__set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .endif + NOP + NOP + + BNOP RA,5 +|| MVK -2,RET ; unknown bit length +|| MVK 0,B0 ; redundant +;;==================================================================== +;;==================================================================== +key128?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$Te4[2] +|| MV B8,$K[3] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$Te4[2] +|| MV B9,$K[3] + .endif + + MVK 256,A0 +|| MVK 8,B0 + + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop128?: + LDW *A30++[1],A31 ; rcon[i] 
+|| MV $Te4[2],$K[2] +|| EXTU $K[3],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[3],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| BDEC loop128?,B0 + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| BDEC loop128?,B0 + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] +;;==================================================================== + BNOP RA + MV $Te4[2],$K[2] +|| STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 10,B0 ; rounds + STW B0,*++${KPB}[15] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key192?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$Te4[2] +|| MV B16,$K[5] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$Te4[2] +|| MV B17,$K[5] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop192?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[4] +|| EXTU $K[5],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[5],A0 +|| EXTU $K[5],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[5],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| 
LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + BDEC loop192?,B0 +|| XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + MV $Te4[2],$K[2] +|| XOR $K[3],$K[4],$Te4[2] ; K[4] + XOR $Te4[2],$K[5],$K[5] ; K[5] +;;==================================================================== + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 12,B0 ; rounds + STW B0,*++${KPB}[7] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key256?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$K[4] +|| MV B16,$K[5] +|| MV B19,$Te4[2] +|| MV B18,$K[7] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$K[4] +|| MV B17,$K[5] +|| MV B18,$Te4[2] +|| MV B19,$K[7] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop256?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[6] +|| EXTU $K[7],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[7],A0 +|| EXTU $K[7],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[7],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + 
STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + STW $K[6],*$KPA++[2] +|| STW $K[7],*$KPB++[2] +|| XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] +||[!B0] B done256? + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] +||[!B0] B done256? + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + + MV $Te4[2],$K[2] +|| [B0] EXTU $K[3],EXT0,24,$Te4[0] +|| [B0] SUB B0,1,B0 + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT1,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT2,24,A0 +|| EXTU $K[3],EXT3,24,$Te4[3] + + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + NOP 3 + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + NOP 3 + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? 
+ PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + + XOR $Te4[3],$K[4],$Te4[0] ; K[4] + XOR $Te4[0],$K[5],$K[5] ; K[5] + MV $Te4[0],$K[4] +|| XOR $K[5],$K[6],$Te4[2] ; K[6] + XOR $Te4[2],$K[7],$K[7] ; K[7] +;;==================================================================== +done256?: + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 14,B0 ; rounds + STW B0,*--${KPB}[1] + MVK 0,RET + .endasmfunc + + .global _AES_set_decrypt_key +_AES_set_decrypt_key: + .asmfunc + B __set_encrypt_key ; guarantee local call + MV KEY,B30 ; B30 is not modified + MV RA, B31 ; B31 is not modified + ADDKPC ret?,RA,2 +ret?: ; B0 holds rounds or zero + [!B0] BNOP B31 ; return if zero + [B0] SHL B0,4,A0 ; offset to last round key + [B0] SHRU B0,1,B2 + [B0] SUB B2,2,B2 +|| [B0] MVK 0x0000001B,B3 ; AES polynomial + [B0] MVKH 0x07000000,B3 +|| [B0] MV B30,$KPA + [B0] ADD B30,A0,$KPB +|| [B0] MVK 16,A0 ; sizeof(round key) +;;==================================================================== +flip_loop?: + LDW *${KPA}[0],A16 +|| LDW *${KPB}[0],B16 + LDW *${KPA}[1],A17 +|| LDW *${KPB}[1],B17 + LDW *${KPA}[2],A18 +|| LDW *${KPB}[2],B18 + LDW *${KPA}[3],A19 +|| ADD $KPA,A0,$KPA +|| LDW *${KPB}[3],B19 +|| SUB $KPB,A0,$KPB +|| BDEC flip_loop?,B2 + NOP + STW B16,*${KPA}[-4] +|| STW A16,*${KPB}[4] + STW B17,*${KPA}[-3] +|| STW A17,*${KPB}[5] + STW B18,*${KPA}[-2] +|| STW A18,*${KPB}[6] + STW B19,*${KPA}[-1] +|| STW A19,*${KPB}[7] +;;==================================================================== + SUB B0,1,B0 ; skip last round +|| ADD B30,A0,$KPA ; skip first round +|| ADD B30,A0,$KPB +|| MVC GFPGFR,B30 ; save GFPGFR + LDW *${KPA}[0],$K[0] +|| LDW *${KPB}[1],$K[1] +|| MVC B3,GFPGFR + LDW *${KPA}[2],$K[2] +|| LDW *${KPB}[3],$K[3] + MVK 0x00000909,A24 +|| MVK 0x00000B0B,B24 + MVKH 0x09090000,A24 +|| MVKH 0x0B0B0000,B24 + SUB B0,1,B0 + + GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| GMPY4 $K[1],A24,$Kx9[1] +|| MVK 0x00000D0D,A25 +|| MVK 0x00000E0E,B25 
+ GMPY4 $K[2],A24,$Kx9[2] +|| GMPY4 $K[3],A24,$Kx9[3] +|| MVKH 0x0D0D0000,A25 +|| MVKH 0x0E0E0000,B25 + + GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| GMPY4 $K[1],B24,$KxB[1] + GMPY4 $K[2],B24,$KxB[2] +|| GMPY4 $K[3],B24,$KxB[3] + +;;==================================================================== +invmix_loop?: + GMPY4 $K[0],A25,$KxD[0] ; ·0x0D +|| GMPY4 $K[1],A25,$KxD[1] +|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16 +|| SWAP2 $Kx9[1],$Kx9[1] +|| MV $K[0],$s[0] ; this or DINT +|| MV $K[1],$s[1] +|| [B0] LDW *${KPA}[4],$K[0] +|| [B0] LDW *${KPB}[5],$K[1] + GMPY4 $K[2],A25,$KxD[2] +|| GMPY4 $K[3],A25,$KxD[3] +|| SWAP2 $Kx9[2],$Kx9[2] +|| SWAP2 $Kx9[3],$Kx9[3] +|| MV $K[2],$s[2] +|| MV $K[3],$s[3] +|| [B0] LDW *${KPA}[6],$K[2] +|| [B0] LDW *${KPB}[7],$K[3] + + GMPY4 $s[0],B25,$KxE[0] ; ·0x0E +|| GMPY4 $s[1],B25,$KxE[1] +|| XOR $Kx9[0],$KxB[0],$KxB[0] +|| XOR $Kx9[1],$KxB[1],$KxB[1] + GMPY4 $s[2],B25,$KxE[2] +|| GMPY4 $s[3],B25,$KxE[3] +|| XOR $Kx9[2],$KxB[2],$KxB[2] +|| XOR $Kx9[3],$KxB[3],$KxB[3] + + ROTL $KxB[0],TBL3,$KxB[0] +|| ROTL $KxB[1],TBL3,$KxB[1] +|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16 +|| SWAP2 $KxD[1],$KxD[1] + ROTL $KxB[2],TBL3,$KxB[2] +|| ROTL $KxB[3],TBL3,$KxB[3] +|| SWAP2 $KxD[2],$KxD[2] +|| SWAP2 $KxD[3],$KxD[3] +|| [B0] B invmix_loop? 
+ + XOR $KxE[0],$KxD[0],$KxE[0] +|| XOR $KxE[1],$KxD[1],$KxE[1] +|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| [B0] GMPY4 $K[1],A24,$Kx9[1] +|| ADDAW $KPA,4,$KPA + XOR $KxE[2],$KxD[2],$KxE[2] +|| XOR $KxE[3],$KxD[3],$KxE[3] +|| [B0] GMPY4 $K[2],A24,$Kx9[2] +|| [B0] GMPY4 $K[3],A24,$Kx9[3] +|| ADDAW $KPB,4,$KPB + + XOR $KxB[0],$KxE[0],$KxE[0] +|| XOR $KxB[1],$KxE[1],$KxE[1] +|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| [B0] GMPY4 $K[1],B24,$KxB[1] + XOR $KxB[2],$KxE[2],$KxE[2] +|| XOR $KxB[3],$KxE[3],$KxE[3] +|| [B0] GMPY4 $K[2],B24,$KxB[2] +|| [B0] GMPY4 $K[3],B24,$KxB[3] +|| STW $KxE[0],*${KPA}[-4] +|| STW $KxE[1],*${KPB}[-3] + STW $KxE[2],*${KPA}[-2] +|| STW $KxE[3],*${KPB}[-1] +|| [B0] SUB B0,1,B0 +;;==================================================================== + BNOP B31,3 + MVC B30,GFPGFR ; restore GFPGFR(*) + MVK 0,RET + .endasmfunc +___ +# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there +# are code samples out there that *assume* its default value. +} +{ +my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8"); +$code.=<<___; + .global _AES_ctr32_encrypt +_AES_ctr32_encrypt: + .asmfunc + LDNDW *${ivp}[0],A31:A30 ; load counter value +|| MV $blocks,A2 ; reassign $blocks +|| MV RA,B27 ; reassign RA +|| MV $key,B26 ; reassign $key + LDNDW *${ivp}[1],B31:B30 +|| MVK 0,B2 ; don't let __encrypt load input +|| MVK 0,A1 ; and postpone writing output + .if .BIG_ENDIAN + NOP + .else + NOP 4 + SWAP2 B31,B31 ; keep least significant 32 bits + SWAP4 B31,B31 ; in host byte order + .endif +ctr32_loop?: + [A2] BNOP __encrypt +|| [A1] XOR A29,A9,A9 ; input^Ek(counter) +|| [A1] XOR A28,A8,A8 +|| [A2] LDNDW *INP++,A29:A28 ; load input + [!A2] BNOP B27 ; return +|| [A1] XOR B29,B9,B9 +|| [A1] XOR B28,B8,B8 +|| [A2] LDNDW *INP++,B29:B28 + .if .BIG_ENDIAN + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] MV A31,A9 ; pass counter value to __encrypt +|| [A2] MV A30,A8 ; pass counter value to __encrypt + [A1] STNDW B9:B8,*OUT++ +|| [A2] DMV B31,B30,B9:B8 +|| 
[A2] ADD B30,1,B30 ; counter++ + .else + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] MV A31,A9 +|| [A2] MV A30,A8 +|| [A2] SWAP2 B31,B0 +|| [A2] ADD B31,1,B31 ; counter++ + [A1] STNDW B9:B8,*OUT++ +|| [A2] MV B30,B8 +|| [A2] SWAP4 B0,B9 + .endif + [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop? +|| [A2] MV B26,KEY ; pass $key +|| [A2] SUB A2,1,A2 ; $blocks-- +||[!A1] MVK 1,A1 + NOP + NOP + .endasmfunc +___ +} +# Tables are kept in endian-neutral manner +$code.=<<___; + .if __TI_EABI__ + .sect ".text:aes_asm.const" + .else + .sect ".const:aes_asm" + .endif + .align 128 +AES_Te: + .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 + .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d + .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd + .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 + .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 + .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d + .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 + .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a + .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d + .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 + .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb + .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b + .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 + .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea + .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 + .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b + .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c + .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a + .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 + .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f + .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 + .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 + .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 + .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f + .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 + .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e + .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 + .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 + .byte 0x0e,0x07,0x07,0x09, 
0x24,0x12,0x12,0x36 + .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d + .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 + .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f + .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e + .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e + .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 + .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb + .byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d + .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce + .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e + .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 + .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 + .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c + .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f + .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed + .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 + .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b + .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 + .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a + .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a + .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 + .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 + .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 + .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 + .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 + .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 + .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 + .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe + .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a + .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc + .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 + .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 + .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 + .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a + .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d + .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 + .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f + .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 + .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 + .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 + .byte 
0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 + .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 + .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 + .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 + .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f + .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e + .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 + .byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 + .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c + .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 + .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 + .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 + .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e + .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a + .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 + .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e + .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 + .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 + .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b + .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 + .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 + .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 + .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 + .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa + .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 + .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e + .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 + .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 + .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 + .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 + .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 + .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c + .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 + .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc + .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 + .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 + .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa + .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 + .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 + .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f + .byte 0xae,0x57,0x57,0xf9, 
0x69,0xb9,0xb9,0xd0 + .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 + .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 + .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 + .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 + .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 + .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 + .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 + .byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 + .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff + .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a + .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 + .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 + .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 + .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 + .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 + .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 + .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc + .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a +AES_Te4: + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 
0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +rcon: + .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 + .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 + .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 + .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 + .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + .align 128 +AES_Td: + .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 + .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 + .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 + .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 + .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 + .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 + .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 + .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f + .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 + .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 + .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 + .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 + .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 + .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda + .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 + .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 + .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 + .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd + .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 + .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 + .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 + .byte 
0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 + .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 + .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 + .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 + .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 + .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 + .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a + .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 + .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 + .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 + .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c + .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 + .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 + .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 + .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a + .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 + .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 + .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa + .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 + .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d + .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 + .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 + .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff + .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 + .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 + .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 + .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb + .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 + .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 + .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 + .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e + .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 + .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 + .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 + .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a + .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f + .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e + .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 + .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 + .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 + .byte 0x3c,0x22,0xe0,0x43, 
0x12,0x1b,0x17,0x1d + .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad + .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 + .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c + .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd + .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc + .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 + .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc + .byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 + .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 + .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 + .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 + .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d + .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 + .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 + .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 + .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 + .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a + .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef + .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 + .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 + .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 + .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 + .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d + .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 + .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 + .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 + .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c + .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 + .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 + .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b + .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 + .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 + .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e + .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 + .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce + .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 + .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 + .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 + .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 + .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 + .byte 
0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 + .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f + .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d + .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf + .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b + .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f + .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d + .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e + .byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 + .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 + .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a + .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 + .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 + .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c + .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f + .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf + .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b + .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 + .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e + .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f + .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c + .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 + .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde + .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 + .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 + .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 +AES_Td4: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 
0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .cstring "AES for C64x, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64xplus.pl new file mode 100644 index 0000000000..206d7dce88 --- /dev/null +++ b/crypto/aes/asm/aes-c64xplus.pl @@ -0,0 +1,1329 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# [Endian-neutral] AES for C64x+. +# +# Even though SPLOOPs are scheduled for 13 cycles, and thus expected +# performance is ~8.5 cycles per byte processed with 128-bit key, +# measured performance turned to be ~10 cycles per byte. 
Discrepancy +# must be caused by limitations of L1D memory banking(*), see SPRU871 +# TI publication for further details. If it's any consolation, it's still +# ~20% faster than TI's linear assembly module anyway... Compared to +# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this +# code is 3.75x faster and almost 3x smaller (tables included). +# +# (*) This means that there might be a subtle correlation between data +# and timing and one can wonder if it can be ... attacked:-( +# On the other hand this also means that *if* one chooses to +# implement a *4* T-tables variant [instead of 1 T-table as in +# this implementation, or in addition to it], then one ought to +# *interleave* them. Even though it complicates addressing, +# references to interleaved tables would be guaranteed not to +# clash. I reckon that it should be possible to break the 8 cycles +# per byte "barrier," i.e. improve by ~20%, naturally at the +# cost of 8x increased pressure on L1D. 8x because you'd have +# to interleave both Te and Td tables...
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($TEA,$TEB)=("A5","B5"); +($KPA,$KPB)=("A3","B1"); +@K=("A6","B6","A7","B7"); +@s=("A8","B8","A9","B9"); +@Te0=@Td0=("A16","B16","A17","B17"); +@Te1=@Td1=("A18","B18","A19","B19"); +@Te2=@Td2=("A20","B20","A21","B21"); +@Te3=@Td3=("A22","B22","A23","B23"); + +$code=<<___; + .text + + .asg B3,RA + .asg A4,INP + .asg B4,OUT + .asg A6,KEY + .asg A4,RET + .asg B15,SP + + .eval 24,EXT0 + .eval 16,EXT1 + .eval 8,EXT2 + .eval 0,EXT3 + .eval 8,TBL1 + .eval 16,TBL2 + .eval 24,TBL3 + + .if .BIG_ENDIAN + .eval 24-EXT0,EXT0 + .eval 24-EXT1,EXT1 + .eval 24-EXT2,EXT2 + .eval 24-EXT3,EXT3 + .eval 32-TBL1,TBL1 + .eval 32-TBL2,TBL2 + .eval 32-TBL3,TBL3 + .endif + + .global _AES_encrypt +_AES_encrypt: + .asmfunc + MVK 1,B2 +__encrypt: + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Te-_AES_encrypt),$TEA +|| ADDKPC _AES_encrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Te-_AES_encrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + LDW *$KPA++[2],$Te0[0] ; zero round key +|| LDW *$KPB++[2],$Te0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Te + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Te) + LDW *$KPA++[2],$Te0[2] +|| LDW *$KPB++[2],$Te0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Te0[0],$s[0],$s[0] +|| XOR $Te0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + SUB B0,2,B0 + + SPLOOPD 13 +|| MVC B0,ILC +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +;;==================================================================== + EXTU $s[1],EXT1,24,$Te1[1] +|| EXTU $s[0],EXT3,24,$Te3[0] + LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0 +|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1 +|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT3,24,$Te3[1] +|| 
EXTU $s[0],EXT1,24,$Te1[0] + LDW *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2 +|| LDW *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3 +|| EXTU $s[2],EXT2,24,$Te2[2] +|| EXTU $s[3],EXT2,24,$Te2[3] + LDW *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0 +|| LDW *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1 +|| EXTU $s[3],EXT3,24,$Te3[3] +|| EXTU $s[2],EXT1,24,$Te1[2] + LDW *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0 +|| LDW *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1 +|| EXTU $s[0],EXT2,24,$Te2[0] +|| EXTU $s[1],EXT2,24,$Te2[1] + LDW *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2 +|| LDW *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3 +|| EXTU $s[3],EXT1,24,$Te1[3] +|| EXTU $s[2],EXT3,24,$Te3[2] + LDW *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2 +|| LDW *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3 +|| ROTL $Te1[1],TBL1,$Te3[0] ; t0 +|| ROTL $Te3[0],TBL3,$Te1[1] ; t1 +|| EXTU $s[0],EXT0,24,$Te0[0] +|| EXTU $s[1],EXT0,24,$Te0[1] + LDW *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0 +|| LDW *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1 +|| ROTL $Te3[1],TBL3,$Te1[0] ; t2 +|| ROTL $Te1[0],TBL1,$Te3[1] ; t3 +|| EXTU $s[2],EXT0,24,$Te0[2] +|| EXTU $s[3],EXT0,24,$Te0[3] + LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2 +|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3 +|| ROTL $Te2[2],TBL2,$Te2[2] ; t0 +|| ROTL $Te2[3],TBL2,$Te2[3] ; t1 +|| XOR $K[0],$Te3[0],$s[0] +|| XOR $K[1],$Te1[1],$s[1] + ROTL $Te3[3],TBL3,$Te1[2] ; t0 +|| ROTL $Te1[2],TBL1,$Te3[3] ; t1 +|| XOR $K[2],$Te1[0],$s[2] +|| XOR $K[3],$Te3[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Te2[0],TBL2,$Te2[0] ; t2 +|| ROTL $Te2[1],TBL2,$Te2[1] ; t3 +|| XOR $s[0],$Te2[2],$s[0] +|| XOR $s[1],$Te2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Te1[3],TBL1,$Te3[2] ; t2 +|| ROTL $Te3[2],TBL3,$Te1[3] ; t3 +|| XOR $s[0],$Te1[2],$s[0] +|| XOR $s[1],$Te3[3],$s[1] + XOR $s[2],$Te2[0],$s[2] +|| XOR $s[3],$Te2[1],$s[3] +|| XOR $s[0],$Te0[0],$s[0] +|| XOR $s[1],$Te0[1],$s[1] + SPKERNEL +|| XOR.L $s[2],$Te3[2],$s[2] +|| XOR.L 
$s[3],$Te1[3],$s[3] +;;==================================================================== + ADD.D ${TEA},A0,${TEA} ; point to Te4 +|| ADD.D ${TEB},A0,${TEB} +|| EXTU $s[1],EXT1,24,$Te1[1] +|| EXTU $s[0],EXT3,24,$Te3[0] + LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0 +|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1 +|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Te0[0] +|| EXTU $s[1],EXT0,24,$Te0[1] + LDBU *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0 +|| LDBU *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1 +|| EXTU $s[3],EXT3,24,$Te3[3] +|| EXTU $s[2],EXT1,24,$Te1[2] + LDBU *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0 +|| LDBU *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1 +|| EXTU $s[2],EXT2,24,$Te2[2] +|| EXTU $s[3],EXT2,24,$Te2[3] + LDBU *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0 +|| LDBU *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1 +|| EXTU $s[1],EXT3,24,$Te3[1] +|| EXTU $s[0],EXT1,24,$Te1[0] + LDBU *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2 +|| LDBU *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3 +|| EXTU $s[3],EXT1,24,$Te1[3] +|| EXTU $s[2],EXT3,24,$Te3[2] + LDBU *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2 +|| LDBU *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3 +|| EXTU $s[2],EXT0,24,$Te0[2] +|| EXTU $s[3],EXT0,24,$Te0[3] + LDBU *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2 +|| LDBU *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3 +|| EXTU $s[0],EXT2,24,$Te2[0] +|| EXTU $s[1],EXT2,24,$Te2[1] + LDBU *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2 +|| LDBU *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3 + + .if .BIG_ENDIAN + PACK2 $Te0[0],$Te1[1],$Te0[0] +|| PACK2 $Te0[1],$Te1[2],$Te0[1] + PACK2 $Te2[2],$Te3[3],$Te2[2] +|| PACK2 $Te2[3],$Te3[0],$Te2[3] + PACKL4 $Te0[0],$Te2[2],$Te0[0] +|| PACKL4 $Te0[1],$Te2[3],$Te0[1] + XOR $K[0],$Te0[0],$Te0[0] ; s[0] +|| XOR $K[1],$Te0[1],$Te0[1] ; s[1] + + PACK2 $Te0[2],$Te1[3],$Te0[2] +|| PACK2 $Te0[3],$Te1[0],$Te0[3] + PACK2 $Te2[0],$Te3[1],$Te2[0] +|| PACK2 $Te2[1],$Te3[2],$Te2[1] +|| BNOP RA + PACKL4 
$Te0[2],$Te2[0],$Te0[2] +|| PACKL4 $Te0[3],$Te2[1],$Te0[3] + XOR $K[2],$Te0[2],$Te0[2] ; s[2] +|| XOR $K[3],$Te0[3],$Te0[3] ; s[3] + + MV $Te0[0],A9 +|| MV $Te0[1],A8 + MV $Te0[2],B9 +|| MV $Te0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Te1[1],$Te0[0],$Te1[1] +|| PACK2 $Te1[2],$Te0[1],$Te1[2] + PACK2 $Te3[3],$Te2[2],$Te3[3] +|| PACK2 $Te3[0],$Te2[3],$Te3[0] + PACKL4 $Te3[3],$Te1[1],$Te1[1] +|| PACKL4 $Te3[0],$Te1[2],$Te1[2] + XOR $K[0],$Te1[1],$Te1[1] ; s[0] +|| XOR $K[1],$Te1[2],$Te1[2] ; s[1] + + PACK2 $Te1[3],$Te0[2],$Te1[3] +|| PACK2 $Te1[0],$Te0[3],$Te1[0] + PACK2 $Te3[1],$Te2[0],$Te3[1] +|| PACK2 $Te3[2],$Te2[1],$Te3[2] +|| BNOP RA + PACKL4 $Te3[1],$Te1[3],$Te1[3] +|| PACKL4 $Te3[2],$Te1[0],$Te1[0] + XOR $K[2],$Te1[3],$Te1[3] ; s[2] +|| XOR $K[3],$Te1[0],$Te1[0] ; s[3] + + MV $Te1[1],A8 +|| MV $Te1[2],A9 + MV $Te1[3],B8 +|| MV $Te1[0],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc + + .global _AES_decrypt +_AES_decrypt: + .asmfunc + MVK 1,B2 +__decrypt: + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Td-_AES_decrypt),$TEA +|| ADDKPC _AES_decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Td-_AES_decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + LDW *$KPA++[2],$Td0[0] ; zero round key +|| LDW *$KPB++[2],$Td0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Td + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Td) + LDW *$KPA++[2],$Td0[2] +|| LDW *$KPB++[2],$Td0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Td0[0],$s[0],$s[0] +|| XOR $Td0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + SUB B0,2,B0 + + SPLOOPD 13 +|| MVC B0,ILC +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +;;==================================================================== + EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + 
LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| ROTL $Td3[1],TBL3,$Td1[0] ; t0 +|| ROTL $Td1[0],TBL1,$Td3[1] ; t1 +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| ROTL $Td1[1],TBL1,$Td3[0] ; t2 +|| ROTL $Td3[0],TBL3,$Td1[1] ; t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDW *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 +|| ROTL $Td2[2],TBL2,$Td2[2] ; t0 +|| ROTL $Td2[3],TBL2,$Td2[3] ; t1 +|| XOR $K[0],$Td1[0],$s[0] +|| XOR $K[1],$Td3[1],$s[1] + ROTL $Td1[3],TBL1,$Td3[2] ; t0 +|| ROTL $Td3[2],TBL3,$Td1[3] ; t1 +|| XOR $K[2],$Td3[0],$s[2] +|| XOR $K[3],$Td1[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Td2[0],TBL2,$Td2[0] ; t2 +|| ROTL $Td2[1],TBL2,$Td2[1] ; t3 +|| XOR $s[0],$Td2[2],$s[0] +|| XOR $s[1],$Td2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Td3[3],TBL3,$Td1[2] ; t2 +|| ROTL $Td1[2],TBL1,$Td3[3] 
; t3 +|| XOR $s[0],$Td3[2],$s[0] +|| XOR $s[1],$Td1[3],$s[1] + XOR $s[2],$Td2[0],$s[2] +|| XOR $s[3],$Td2[1],$s[3] +|| XOR $s[0],$Td0[0],$s[0] +|| XOR $s[1],$Td0[1],$s[1] + SPKERNEL +|| XOR.L $s[2],$Td1[2],$s[2] +|| XOR.L $s[3],$Td3[3],$s[3] +;;==================================================================== + ADD.D ${TEA},A0,${TEA} ; point to Td4 +|| ADD.D ${TEB},A0,${TEB} +|| EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDBU *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 + + .if .BIG_ENDIAN + PACK2 $Td0[0],$Td1[3],$Td0[0] +|| PACK2 $Td0[1],$Td1[0],$Td0[1] + PACK2 $Td2[2],$Td3[1],$Td2[2] +|| PACK2 $Td2[3],$Td3[2],$Td2[3] + PACKL4 $Td0[0],$Td2[2],$Td0[0] +|| PACKL4 $Td0[1],$Td2[3],$Td0[1] + XOR 
$K[0],$Td0[0],$Td0[0] ; s[0] +|| XOR $K[1],$Td0[1],$Td0[1] ; s[1] + + PACK2 $Td0[2],$Td1[1],$Td0[2] +|| PACK2 $Td0[3],$Td1[2],$Td0[3] + PACK2 $Td2[0],$Td3[3],$Td2[0] +|| PACK2 $Td2[1],$Td3[0],$Td2[1] +|| BNOP RA + PACKL4 $Td0[2],$Td2[0],$Td0[2] +|| PACKL4 $Td0[3],$Td2[1],$Td0[3] + XOR $K[2],$Td0[2],$Td0[2] ; s[2] +|| XOR $K[3],$Td0[3],$Td0[3] ; s[3] + + MV $Td0[0],A9 +|| MV $Td0[1],A8 + MV $Td0[2],B9 +|| MV $Td0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Td1[3],$Td0[0],$Td1[3] +|| PACK2 $Td1[0],$Td0[1],$Td1[0] + PACK2 $Td3[1],$Td2[2],$Td3[1] +|| PACK2 $Td3[2],$Td2[3],$Td3[2] + PACKL4 $Td3[1],$Td1[3],$Td1[3] +|| PACKL4 $Td3[2],$Td1[0],$Td1[0] + XOR $K[0],$Td1[3],$Td1[3] ; s[0] +|| XOR $K[1],$Td1[0],$Td1[0] ; s[1] + + PACK2 $Td1[1],$Td0[2],$Td1[1] +|| PACK2 $Td1[2],$Td0[3],$Td1[2] + PACK2 $Td3[3],$Td2[0],$Td3[3] +|| PACK2 $Td3[0],$Td2[1],$Td3[0] +|| BNOP RA + PACKL4 $Td3[3],$Td1[1],$Td1[1] +|| PACKL4 $Td3[0],$Td1[2],$Td1[2] + XOR $K[2],$Td1[1],$Td1[1] ; s[2] +|| XOR $K[3],$Td1[2],$Td1[2] ; s[3] + + MV $Td1[3],A8 +|| MV $Td1[0],A9 + MV $Td1[1],B8 +|| MV $Td1[2],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc +___ +{ +my @K=(@K,@s); # extended key +my @Te4=map("B$_",(16..19)); + +my @Kx9=@Te0; # used in AES_set_decrypt_key +my @KxB=@Te1; +my @KxD=@Te2; +my @KxE=@Te3; + +$code.=<<___; + .asg OUT,BITS + + .global _AES_set_encrypt_key +_AES_set_encrypt_key: +__set_encrypt_key: + .asmfunc + MV INP,A0 +|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8 +|| MV KEY,A1 + [!A0] B RA +||[!A0] MVK -1,RET +||[!A0] MVK 1,A1 ; only one B RA + [!A1] B RA +||[!A1] MVK -1,RET +||[!A1] MVK 0,A0 +|| MVK 0,B0 +|| MVK 0,A1 + [A0] LDNDW *INP++,A9:A8 +|| [A0] CMPEQ 4,BITS,B0 +|| [A0] CMPLT 3,BITS,A1 + [B0] B key128? +|| [A1] LDNDW *INP++,B9:B8 +|| [A0] CMPEQ 6,BITS,B0 +|| [A0] CMPLT 5,BITS,A1 + [B0] B key192? +|| [A1] LDNDW *INP++,B17:B16 +|| [A0] CMPEQ 8,BITS,B0 +|| [A0] CMPLT 7,BITS,A1 + [B0] B key256? 
+|| [A1] LDNDW *INP++,B19:B18 + + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL (AES_Te4-_AES_set_encrypt_key),$TEA +|| [A0] ADDKPC _AES_set_encrypt_key,B6 + [A0] MVKH (AES_Te4-_AES_set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + NOP + NOP + + BNOP RA,5 +|| MVK -2,RET ; unknown bit length +|| MVK 0,B0 ; redundant +;;==================================================================== +;;==================================================================== +key128?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$Te4[2] +|| MV B8,$K[3] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$Te4[2] +|| MV B9,$K[3] + .endif + + MVK 256,A0 +|| MVK 9,B0 + + SPLOOPD 14 +|| MVC B0,ILC +|| MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[2] +|| EXTU $K[3],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[3],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + SPKERNEL +;;==================================================================== + BNOP RA + MV $Te4[2],$K[2] +|| STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 10,B0 ; rounds + STW B0,*++${KPB}[15] + MVK 0,RET
+;;==================================================================== +;;==================================================================== +key192?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$Te4[2] +|| MV B16,$K[5] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$Te4[2] +|| MV B17,$K[5] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop192?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[4] +|| EXTU $K[5],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[5],A0 +|| EXTU $K[5],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[5],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + BDEC loop192?,B0 +|| XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + MV $Te4[2],$K[2] +|| XOR $K[3],$K[4],$Te4[2] ; K[4] + XOR $Te4[2],$K[5],$K[5] ; K[5] +;;==================================================================== + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 12,B0 ; rounds + STW B0,*++${KPB}[7] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key256?: + .if 
.BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$K[4] +|| MV B16,$K[5] +|| MV B19,$Te4[2] +|| MV B18,$K[7] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$K[4] +|| MV B17,$K[5] +|| MV B18,$Te4[2] +|| MV B19,$K[7] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop256?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[6] +|| EXTU $K[7],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[7],A0 +|| EXTU $K[7],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[7],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + STW $K[6],*$KPA++[2] +|| STW $K[7],*$KPB++[2] +|| XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] +||[!B0] B done256? + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] +||[!B0] B done256? + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + + MV $Te4[2],$K[2] +|| [B0] EXTU $K[3],EXT0,24,$Te4[0] +|| [B0] SUB B0,1,B0 + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT1,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT2,24,A0 +|| EXTU $K[3],EXT3,24,$Te4[3] + + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + NOP 3 + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? 
+ PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + NOP 3 + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + + XOR $Te4[3],$K[4],$Te4[0] ; K[4] + XOR $Te4[0],$K[5],$K[5] ; K[5] + MV $Te4[0],$K[4] +|| XOR $K[5],$K[6],$Te4[2] ; K[6] + XOR $Te4[2],$K[7],$K[7] ; K[7] +;;==================================================================== +done256?: + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 14,B0 ; rounds + STW B0,*--${KPB}[1] + MVK 0,RET + .endasmfunc + + .global _AES_set_decrypt_key +_AES_set_decrypt_key: + .asmfunc + B __set_encrypt_key ; guarantee local call + MV KEY,B30 ; B30 is not modified + MV RA, B31 ; B31 is not modified + ADDKPC ret?,RA,2 +ret?: ; B0 holds rounds or zero + [!B0] BNOP B31 ; return if zero + [B0] SHL B0,4,A0 ; offset to last round key + [B0] SHRU B0,1,B1 + [B0] SUB B1,1,B1 + [B0] MVK 0x0000001B,B3 ; AES polynomial + [B0] MVKH 0x07000000,B3 + + SPLOOPD 9 ; flip round keys +|| MVC B1,ILC +|| MV B30,$KPA +|| ADD B30,A0,$KPB +|| MVK 16,A0 ; sizeof(round key) +;;==================================================================== + LDW *${KPA}[0],A16 +|| LDW *${KPB}[0],B16 + LDW *${KPA}[1],A17 +|| LDW *${KPB}[1],B17 + LDW *${KPA}[2],A18 +|| LDW *${KPB}[2],B18 + LDW *${KPA}[3],A19 +|| ADD $KPA,A0,$KPA +|| LDW *${KPB}[3],B19 +|| SUB $KPB,A0,$KPB + NOP + STW B16,*${KPA}[-4] +|| STW A16,*${KPB}[4] + STW B17,*${KPA}[-3] +|| STW A17,*${KPB}[5] + STW B18,*${KPA}[-2] +|| STW A18,*${KPB}[6] + STW B19,*${KPA}[-1] +|| STW A19,*${KPB}[7] + SPKERNEL +;;==================================================================== + SUB B0,1,B0 ; skip last round +|| ADD B30,A0,$KPA ; skip first round +|| ADD B30,A0,$KPB +|| MVC GFPGFR,B30 ; save GFPGFR + LDW *${KPA}[0],$K[0] +|| LDW *${KPB}[1],$K[1] +|| MVC B3,GFPGFR + LDW *${KPA}[2],$K[2] +|| LDW *${KPB}[3],$K[3] + MVK 0x00000909,A24 +|| MVK 
0x00000B0B,B24 + MVKH 0x09090000,A24 +|| MVKH 0x0B0B0000,B24 + MVC B0,ILC +|| SUB B0,1,B0 + + GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| GMPY4 $K[1],A24,$Kx9[1] +|| MVK 0x00000D0D,A25 +|| MVK 0x00000E0E,B25 + GMPY4 $K[2],A24,$Kx9[2] +|| GMPY4 $K[3],A24,$Kx9[3] +|| MVKH 0x0D0D0000,A25 +|| MVKH 0x0E0E0000,B25 + + GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| GMPY4 $K[1],B24,$KxB[1] + GMPY4 $K[2],B24,$KxB[2] +|| GMPY4 $K[3],B24,$KxB[3] + + SPLOOP 11 ; InvMixColumns +;;==================================================================== + GMPY4 $K[0],A25,$KxD[0] ; ·0x0D +|| GMPY4 $K[1],A25,$KxD[1] +|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16 +|| SWAP2 $Kx9[1],$Kx9[1] +|| MV $K[0],$s[0] ; this or DINT +|| MV $K[1],$s[1] +|| [B0] LDW *${KPA}[4],$K[0] +|| [B0] LDW *${KPB}[5],$K[1] + GMPY4 $K[2],A25,$KxD[2] +|| GMPY4 $K[3],A25,$KxD[3] +|| SWAP2 $Kx9[2],$Kx9[2] +|| SWAP2 $Kx9[3],$Kx9[3] +|| MV $K[2],$s[2] +|| MV $K[3],$s[3] +|| [B0] LDW *${KPA}[6],$K[2] +|| [B0] LDW *${KPB}[7],$K[3] + + GMPY4 $s[0],B25,$KxE[0] ; ·0x0E +|| GMPY4 $s[1],B25,$KxE[1] +|| XOR $Kx9[0],$KxB[0],$KxB[0] +|| XOR $Kx9[1],$KxB[1],$KxB[1] + GMPY4 $s[2],B25,$KxE[2] +|| GMPY4 $s[3],B25,$KxE[3] +|| XOR $Kx9[2],$KxB[2],$KxB[2] +|| XOR $Kx9[3],$KxB[3],$KxB[3] + + ROTL $KxB[0],TBL3,$KxB[0] +|| ROTL $KxB[1],TBL3,$KxB[1] +|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16 +|| SWAP2 $KxD[1],$KxD[1] + ROTL $KxB[2],TBL3,$KxB[2] +|| ROTL $KxB[3],TBL3,$KxB[3] +|| SWAP2 $KxD[2],$KxD[2] +|| SWAP2 $KxD[3],$KxD[3] + + XOR $KxE[0],$KxD[0],$KxE[0] +|| XOR $KxE[1],$KxD[1],$KxE[1] +|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| [B0] GMPY4 $K[1],A24,$Kx9[1] +|| ADDAW $KPA,4,$KPA + XOR $KxE[2],$KxD[2],$KxE[2] +|| XOR $KxE[3],$KxD[3],$KxE[3] +|| [B0] GMPY4 $K[2],A24,$Kx9[2] +|| [B0] GMPY4 $K[3],A24,$Kx9[3] +|| ADDAW $KPB,4,$KPB + + XOR $KxB[0],$KxE[0],$KxE[0] +|| XOR $KxB[1],$KxE[1],$KxE[1] +|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| [B0] GMPY4 $K[1],B24,$KxB[1] + XOR $KxB[2],$KxE[2],$KxE[2] +|| XOR $KxB[3],$KxE[3],$KxE[3] +|| [B0] GMPY4 $K[2],B24,$KxB[2] +|| 
[B0] GMPY4 $K[3],B24,$KxB[3] +|| STW $KxE[0],*${KPA}[-4] +|| STW $KxE[1],*${KPB}[-3] + STW $KxE[2],*${KPA}[-2] +|| STW $KxE[3],*${KPB}[-1] +|| [B0] SUB B0,1,B0 + SPKERNEL +;;==================================================================== + BNOP B31,3 + MVC B30,GFPGFR ; restore GFPGFR(*) + MVK 0,RET + .endasmfunc +___ +# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there +# are code samples out there that *assume* its default value. +} +{ +my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8"); +$code.=<<___; + .global _AES_ctr32_encrypt +_AES_ctr32_encrypt: + .asmfunc + LDNDW *${ivp}[0],A31:A30 ; load counter value +|| MV $blocks,A2 ; reassign $blocks +|| DMV RA,$key,B27:B26 ; reassign RA and $key + LDNDW *${ivp}[1],B31:B30 +|| MVK 0,B2 ; don't let __encrypt load input +|| MVK 0,A1 ; and postpone writing output + .if .BIG_ENDIAN + NOP + .else + NOP 4 + SWAP2 B31,B31 ; keep least significant 32 bits + SWAP4 B31,B31 ; in host byte order + .endif +ctr32_loop?: + [A2] BNOP __encrypt +|| [A1] XOR A29,A9,A9 ; input^Ek(counter) +|| [A1] XOR A28,A8,A8 +|| [A2] LDNDW *INP++,A29:A28 ; load input + [!A2] BNOP B27 ; return +|| [A1] XOR B29,B9,B9 +|| [A1] XOR B28,B8,B8 +|| [A2] LDNDW *INP++,B29:B28 + .if .BIG_ENDIAN + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 ; pass counter value to __encrypt + [A1] STNDW B9:B8,*OUT++ +|| [A2] DMV B31,B30,B9:B8 +|| [A2] ADD B30,1,B30 ; counter++ + .else + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 +|| [A2] SWAP2 B31,B0 +|| [A2] ADD B31,1,B31 ; counter++ + [A1] STNDW B9:B8,*OUT++ +|| [A2] MV B30,B8 +|| [A2] SWAP4 B0,B9 + .endif + [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop? 
+|| [A2] MV B26,KEY ; pass $key +|| [A2] SUB A2,1,A2 ; $blocks-- +||[!A1] MVK 1,A1 + NOP + NOP + .endasmfunc +___ +} +# Tables are kept in endian-neutral manner +$code.=<<___; + .sect ".const:aes_asm" + .align 128 +AES_Te: + .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 + .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d + .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd + .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 + .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 + .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d + .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 + .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a + .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d + .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 + .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb + .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b + .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 + .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea + .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 + .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b + .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c + .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a + .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 + .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f + .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 + .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 + .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 + .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f + .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 + .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e + .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 + .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 + .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 + .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d + .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 + .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f + .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e + .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e + .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 + .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb + .byte 
0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d + .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce + .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e + .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 + .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 + .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c + .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f + .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed + .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 + .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b + .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 + .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a + .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a + .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 + .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 + .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 + .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 + .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 + .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 + .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 + .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe + .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a + .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc + .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 + .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 + .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 + .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a + .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d + .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 + .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f + .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 + .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 + .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 + .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 + .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 + .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 + .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 + .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f + .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e + .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 + .byte 0x8c,0x46,0x46,0xca, 
0xc7,0xee,0xee,0x29 + .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c + .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 + .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 + .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 + .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e + .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a + .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 + .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e + .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 + .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 + .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b + .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 + .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 + .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 + .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 + .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa + .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 + .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e + .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 + .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 + .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 + .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 + .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 + .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c + .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 + .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc + .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 + .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 + .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa + .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 + .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 + .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f + .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 + .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 + .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 + .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 + .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 + .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 + .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 + .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 + .byte 
0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 + .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff + .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a + .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 + .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 + .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 + .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 + .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 + .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 + .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc + .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a +AES_Te4: + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 
0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +rcon: + .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 + .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 + .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 + .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 + .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + .align 128 +AES_Td: + .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 + .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 + .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 + .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 + .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 + .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 + .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 + .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f + .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 + .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 + .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 + .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 + .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 + .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda + .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 + .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 + .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 + .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd + .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 + .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 + .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 + .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 + .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 + .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 + .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 + .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 + .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 + .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a + .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 
+ .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 + .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 + .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c + .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 + .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 + .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 + .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a + .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 + .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 + .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa + .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 + .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d + .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 + .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 + .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff + .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 + .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 + .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 + .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb + .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 + .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 + .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 + .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e + .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 + .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 + .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 + .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a + .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f + .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e + .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 + .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 + .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 + .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d + .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad + .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 + .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c + .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd + .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc + .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 + .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc + .byte 0xb6,0xed,0xfc,0x68, 
0xb8,0xe4,0xf1,0x63 + .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 + .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 + .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 + .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d + .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 + .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 + .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 + .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 + .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a + .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef + .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 + .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 + .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 + .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 + .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d + .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 + .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 + .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 + .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c + .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 + .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 + .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b + .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 + .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 + .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e + .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 + .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce + .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 + .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 + .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 + .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 + .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 + .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 + .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f + .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d + .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf + .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b + .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f + .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d + .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e + .byte 
0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 + .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 + .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a + .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 + .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 + .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c + .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f + .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf + .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b + .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 + .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e + .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f + .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c + .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 + .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde + .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 + .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 + .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 +AES_Td4: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 
0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .cstring "AES for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl index 2ce6deffc8..76cf130e91 100644 --- a/crypto/aes/asm/aes-mips.pl +++ b/crypto/aes/asm/aes-mips.pl @@ -47,7 +47,7 @@ # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 @@ -70,7 +70,7 @@ $pf = ($flavour =~ /nubi/i) ? 
$t0 : $t2; # ###################################################################### -$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; +$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0; for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; @@ -89,7 +89,7 @@ $code.=<<___; # include #endif -#if !defined(__vxworks) || defined(__pic__) +#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) .option pic2 #endif .set noat diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl index 7c52cbe5f9..58a98232d1 100644 --- a/crypto/aes/asm/aes-ppc.pl +++ b/crypto/aes/asm/aes-ppc.pl @@ -548,7 +548,7 @@ Lenc_loop: xor $s2,$t2,$acc14 xor $s3,$t3,$acc15 addi $key,$key,16 - bdnz- Lenc_loop + bdnz Lenc_loop addi $Tbl2,$Tbl0,2048 nop @@ -982,7 +982,7 @@ Ldec_loop: xor $s2,$t2,$acc14 xor $s3,$t3,$acc15 addi $key,$key,16 - bdnz- Ldec_loop + bdnz Ldec_loop addi $Tbl2,$Tbl0,2048 nop diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl new file mode 100755 index 0000000000..7ef189d249 --- /dev/null +++ b/crypto/aes/asm/aesp8-ppc.pl @@ -0,0 +1,3726 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for AES instructions as per PowerISA +# specification version 2.07, first implemented by POWER8 processor. +# The module is endian-agnostic in sense that it supports both big- +# and little-endian cases. Data alignment in parallelizable modes is +# handled with VSX loads and stores, which implies MSR.VSX flag being +# set. 
It should also be noted that the ISA specification doesn't prohibit +# alignment exceptions for these instructions on page boundaries. +# Initially alignment was handled in pure AltiVec/VMX way [when data +# is aligned programmatically, which in turn guarantees exception- +# free execution], but it turned out to hamper performance when vcipher +# instructions are interleaved. It's reckoned that eventual +# misalignment penalties at page boundaries are on average lower +# than the additional overhead of the pure AltiVec approach. +# +# May 2016 +# +# Add XTS subroutine, 9x on little- and 12x improvement on big-endian +# systems were measured. +# +###################################################################### +# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS +# POWER8[le] 3.96/0.72 0.74 1.1 +# POWER8[be] 3.75/0.65 0.66 1.0 + +$flavour = shift; # first command-line argument; must match /64/ or /32/, otherwise the script dies below + +if ($flavour =~ /64/) { # 64-bit flavour: 8-byte $SIZE_T and doubleword mnemonics + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; # offset from $sp used when the generated code saves LR + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; + $UCMP ="cmpld"; + $SHL ="sldi"; +} elsif ($flavour =~ /32/) { # 32-bit flavour: 4-byte $SIZE_T and word mnemonics + $SIZE_T =4; + $LRSAVE =$SIZE_T; # offset from $sp used when the generated code saves LR + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; + $UCMP ="cmplw"; + $SHL ="slwi"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? 
$SIZE_T : 0; # $SIZE_T when the flavour name ends in "le", 0 otherwise + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!"; # parenthesised open with "or": the former "||" bound to the always-true command string, so a failed open was never reported + +$FRAME=8*$SIZE_T; +$prefix="aes_p8"; + +$sp="r1"; +$vrsave="r12"; + +######################################################################### +{{{ # Key setup procedures # +my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); +my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); +my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); + +$code.=<<___; +.machine "any" + +.text + +.align 7 +rcon: +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev +.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev +.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev +.long 0,0,0,0 ?asis +Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr $ptr #vvvvv "distance between . and rcon + addi $ptr,$ptr,-0x48 + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.asciz "AES for PowerISA 2.07, CRYPTOGAMS by " + +.globl .${prefix}_set_encrypt_key +.align 5 +.${prefix}_set_encrypt_key: +Lset_encrypt_key: + mflr r11 + $PUSH r11,$LRSAVE($sp) + + li $ptr,-1 + ${UCMP}i $inp,0 + beq- Lenc_key_abort # if ($inp==0) return -1; + ${UCMP}i $out,0 + beq- Lenc_key_abort # if ($out==0) return -1; + li $ptr,-2 + cmpwi $bits,128 + blt- Lenc_key_abort + cmpwi $bits,256 + bgt- Lenc_key_abort + andi. 
r0,$bits,0x3f + bne- Lenc_key_abort # abort unless $bits is a multiple of 64 + + lis r0,0xfff0 + mfspr $vrsave,256 # keep the entry value of SPR 256; restored at Ldone + mtspr 256,r0 + + bl Lconsts + mtlr r11 + + neg r9,$inp + lvx $in0,0,$inp + addi $inp,$inp,15 # 15 is not typo + lvsr $key,0,r9 # borrow $key + li r8,0x20 + cmpwi $bits,192 # result consumed by blt/beq below: <192 Loop128, ==192 L192, else L256 + lvx $in1,0,$inp + le?vspltisb $mask,0x0f # borrow $mask + lvx $rcon,0,$ptr + le?vxor $key,$key,$mask # adjust for byte swap + lvx $mask,r8,$ptr + addi $ptr,$ptr,0x10 + vperm $in0,$in0,$in1,$key # align [and byte swap in LE] + li $cnt,8 # Loop128 iteration count + vxor $zero,$zero,$zero + mtctr $cnt + + ?lvsr $outperm,0,$out + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$zero,$outmask,$outperm + + blt Loop128 + addi $inp,$inp,8 + beq L192 + addi $inp,$inp,8 + b L256 + +.align 4 +Loop128: + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon # double the round constant + vxor $in0,$in0,$key + bdnz Loop128 + + lvx $rcon,0,$ptr # last two round keys + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi 
$tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + + addi $inp,$out,15 # 15 is not typo + addi $out,$out,0x50 + + li $rounds,10 # round count, written to 0($out) at Ldone + b Ldone + +.align 4 +L192: + lvx $tmp,0,$inp + li $cnt,4 # Loop192 iteration count + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + vspltisb $key,8 # borrow $key + mtctr $cnt + vsububm $mask,$mask,$key # adjust the mask + +Loop192: + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vcipherlast $key,$key,$rcon + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + + vsldoi $stage,$zero,$in1,8 + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vsldoi $stage,$stage,$in0,8 + + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vsldoi $stage,$in0,$in1,8 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + stvx $stage,0,$out + addi $out,$out,16 + + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vperm $outtail,$in0,$in0,$outperm # rotate + 
vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdnz Loop192 + + li $rounds,12 # round count, written to 0($out) at Ldone + addi $out,$out,0x20 + b Ldone + +.align 4 +L256: + lvx $tmp,0,$inp + li $cnt,7 # Loop256 iteration count; the loop leaves through bdz in mid-iteration + li $rounds,14 # round count, written to 0($out) at Ldone + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + mtctr $cnt + +Loop256: + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in1,$in1,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdz Ldone + + vspltw $key,$in0,3 # just splat + vsldoi $tmp,$zero,$in1,12 # >>32 + vsbox $key,$key + + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + + vxor $in1,$in1,$key + b Loop256 + +.align 4 +Ldone: + lvx $in1,0,$inp # redundant in aligned case + vsel $in1,$outhead,$in1,$outmask + stvx $in1,0,$inp + li $ptr,0 # success: return 0 + mtspr 256,$vrsave # restore the SPR 256 value saved on entry + stw $rounds,0($out) # store the round count + +Lenc_key_abort: + mr r3,$ptr # return value + blr + .long 0 + .byte 0,12,0x14,1,0,0,3,0 + .long 0 +.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key + +.globl .${prefix}_set_decrypt_key +.align 5 +.${prefix}_set_decrypt_key: + $STU $sp,-$FRAME($sp) + mflr r10 + $PUSH r10,$FRAME+$LRSAVE($sp) + bl Lset_encrypt_key # reuse the encrypt-key setup code + mtlr r10 + + cmpwi r3,0 + bne- Ldec_key_abort # propagate a non-zero result from Lset_encrypt_key + + slwi $cnt,$rounds,4 + subi $inp,$out,240 
# first round key + srwi $rounds,$rounds,1 + add $out,$inp,$cnt # last round key + mtctr $rounds + +Ldeckey: + lwz r0, 0($inp) + lwz r6, 4($inp) + lwz r7, 8($inp) + lwz r8, 12($inp) + addi $inp,$inp,16 + lwz r9, 0($out) + lwz r10,4($out) + lwz r11,8($out) + lwz r12,12($out) + stw r0, 0($out) + stw r6, 4($out) + stw r7, 8($out) + stw r8, 12($out) + subi $out,$out,16 + stw r9, -16($inp) + stw r10,-12($inp) + stw r11,-8($inp) + stw r12,-4($inp) + bdnz Ldeckey + + xor r3,r3,r3 # return value +Ldec_key_abort: + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,0,3,0 + .long 0 +.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key +___ +}}} +######################################################################### +{{{ # Single block en- and decrypt procedures # +sub gen_block { # appends the .${prefix}_${dir}crypt routine to $code; takes one argument, "en" or "de" (the former empty prototype contradicted that argument) +my $dir = shift; +my $n = $dir eq "de" ? "n" : ""; # "n" turns v${n}cipher into vncipher for the decrypt variant +my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); + +$code.=<<___; +.globl .${prefix}_${dir}crypt +.align 5 +.${prefix}_${dir}crypt: + lwz $rounds,240($key) + lis r0,0xfc00 + mfspr $vrsave,256 + li $idx,15 # 15 is not typo + mtspr 256,r0 + + lvx v0,0,$inp + neg r11,$out + lvx v1,$idx,$inp + lvsl v2,0,$inp # inpperm + le?vspltisb v4,0x0f + ?lvsl v3,0,r11 # outperm + le?vxor v2,v2,v4 + li $idx,16 + vperm v0,v0,v1,v2 # align [and byte swap in LE] + lvx v1,0,$key + ?lvsl v5,0,$key # keyperm + srwi $rounds,$rounds,1 + lvx v2,$idx,$key + addi $idx,$idx,16 + subi $rounds,$rounds,1 + ?vperm v1,v1,v2,v5 # align round key + + vxor v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + mtctr $rounds + +Loop_${dir}c: + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + addi $idx,$idx,16 + ?vperm v1,v1,v2,v5 + v${n}cipher v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + bdnz Loop_${dir}c + + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + ?vperm v1,v1,v2,v5 + v${n}cipherlast v0,v0,v1 + + vspltisb v2,-1 + vxor v1,v1,v1 + li $idx,15 # 15 is not typo + ?vperm v2,v1,v2,v3 # outmask + le?vxor v3,v3,v4 + lvx v1,0,$out # 
outhead + vperm v0,v0,v0,v3 # rotate [and byte swap in LE] + vsel v1,v1,v0,v2 + lvx v4,$idx,$out + stvx v1,0,$out + vsel v0,v0,v4,v2 + stvx v0,$idx,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt +___ +} +gen_block("en"); +gen_block("de"); +}}} +######################################################################### +{{{ # CBC en- and decrypt procedures # +my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)= + map("v$_",(4..10)); +$code.=<<___; +.globl .${prefix}_cbc_encrypt +.align 5 +.${prefix}_cbc_encrypt: + ${UCMP}i $len,16 + bltlr- + + cmpwi $enc,0 # test direction + lis r0,0xffe0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + beq Lcbc_dec + +Lcbc_enc: + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $inout,$inout,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + vxor $inout,$inout,$ivec + +Loop_cbc_enc: + ?vperm 
$rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $ivec,$inout,$rndkey0 + ${UCMP}i $len,16 + + vperm $tmp,$ivec,$ivec,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_enc + + b Lcbc_done + +.align 4 +Lcbc_dec: + ${UCMP}i $len,128 + bge _aesp8_cbc_decrypt8x + vmr $tmp,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $tmp,$tmp,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$tmp,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + +Loop_cbc_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipherlast $inout,$inout,$rndkey0 + ${UCMP}i $len,16 + + vxor $inout,$inout,$ivec + vmr $ivec,$tmp + vperm $tmp,$inout,$inout,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_dec + +Lcbc_done: + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + neg $enc,$ivp # write [unaligned] iv + li $idx,15 # 15 is not typo + vxor $rndkey0,$rndkey0,$rndkey0 + vspltisb 
$outmask,-1 + le?vspltisb $tmp,0x0f + ?lvsl $outperm,0,$enc + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + lvx $outhead,0,$ivp + vperm $ivec,$ivec,$ivec,$outperm + vsel $inout,$outhead,$ivec,$outmask + lvx $inptail,$idx,$ivp + stvx $inout,0,$ivp + vsel $inout,$ivec,$inptail,$outmask + stvx $inout,$idx,$ivp + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CBC decrypt procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment + +$code.=<<___; +.align 5 +_aesp8_cbc_decrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi 
$rounds,$rounds,3 # -4 in total + subi $len,$len,128 # bias + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_cbc_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_cbc_dec_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + #lvx $inptail,0,$inp # "caller" already did this + #addi $inp,$inp,15 # 15 is not typo + subi $inp,$inp,15 # undo "caller" + + le?li $idx,8 + lvx_u $in0,$x00,$inp # load first 8 "words" + le?lvsl $inpperm,0,$idx + le?vspltisb $tmp,0x0f + lvx_u $in1,$x10,$inp + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + lvx_u $in2,$x20,$inp + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in3,$x30,$inp + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in4,$x40,$inp + le?vperm $in2,$in2,$in2,$inpperm + vxor $out0,$in0,$rndkey0 + lvx_u $in5,$x50,$inp + le?vperm $in3,$in3,$in3,$inpperm + vxor $out1,$in1,$rndkey0 + lvx_u $in6,$x60,$inp + le?vperm $in4,$in4,$in4,$inpperm + vxor $out2,$in2,$rndkey0 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + le?vperm $in5,$in5,$in5,$inpperm + vxor $out3,$in3,$rndkey0 + le?vperm $in6,$in6,$in6,$inpperm + vxor 
$out4,$in4,$rndkey0 + le?vperm $in7,$in7,$in7,$inpperm + vxor $out5,$in5,$rndkey0 + vxor $out6,$in6,$rndkey0 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + b Loop_cbc_dec8x +.align 5 +Loop_cbc_dec8x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x + + subic $len,$len,128 # $len-=128 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + and r0,r0,$len + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vncipher $out0,$out0,v30 + vxor $ivec,$ivec,v31 # xor with last round key + vncipher $out1,$out1,v30 + vxor $in0,$in0,v31 + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + vncipherlast $out0,$out0,$ivec + vncipherlast $out1,$out1,$in0 + lvx_u $in0,$x00,$inp # load next input block + vncipherlast $out2,$out2,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out3,$out3,$in2 
+ le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in2,$x20,$inp + vncipherlast $out4,$out4,$in3 + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in3,$x30,$inp + vncipherlast $out5,$out5,$in4 + le?vperm $in2,$in2,$in2,$inpperm + lvx_u $in4,$x40,$inp + vncipherlast $out6,$out6,$in5 + le?vperm $in3,$in3,$in3,$inpperm + lvx_u $in5,$x50,$inp + vncipherlast $out7,$out7,$in6 + le?vperm $in4,$in4,$in4,$inpperm + lvx_u $in6,$x60,$inp + vmr $ivec,$in7 + le?vperm $in5,$in5,$in5,$inpperm + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $in6,$in6,$in6,$inpperm + vxor $out0,$in0,$rndkey0 + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $in7,$in7,$in7,$inpperm + vxor $out1,$in1,$rndkey0 + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$rndkey0 + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$rndkey0 + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$rndkey0 + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + vxor $out5,$in5,$rndkey0 + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + vxor $out6,$in6,$rndkey0 + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + beq Loop_cbc_dec8x # did $len-=128 borrow? + + addic. $len,$len,128 + beq Lcbc_dec8x_done + nop + nop + +Loop_cbc_dec8x_tail: # up to 7 "words" tail... 
+ vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x_tail + + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + + vncipher $out1,$out1,v30 + vxor $ivec,$ivec,v31 # last round key + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + 
+ cmplwi $len,32 # switch($len) + blt Lcbc_dec8x_one + nop + beq Lcbc_dec8x_two + cmplwi $len,64 + blt Lcbc_dec8x_three + nop + beq Lcbc_dec8x_four + cmplwi $len,96 + blt Lcbc_dec8x_five + nop + beq Lcbc_dec8x_six + +Lcbc_dec8x_seven: + vncipherlast $out1,$out1,$ivec + vncipherlast $out2,$out2,$in1 + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out1,$out1,$out1,$inpperm + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x00,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x10,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x20,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x30,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x40,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x50,$out + stvx_u $out7,$x60,$out + addi $out,$out,0x70 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_six: + vncipherlast $out2,$out2,$ivec + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out2,$out2,$out2,$inpperm + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x00,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x10,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x20,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x30,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x40,$out + stvx_u $out7,$x50,$out + addi $out,$out,0x60 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_five: + vncipherlast $out3,$out3,$ivec + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out3,$out3,$out3,$inpperm + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x00,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x10,$out + 
le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x20,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x30,$out + stvx_u $out7,$x40,$out + addi $out,$out,0x50 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_four: + vncipherlast $out4,$out4,$ivec + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out4,$out4,$out4,$inpperm + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x00,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x10,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x20,$out + stvx_u $out7,$x30,$out + addi $out,$out,0x40 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_three: + vncipherlast $out5,$out5,$ivec + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out5,$out5,$out5,$inpperm + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x00,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x10,$out + stvx_u $out7,$x20,$out + addi $out,$out,0x30 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_two: + vncipherlast $out6,$out6,$ivec + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out6,$out6,$out6,$inpperm + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x00,$out + stvx_u $out7,$x10,$out + addi $out,$out,0x20 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_one: + vncipherlast $out7,$out7,$ivec + vmr $ivec,$in7 + + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out7,0,$out + addi $out,$out,0x10 + +Lcbc_dec8x_done: + le?vperm $ivec,$ivec,$ivec,$inpperm + stvx_u $ivec,0,$ivp # write [unaligned] iv + + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + 
	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}}	}}}
+
+#########################################################################
+{{{	# CTR procedure[s]						#
+# The ctr32_encrypt_blocks routine emitted below takes its arguments in
+# r3..r7 as (inp, out, len, key, ivp); r8..r10 hold locals.  len counts
+# 16-byte blocks (see "blocks--" in the loop) and the routine returns at
+# once when len is below 1.  Fewer than 8 blocks are processed one at a
+# time by Loop_ctr32_enc; 8 or more are handed to _aesp8_ctr32_encrypt8x.
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+	map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl	.${prefix}_ctr32_encrypt_blocks
+.align	5
+.${prefix}_ctr32_encrypt_blocks:
+	${UCMP}i	$len,1
+	bltlr-					# nothing to do for len below 1
+
+	lis		r0,0xfff0		# VRSAVE mask 0xfff00000: v0-v11
+	mfspr		$vrsave,256		# keep caller's VRSAVE (SPR 256)
+	mtspr		256,r0
+
+	li		$idx,15
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	le?vspltisb	$tmp,0x0f
+
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	vspltisb	$one,1
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$ivec,$ivec,$inptail,$inpperm
+	vsldoi		$one,$rndkey0,$one,1	# 0x00..0001: +1 in the last word
+
+	neg		r11,$inp
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
+	lwz		$rounds,240($key)
+
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	srwi		$rounds,$rounds,1	# the loops below do two rounds per pass
+	li		$idx,16
+	subi		$rounds,$rounds,1
+
+	${UCMP}i	$len,8
+	bge		_aesp8_ctr32_encrypt8x	# 8 or more blocks: take the 8x path
+
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+
+	lvx		$rndkey0,0,$key
+	mtctr		$rounds
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$ivec,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	b		Loop_ctr32_enc
+
+.align	5
+Loop_ctr32_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_ctr32_enc
+
+	vadduwm		$ivec,$ivec,$one	# bump the counter word for the next block
+	vmr		$dat,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	subic.		$len,$len,1		# blocks--
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	vperm		$dat,$dat,$inptail,$inpperm
+	li		$idx,16
+	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
+	lvx		$rndkey0,0,$key
+	vxor		$dat,$dat,$rndkey1	# last round key
+	vcipherlast	$inout,$inout,$dat
+
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inout,$outperm
+	vsel		$dat,$outhead,$inout,$outmask
+	mtctr		$rounds
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vmr		$outhead,$inout
+	vxor		$inout,$ivec,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	stvx		$dat,0,$out
+	addi		$out,$out,16
+	bne		Loop_ctr32_enc
+
+	addi		$out,$out,-1
+	lvx		$inout,0,$out		# redundant in aligned case
+	vsel		$inout,$outhead,$inout,$outmask
+	stvx		$inout,0,$out
+
+	mtspr		256,$vrsave		# put caller's VRSAVE back
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,6,0
+	.long		0
+___
+#########################################################################
+{{	# Optimized CTR procedure					#
+# _aesp8_ctr32_encrypt8x is reached by a branch from the routine above, with
+# that routine's register assignments still live, and keeps 8 blocks in
+# flight per pass of Loop_ctr32_enc8x.
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+	$x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align	5
+_aesp8_ctr32_encrypt8x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r0,-1			# VRSAVE value: every vector register live
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key	# load key schedule
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	lvx		v31,$x00,$key
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_ctr32_enc_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_ctr32_enc_key
+
+	lvx		v26,$x10,$key
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$out0,$x70,$key		# borrow $out0
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$out0,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	vadduwm		$two,$one,$one		# counter step of 2
+	subi		$inp,$inp,15		# undo "caller"
+	$SHL		$len,$len,4		# presumably blocks -> bytes (left shift by 4)
+
+	vadduwm		$out1,$ivec,$one	# counter values ...
+	vadduwm		$out2,$ivec,$two
+	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
+	le?li		$idx,8
+	vadduwm		$out3,$out1,$two
+	vxor		$out1,$out1,$rndkey0
+	le?lvsl		$inpperm,0,$idx
+	vadduwm		$out4,$out2,$two
+	vxor		$out2,$out2,$rndkey0
+	le?vspltisb	$tmp,0x0f
+	vadduwm		$out5,$out3,$two
+	vxor		$out3,$out3,$rndkey0
+	le?vxor		$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
+	vadduwm		$out6,$out4,$two
+	vxor		$out4,$out4,$rndkey0
+	vadduwm		$out7,$out5,$two
+	vxor		$out5,$out5,$rndkey0
+	vadduwm		$ivec,$out6,$two	# next counter value
+	vxor		$out6,$out6,$rndkey0
+	vxor		$out7,$out7,$rndkey0
+
+	mtctr		$rounds
+	b		Loop_ctr32_enc8x
+.align	5
+Loop_ctr32_enc8x:
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	vcipher		$out5,$out5,v24
+	vcipher		$out6,$out6,v24
+	vcipher		$out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+	vcipher		$out6,$out6,v25
+	vcipher		$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_ctr32_enc8x
+
+	subic		r11,$len,256		# $len-256, borrow $key_
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	vcipher		$out5,$out5,v24
+	vcipher		$out6,$out6,v24
+	vcipher		$out7,$out7,v24
+
+	subfe		r0,r0,r0		# borrow?-1:0
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+	vcipher		$out6,$out6,v25
+	vcipher		$out7,$out7,v25
+
+	and		r0,r0,r11
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v26
+	vcipher		$out1,$out1,v26
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	vcipher		$out4,$out4,v26
+	vcipher		$out5,$out5,v26
+	vcipher		$out6,$out6,v26
+	vcipher		$out7,$out7,v26
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	subic		$len,$len,129		# $len-=129
+	vcipher		$out0,$out0,v27
+	addi		$len,$len,1		# $len-=128 really
+	vcipher		$out1,$out1,v27
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	vcipher		$out4,$out4,v27
+	vcipher		$out5,$out5,v27
+	vcipher		$out6,$out6,v27
+	vcipher		$out7,$out7,v27
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+
+	vcipher		$out0,$out0,v28
+	lvx_u		$in0,$x00,$inp		# load input
+	vcipher		$out1,$out1,v28
+	lvx_u		$in1,$x10,$inp
+	vcipher		$out2,$out2,v28
+	lvx_u		$in2,$x20,$inp
+	vcipher		$out3,$out3,v28
+	lvx_u		$in3,$x30,$inp
+	vcipher		$out4,$out4,v28
+	lvx_u		$in4,$x40,$inp
+	vcipher		$out5,$out5,v28
+	lvx_u		$in5,$x50,$inp
+	vcipher		$out6,$out6,v28
+	lvx_u		$in6,$x60,$inp
+	vcipher		$out7,$out7,v28
+	lvx_u		$in7,$x70,$inp
+	addi		$inp,$inp,0x80
+
+	vcipher		$out0,$out0,v29
+	le?vperm	$in0,$in0,$in0,$inpperm
+	vcipher		$out1,$out1,v29
+	le?vperm	$in1,$in1,$in1,$inpperm
+	vcipher		$out2,$out2,v29
+	le?vperm	$in2,$in2,$in2,$inpperm
+	vcipher		$out3,$out3,v29
+	le?vperm	$in3,$in3,$in3,$inpperm
+	vcipher		$out4,$out4,v29
+	le?vperm	$in4,$in4,$in4,$inpperm
+	vcipher		$out5,$out5,v29
+	le?vperm	$in5,$in5,$in5,$inpperm
+	vcipher		$out6,$out6,v29
+	le?vperm	$in6,$in6,$in6,$inpperm
+	vcipher		$out7,$out7,v29
+	le?vperm	$in7,$in7,$in7,$inpperm
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in7 are loaded
+						# with last "words"
+	subfe.		r0,r0,r0		# borrow?-1:0
+	vcipher		$out0,$out0,v30
+	vxor		$in0,$in0,v31		# xor with last round key
+	vcipher		$out1,$out1,v30
+	vxor		$in1,$in1,v31
+	vcipher		$out2,$out2,v30
+	vxor		$in2,$in2,v31
+	vcipher		$out3,$out3,v30
+	vxor		$in3,$in3,v31
+	vcipher		$out4,$out4,v30
+	vxor		$in4,$in4,v31
+	vcipher		$out5,$out5,v30
+	vxor		$in5,$in5,v31
+	vcipher		$out6,$out6,v30
+	vxor		$in6,$in6,v31
+	vcipher		$out7,$out7,v30
+	vxor		$in7,$in7,v31
+
+	bne		Lctr32_enc8x_break	# did $len-129 borrow?
+
+	vcipherlast	$in0,$out0,$in0
+	vcipherlast	$in1,$out1,$in1
+	vadduwm		$out1,$ivec,$one	# counter values ...
+	vcipherlast	$in2,$out2,$in2
+	vadduwm		$out2,$ivec,$two
+	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
+	vcipherlast	$in3,$out3,$in3
+	vadduwm		$out3,$out1,$two
+	vxor		$out1,$out1,$rndkey0
+	vcipherlast	$in4,$out4,$in4
+	vadduwm		$out4,$out2,$two
+	vxor		$out2,$out2,$rndkey0
+	vcipherlast	$in5,$out5,$in5
+	vadduwm		$out5,$out3,$two
+	vxor		$out3,$out3,$rndkey0
+	vcipherlast	$in6,$out6,$in6
+	vadduwm		$out6,$out4,$two
+	vxor		$out4,$out4,$rndkey0
+	vcipherlast	$in7,$out7,$in7
+	vadduwm		$out7,$out5,$two
+	vxor		$out5,$out5,$rndkey0
+	le?vperm	$in0,$in0,$in0,$inpperm
+	vadduwm		$ivec,$out6,$two	# next counter value
+	vxor		$out6,$out6,$rndkey0
+	le?vperm	$in1,$in1,$in1,$inpperm
+	vxor		$out7,$out7,$rndkey0
+	mtctr		$rounds
+
+	vcipher		$out0,$out0,v24
+	stvx_u		$in0,$x00,$out
+	le?vperm	$in2,$in2,$in2,$inpperm
+	vcipher		$out1,$out1,v24
+	stvx_u		$in1,$x10,$out
+	le?vperm	$in3,$in3,$in3,$inpperm
+	vcipher		$out2,$out2,v24
+	stvx_u		$in2,$x20,$out
+	le?vperm	$in4,$in4,$in4,$inpperm
+	vcipher		$out3,$out3,v24
+	stvx_u		$in3,$x30,$out
+	le?vperm	$in5,$in5,$in5,$inpperm
+	vcipher		$out4,$out4,v24
+	stvx_u		$in4,$x40,$out
+	le?vperm	$in6,$in6,$in6,$inpperm
+	vcipher		$out5,$out5,v24
+	stvx_u		$in5,$x50,$out
+	le?vperm	$in7,$in7,$in7,$inpperm
+	vcipher		$out6,$out6,v24
+	stvx_u		$in6,$x60,$out
+	vcipher		$out7,$out7,v24
+	stvx_u		$in7,$x70,$out
+	addi		$out,$out,0x80
+
+	b		Loop_ctr32_enc8x_middle
+
+.align	5
+Lctr32_enc8x_break:
+	# Tail of the 8x CTR path.  $len now holds (remaining bytes - 128),
+	# i.e. -0x70 for one block up to 0x00 for eight, and the main loop has
+	# left the last N input blocks in the highest-numbered in-registers.
+	# Every case finishes its N blocks, stores them and must leave through
+	# Lctr32_enc8x_done.
+	cmpwi		$len,-0x60
+	blt		Lctr32_enc8x_one
+	nop
+	beq		Lctr32_enc8x_two
+	cmpwi		$len,-0x40
+	blt		Lctr32_enc8x_three
+	nop
+	beq		Lctr32_enc8x_four
+	cmpwi		$len,-0x20
+	blt		Lctr32_enc8x_five
+	nop
+	beq		Lctr32_enc8x_six
+	cmpwi		$len,0x00
+	blt		Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+	vcipherlast	$out0,$out0,$in0
+	vcipherlast	$out1,$out1,$in1
+	vcipherlast	$out2,$out2,$in2
+	vcipherlast	$out3,$out3,$in3
+	vcipherlast	$out4,$out4,$in4
+	vcipherlast	$out5,$out5,$in5
+	vcipherlast	$out6,$out6,$in6
+	vcipherlast	$out7,$out7,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x60,$out
+	stvx_u		$out7,$x70,$out
+	addi		$out,$out,0x80
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_seven:
+	vcipherlast	$out0,$out0,$in1
+	vcipherlast	$out1,$out1,$in2
+	vcipherlast	$out2,$out2,$in3
+	vcipherlast	$out3,$out3,$in4
+	vcipherlast	$out4,$out4,$in5
+	vcipherlast	$out5,$out5,$in6
+	vcipherlast	$out6,$out6,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	stvx_u		$out6,$x60,$out
+	addi		$out,$out,0x70
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_six:
+	vcipherlast	$out0,$out0,$in2
+	vcipherlast	$out1,$out1,$in3
+	vcipherlast	$out2,$out2,$in4
+	vcipherlast	$out3,$out3,$in5
+	vcipherlast	$out4,$out4,$in6
+	vcipherlast	$out5,$out5,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	stvx_u		$out5,$x50,$out
+	addi		$out,$out,0x60
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_five:
+	vcipherlast	$out0,$out0,$in3
+	vcipherlast	$out1,$out1,$in4
+	vcipherlast	$out2,$out2,$in5
+	vcipherlast	$out3,$out3,$in6
+	vcipherlast	$out4,$out4,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_four:
+	vcipherlast	$out0,$out0,$in4
+	vcipherlast	$out1,$out1,$in5
+	vcipherlast	$out2,$out2,$in6
+	vcipherlast	$out3,$out3,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_three:
+	vcipherlast	$out0,$out0,$in5
+	vcipherlast	$out1,$out1,$in6
+	vcipherlast	$out2,$out2,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	b		Lctr32_enc8x_done	# CTR exit: must not store to the iv buffer
+
+.align	5
+Lctr32_enc8x_two:
+	vcipherlast	$out0,$out0,$in6
+	vcipherlast	$out1,$out1,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	b		Lctr32_enc8x_done	# CTR exit: must not store to the iv buffer
+
+.align	5
+Lctr32_enc8x_one:
+	vcipherlast	$out0,$out0,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	stvx_u		$out0,0,$out
+	addi		$out,$out,0x10
+
+Lctr32_enc8x_done:
+	# Common exit: overwrite the on-stack round-key copies, then restore
+	# VRSAVE, v20-v31 and r26-r31 and release the frame.
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$inpperm,r10,$sp	# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}}	}}}
+
+#########################################################################
+{{{	# XTS procedures						#
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
+my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
+my $taillen = $key2;
+
+   ($inp,$idx) = ($idx,$inp);				# reassign
+
+$code.=<<___;
+# XTS encryption.  Register arguments r3..r8 are inp, out, len, key1, key2
+# and ivp, and inp is first copied from r3 to r10.  len is in bytes.  r3 is
+# returned as -1 when len is below 16 and as 0 on the exit through
+# Lxts_enc_done.  The initial tweak is the iv enciphered with key2, data
+# blocks use key1.  Inputs of 96 bytes or more go to _aesp8_xts_encrypt6x.
+.globl	.${prefix}_xts_encrypt
+.align	5
+.${prefix}_xts_encrypt:
+	mr		$inp,r3				# reassign
+	li		r3,-1				# return value for the early exit below
+	${UCMP}i	$len,16
+	bltlr-
+
+	lis		r0,0xfff0			# VRSAVE mask 0xfff00000: v0-v11
+	mfspr		r12,256				# save vrsave
+	li		r11,0
+	mtspr		256,r0
+
+	vspltisb	$seven,0x07			# 0x070707..07
+	le?lvsl		$leperm,r11,r11
+	le?vspltisb	$tmp,0x0f
+	le?vxor		$leperm,$leperm,$seven
+
+	li		$idx,15
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$tweak,$tweak,$inptail,$inpperm
+
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
+	lwz		$rounds,240($key2)
+	srwi		$rounds,$rounds,1		# the loops below do two rounds per pass
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	neg		r11,$inp
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
+	lvx		$inout,0,$inp
+	addi		$inp,$inp,15			# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	lvx		$rndkey0,0,$key2
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+# encipher the iv with key2 to obtain the initial tweak
+Ltweak_xts_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	bdnz		Ltweak_xts_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$tweak,$tweak,$rndkey0
+
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
+	lwz		$rounds,240($key1)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
+	vspltisb	$tmp,1				# 0x010101..01
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
+
+	${UCMP}i	$len,96
+	bge		_aesp8_xts_encrypt6x		# 96 bytes or more: take the 6x path
+
+	andi.		$taillen,$len,15		# bytes in a trailing partial block
+	subic		r0,$len,32
+	subi		$taillen,$taillen,16
+	subfe		r0,r0,r0			# -1 if len is below 32, else 0
+	and		r0,r0,$taillen
+	add		$inp,$inp,r0
+
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	mtctr		$rounds
+	b		Loop_xts_enc
+
+.align	5
+Loop_xts_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak
+	vcipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+	addi		$out,$out,16
+
+	subic.		$len,$len,16
+	beq		Lxts_enc_done
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+
+	subic		r0,$len,32
+	subfe		r0,r0,r0
+	and		r0,r0,$taillen
+	add		$inp,$inp,r0
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$output,$output,$rndkey0	# just in case $len<16
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	mtctr		$rounds
+	${UCMP}i	$len,16
+	bge		Loop_xts_enc
+
+# Fewer than 16 bytes remain: pad the partial input block with bytes of the
+# previous output block, move the leading bytes of that output block 16
+# bytes ahead, and run the loop once more over the previous block position.
+	vxor		$output,$output,$tweak
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
+	vspltisb	$tmp,-1
+	vperm		$inptail,$inptail,$tmp,$inpperm
+	vsel		$inout,$inout,$output,$inptail
+
+	subi		r11,$out,17
+	subi		$out,$out,16
+	mtctr		$len
+	li		$len,16
+Loop_xts_enc_steal:
+	lbzu		r0,1(r11)
+	stb		r0,16(r11)
+	bdnz		Loop_xts_enc_steal
+
+	mtctr		$rounds
+	b		Loop_xts_enc			# one more time...
+
+Lxts_enc_done:
+	mtspr		256,r12				# restore vrsave
+	li		r3,0
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
+
+# XTS decryption, with the same register arguments and return values as the
+# encrypt routine above.  The tweak is still produced with vcipher under
+# key2, data blocks are deciphered with vncipher under key1.  When len is
+# not a multiple of 16 the last full block and the partial block are
+# handled at Ltail_xts_dec with the two tweaks taken in swapped order.
+.globl	.${prefix}_xts_decrypt
+.align	5
+.${prefix}_xts_decrypt:
+	mr		$inp,r3				# reassign
+	li		r3,-1				# return value for the early exit below
+	${UCMP}i	$len,16
+	bltlr-
+
+	lis		r0,0xfff8			# VRSAVE mask 0xfff80000: v0-v12
+	mfspr		r12,256				# save vrsave
+	li		r11,0
+	mtspr		256,r0
+
+	andi.		r0,$len,15
+	neg		r0,r0
+	andi.		r0,r0,16			# 16 if len is not a multiple of 16, else 0
+	sub		$len,$len,r0
+
+	vspltisb	$seven,0x07			# 0x070707..07
+	le?lvsl		$leperm,r11,r11
+	le?vspltisb	$tmp,0x0f
+	le?vxor		$leperm,$leperm,$seven
+
+	li		$idx,15
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$tweak,$tweak,$inptail,$inpperm
+
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
+	lwz		$rounds,240($key2)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	neg		r11,$inp
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
+	lvx		$inout,0,$inp
+	addi		$inp,$inp,15			# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	lvx		$rndkey0,0,$key2
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Ltweak_xts_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	bdnz		Ltweak_xts_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$tweak,$tweak,$rndkey0
+
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
+	lwz		$rounds,240($key1)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
+	vspltisb	$tmp,1				# 0x010101..01
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
+
+	${UCMP}i	$len,96
+	bge		_aesp8_xts_decrypt6x		# 96 bytes or more: take the 6x path
+
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+	${UCMP}i	$len,16
+	blt		Ltail_xts_dec
+	be?b		Loop_xts_dec
+
+.align	5
+Loop_xts_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak
+	vncipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+	addi		$out,$out,16
+
+	subic.		$len,$len,16
+	beq		Lxts_dec_done
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	mtctr		$rounds
+	${UCMP}i	$len,16
+	bge		Loop_xts_dec
+
+# Fewer than 16 bytes remain after this block: decipher it with the tweak
+# that follows the current one (kept in tweak1), then handle the reassembled
+# final block with the current tweak.
+Ltail_xts_dec:
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak1,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak1,$tweak1,$tmp
+
+	subi		$inp,$inp,16
+	add		$inp,$inp,$len
+
+	vxor		$inout,$inout,$tweak		# :-(
+	vxor		$inout,$inout,$tweak1		# :-)
+
+Loop_xts_dec_short:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_dec_short
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak1
+	vncipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	#addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
+	vspltisb	$tmp,-1
+	vperm		$inptail,$inptail,$tmp,$inpperm
+	vsel		$inout,$inout,$output,$inptail
+
+	vxor		$rndkey0,$rndkey0,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	subi		r11,$out,1
+	mtctr		$len
+	li		$len,16
+Loop_xts_dec_steal:
+	lbzu		r0,1(r11)
+	stb		r0,16(r11)
+	bdnz		Loop_xts_dec_steal
+
+	mtctr		$rounds
+	b		Loop_xts_dec			# one more time...
+
+Lxts_dec_done:
+	mtspr		256,r12				# restore vrsave
+	li		r3,0
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
+___
+#########################################################################
+{{	# Optimized XTS procedures					#
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+    $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
+my $taillen=$x70;
+
+$code.=<<___;
+.align	5
+_aesp8_xts_encrypt6x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		r0
+	li		r7,`$FRAME+8*16+15`
+	li		r8,`$FRAME+8*16+31`
+	$PUSH		r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	stvx		v20,r7,$sp		# 
ABI says so + addi r7,r7,32 + stvx v21,r8,$sp + addi r8,r8,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r8,$sp + addi r8,r8,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r8,$sp + addi r8,r8,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r8,$sp + addi r8,r8,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r8,$sp + addi r8,r8,32 + stvx v30,r7,$sp + stvx v31,r8,$sp + mr r7,r0 + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_enc_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi 
$inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_enc6x + +.align 5 +Loop_xts_enc6x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher 
$out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + + subfe. r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out0,$out0,v27 + vcipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher 
$out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vcipher $out4,$out4,v29 + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vcipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$tmp,$tmp,$leperm + stvx_u $out4,$x40,$out + 
vxor $out4,$in4,$twk4 + le?stvx_u $out5,$x50,$out + be?stvx_u $tmp, $x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 + blt Lxts_enc6x_one + nop + beq Lxts_enc6x_two + cmpwi $len,0x40 + blt Lxts_enc6x_three + nop + beq Lxts_enc6x_four + +Lxts_enc6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $tmp,$out4,$twk5 # last block prep for stealing + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $tmp,$out3,$twk4 # last block prep for stealing + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $tmp,$out2,$twk3 # last block prep for stealing + le?vperm 
$out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vxor $tmp,$out1,$twk2 # last block prep for stealing + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_enc1x: + vcipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc1x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + + lvsr $inpperm,0,$taillen + vcipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vcipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vcipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vcipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vcipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vxor $tmp,$out0,$twk1 # last block prep for stealing + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_zero: + cmpwi $taillen,0 + beq Lxts_enc6x_done + + add $inp,$inp,$taillen + subi $inp,$inp,16 + lvx_u $in0,0,$inp + lvsr $inpperm,0,$taillen # $in5 is no more + le?vperm $in0,$in0,$in0,$leperm + vperm $in0,$in0,$in0,$inpperm + vxor $tmp,$tmp,$twk0 +Lxts_enc6x_steal: + vxor $in0,$in0,$twk0 + vxor $out0,$out0,$out0 + vspltisb 
$out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? + + subi r3,$out,17 + subi $out,$out,16 + mtctr $taillen +Loop_xts_enc6x_steal: + lbzu r0,1(r3) + stb r0,16(r3) + bdnz Loop_xts_enc6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_enc1x # one more time... + +.align 4 +Lxts_enc6x_done: + mtlr r7 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_enc5x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_enc5x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 
+ vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + vcipher $out0,$out0,v26 + lvsr $inpperm,r0,$taillen # $in5 is no more + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vcipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vcipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vcipher $out1,$out1,v29 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vcipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vcipher $out0,$out0,v30 + vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v30 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vcipher $out4,$out4,v30 + + vcipherlast $out0,$out0,$twk0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r0 + li r7,`$FRAME+8*16+15` + li r8,`$FRAME+8*16+31` + $PUSH r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r8,$sp + addi r8,r8,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r8,$sp + addi r8,r8,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r8,$sp + addi r8,r8,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r8,$sp + addi r8,r8,32 + stvx v28,r7,$sp + addi r7,r7,32 
+ stvx v29,r8,$sp + addi r8,r8,32 + stvx v30,r7,$sp + stvx v31,r8,$sp + mr r7,r0 + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,$FRAME+15 + mtctr $rounds + +Load_xts_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_dec_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,$FRAME+15 # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab 
$tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_dec6x + +.align 5 +Loop_xts_dec6x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher 
$out0,$out0,v24 + vncipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + + subfe. r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vxor $tweak,$tweak,$tmp + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor 
$in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vncipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$out5,$out5,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + stvx_u $out5,$x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_dec6x # did $len-=96 borrow? + + addic. 
$len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 + blt Lxts_dec6x_one + nop + beq Lxts_dec6x_two + cmpwi $len,0x40 + blt Lxts_dec6x_three + nop + beq Lxts_dec6x_four + +Lxts_dec6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + vxor $twk1,$tweak,$rndkey0 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk1 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + vmr $twk1,$twk5 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk5 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + vmr $twk1,$twk4 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk4 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor 
$out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vmr $twk1,$twk3 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk3 + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_dec1x: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec1x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + + andi. r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + mtctr $rounds + vncipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vmr $twk1,$twk2 + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + vxor $out0,$in0,$twk2 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_zero: + cmpwi $taillen,0 + beq Lxts_dec6x_done + + lvx_u $in0,0,$inp + le?vperm $in0,$in0,$in0,$leperm + vxor $out0,$in0,$twk1 +Lxts_dec6x_steal: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Lxts_dec6x_steal + + add $inp,$inp,$taillen + vncipher $out0,$out0,v24 + + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v26 + + lvsr $inpperm,0,$taillen # $in5 is no more + vncipher $out0,$out0,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher 
$out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk1,$twk1,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vncipherlast $tmp,$out0,$twk1 + + le?vperm $out0,$tmp,$tmp,$leperm + le?stvx_u $out0,0,$out + be?stvx_u $tmp,0,$out + + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 + vxor $out0,$out0,$twk0 + + subi r3,$out,1 + mtctr $taillen +Loop_xts_dec6x_steal: + lbzu r0,1(r3) + stb r0,16(r3) + bdnz Loop_xts_dec6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_dec1x # one more time... + +.align 4 +Lxts_dec6x_done: + mtlr r7 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_dec5x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher 
$out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_dec5x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + + andi. r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vncipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,$FRAME+15 # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vncipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vncipher $out4,$out4,v30 + + vncipherlast $out0,$out0,$twk0 + vncipherlast $out1,$out1,$in1 + vncipherlast $out2,$out2,$in2 + vncipherlast $out3,$out3,$in3 + vncipherlast $out4,$out4,$in4 + mtctr $rounds + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +___ +}} }}} + +my $consts=1; # true until the "Lconsts:" label has been seen +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; # replace each `expr` with its evaluated value + + # constants table
endian-specific conversion: rows tagged with a trailing ?xxx marker + if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { + my $conv=$3; + my @bytes=(); + + # convert to endian-agnostic format (.long is split most significant byte first) + if ($1 eq "long") { + foreach (split(/,\s*/,$2)) { + my $l = /^0/?oct:int; + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; + } + } else { + @bytes = map(/^0/?oct:int,split(/,\s*/,$2)); + } + + # little-endian conversion: ?inv flips the low nibble of every byte, ?rev reverses byte order + if ($flavour =~ /le$/o) { + SWITCH: for($conv) { + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; + /\?rev/ && do { @bytes=reverse(@bytes); last; }; + } + } + + # emit the row as plain .byte directives and skip the generic handling below + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; + next; + } + $consts=0 if (m/Lconsts:/o); # end of table + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o or + s/\?lvsr/lvsl/o or + s/\?lvsl/lvsr/o or + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; + } else { # big-endian + s/le\?/#le#/o or + s/be\?//o or + s/\?([a-z]+)/$1/o; + } + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; # fail loudly rather than leave truncated output diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl new file mode 100644 index 0000000000..104f417c85 --- /dev/null +++ b/crypto/aes/asm/aesv8-armx.pl @@ -0,0 +1,968 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. 
The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for ARMv8 AES instructions.
The +# module is endian-agnostic in sense that it supports both big- and +# little-endian cases. As does it support both 32- and 64-bit modes +# of operation. Latter is achieved by limiting amount of utilized +# registers to 16, which implies additional NEON load and integer +# instructions. This has no effect on mighty Apple A7, where results +# are literally equal to the theoretical estimates based on AES +# instruction latencies and issue rates. On Cortex-A53, an in-order +# execution core, this costs up to 10-15%, which is partially +# compensated by implementing dedicated code path for 128-bit +# CBC encrypt case. On Cortex-A57 parallelizable mode performance +# seems to be limited by sheer amount of NEON instructions... +# +# Performance in cycles per byte processed with 128-bit key: +# +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A53 2.45 1.87 1.94 +# Cortex-A57 3.64 1.34 1.32 + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # all output is piped through arm-xlate.pl; fail early if the pipe cannot be started +*STDOUT=*OUT; + +$prefix="aes_v8"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to +# maintain both 32- and 64-bit codes within single module and +# transliterate common code to either flavour with regex voodoo. +# +{{{ +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= + $flavour=~/64/?
map("q$_",(0..6)) : map("q$_",(0..3,8..10)); + + +$code.=<<___; +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: +.Lenc_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___; + mov $ptr,#-1 + cmp $inp,#0 + b.eq .Lenc_key_abort + cmp $out,#0 + b.eq .Lenc_key_abort + mov $ptr,#-2 + cmp $bits,#128 + b.lt .Lenc_key_abort + cmp $bits,#256 + b.gt .Lenc_key_abort + tst $bits,#0x3f + b.ne .Lenc_key_abort + + adr $ptr,.Lrcon + cmp $bits,#192 + + veor $zero,$zero,$zero + vld1.8 {$in0},[$inp],#16 + mov $bits,#8 // reuse $bits + vld1.32 {$rcon,$mask},[$ptr],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + b.ne .Loop128 + + vld1.32 {$rcon},[$ptr] + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + + vtbl.8 $key,{$in0},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in0},[$out],#16 + aese $key,$zero + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + veor $in0,$in0,$key + vst1.32 {$in0},[$out] + add $out,$out,#0x50 + + mov $rounds,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {$in1},[$inp],#8 + vmov.i8 $key,#8 // borrow $key + vst1.32 
{$in0},[$out],#16 + vsub.i8 $mask,$mask,$key // adjust the mask + +.Loop192: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#8 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + + vdup.32 $tmp,${in0}[3] + veor $tmp,$tmp,$in1 + veor $key,$key,$rcon + vext.8 $in1,$zero,$in1,#12 + vshl.u8 $rcon,$rcon,#1 + veor $in1,$in1,$tmp + veor $in0,$in0,$key + veor $in1,$in1,$key + vst1.32 {$in0},[$out],#16 + b.ne .Loop192 + + mov $rounds,#12 + add $out,$out,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 {$in1},[$inp] + mov $bits,#7 + mov $rounds,#14 + vst1.32 {$in0},[$out],#16 + +.Loop256: + vtbl.8 $key,{$in1},$mask + vext.8 $tmp,$zero,$in0,#12 + vst1.32 {$in1},[$out],#16 + aese $key,$zero + subs $bits,$bits,#1 + + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in0,$in0,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $key,$key,$rcon + veor $in0,$in0,$tmp + vshl.u8 $rcon,$rcon,#1 + veor $in0,$in0,$key + vst1.32 {$in0},[$out],#16 + b.eq .Ldone + + vdup.32 $key,${in0}[3] // just splat + vext.8 $tmp,$zero,$in1,#12 + aese $key,$zero + + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + vext.8 $tmp,$zero,$tmp,#12 + veor $in1,$in1,$tmp + + veor $in1,$in1,$key + b .Loop256 + +.Ldone: + str $rounds,[$out] + mov $ptr,#0 + +.Lenc_key_abort: + mov x0,$ptr // return value + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key + +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + stmdb sp!,{r4,lr} +___ +$code.=<<___; + bl .Lenc_key + + cmp x0,#0 + b.ne .Ldec_key_abort + + sub $out,$out,#240 // restore original $out + mov x4,#-16 + add $inp,$out,x12,lsl#4 // end of key schedule + + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + +.Loop_imc: + vld1.32 {v0.16b},[$out] + vld1.32 {v1.16b},[$inp] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + vst1.32 {v0.16b},[$inp],x4 + vst1.32 {v1.16b},[$out],#16 + cmp $inp,$out + b.hi .Loop_imc + + vld1.32 {v0.16b},[$out] + aesimc v0.16b,v0.16b + vst1.32 {v0.16b},[$inp] + + eor x0,x0,x0 // return value +.Ldec_key_abort: +___ +$code.=<<___ if ($flavour !~ /64/); + ldmia sp!,{r4,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldp x29,x30,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} +{{{ +sub gen_block () { +my $dir = shift; +my ($e,$mc) = $dir eq "en" ? 
("e","mc") : ("d","imc"); +my ($inp,$out,$key)=map("x$_",(0..2)); +my $rounds="w3"; +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ldr $rounds,[$key,#240] + vld1.32 {$rndkey0},[$key],#16 + vld1.8 {$inout},[$inp] + sub $rounds,$rounds,#2 + vld1.32 {$rndkey1},[$key],#16 + +.Loop_${dir}c: + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key],#16 + aes$mc $inout,$inout + subs $rounds,$rounds,#2 + aes$e $inout,$rndkey1 + vld1.32 {$rndkey1},[$key],#16 + aes$mc $inout,$inout + b.gt .Loop_${dir}c + + aes$e $inout,$rndkey0 + vld1.32 {$rndkey0},[$key] + aes$mc $inout,$inout + aes$e $inout,$rndkey1 + veor $inout,$inout,$rndkey0 + + vst1.8 {$inout},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r8,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldmia ip,{r4-r5} @ load remaining args +___ +$code.=<<___; + subs $len,$len,#16 + mov $step,#16 + b.lo .Lcbc_abort + cclr $step,eq + + cmp $enc,#0 // en- or decrypting? + ldr $rounds,[$key,#240] + and $len,$len,#-16 + vld1.8 {$ivec},[$ivp] + vld1.8 {$dat},[$inp],$step + + vld1.32 {q8-q9},[$key] // load key schedule... 
+ sub $rounds,$rounds,#6 + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys + sub $rounds,$rounds,#2 + vld1.32 {q10-q11},[$key_],#32 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + + add $key_,$key,#32 + mov $cnt,$rounds + b.eq .Lcbc_dec + + cmp $rounds,#2 + veor $dat,$dat,$ivec + veor $rndzero_n_last,q8,$rndlast + b.eq .Lcbc_enc128 + +.Loop_cbc_enc: + aese $dat,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat,$dat + subs $cnt,$cnt,#2 + aese $dat,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat,$dat + b.gt .Loop_cbc_enc + + aese $dat,q8 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,q9 + aesmc $dat,$dat + cclr $step,eq + aese $dat,q10 + aesmc $dat,$dat + add $key_,$key,#16 + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q13 + aesmc $dat,$dat + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aese $dat,q14 + aesmc $dat,$dat + aese $dat,q15 + + mov $cnt,$rounds + veor $ivec,$dat,$rndlast + vst1.8 {$ivec},[$out],#16 + b.hs .Loop_cbc_enc + + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {$in0-$in1},[$key_] + aese $dat,q8 + aesmc $dat,$dat + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese $dat,q8 + aesmc $dat,$dat + vst1.8 {$ivec},[$out],#16 +.Lenter_cbc_enc128: + aese $dat,q9 + aesmc $dat,$dat + subs $len,$len,#16 + aese $dat,$in0 + aesmc $dat,$dat + cclr $step,eq + aese $dat,$in1 + aesmc $dat,$dat + aese $dat,q10 + aesmc $dat,$dat + aese $dat,q11 + aesmc $dat,$dat + vld1.8 {q8},[$inp],$step + aese $dat,q12 + aesmc $dat,$dat + aese $dat,q13 + aesmc $dat,$dat + aese $dat,q14 + aesmc $dat,$dat + veor q8,q8,$rndzero_n_last + aese $dat,q15 + veor $ivec,$dat,$rndlast + b.hs .Loop_cbc_enc128 + + vst1.8 {$ivec},[$out],#16 + b .Lcbc_done +___ +{ +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); +$code.=<<___; +.align 5 +.Lcbc_dec: + vld1.8 {$dat2},[$inp],#16 + subs $len,$len,#32 // bias + add $cnt,$rounds,#2 + vorr $in1,$dat,$dat + vorr 
$dat1,$dat,$dat + vorr $in2,$dat2,$dat2 + b.lo .Lcbc_dec_tail + + vorr $dat1,$dat2,$dat2 + vld1.8 {$dat2},[$inp],#16 + vorr $in0,$dat,$dat + vorr $in1,$dat1,$dat1 + vorr $in2,$dat2,$dat2 + +.Loop3x_cbc_dec: + aesd $dat0,q8 + aesd $dat1,q8 + aesd $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + subs $cnt,$cnt,#2 + aesd $dat0,q9 + aesd $dat1,q9 + aesd $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + b.gt .Loop3x_cbc_dec + + aesd $dat0,q8 + aesd $dat1,q8 + aesd $dat2,q8 + veor $tmp0,$ivec,$rndlast + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp1,$in0,$rndlast + aesd $dat0,q9 + aesd $dat1,q9 + aesd $dat2,q9 + veor $tmp2,$in1,$rndlast + subs $len,$len,#0x30 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + vorr $ivec,$in2,$in2 + mov.lo x6,$len // x6, $cnt, is zero at this point + aesd $dat0,q12 + aesd $dat1,q12 + aesd $dat2,q12 + add $inp,$inp,x6 // $inp is adjusted in such way that + // at exit from the loop $dat1-$dat2 + // are loaded with last "words" + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + mov $key_,$key + aesd $dat0,q13 + aesd $dat1,q13 + aesd $dat2,q13 + vld1.8 {$in0},[$inp],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + vld1.8 {$in1},[$inp],#16 + aesd $dat0,q14 + aesd $dat1,q14 + aesd $dat2,q14 + vld1.8 {$in2},[$inp],#16 + aesimc $dat0,$dat0 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesd $dat0,q15 + aesd $dat1,q15 + aesd $dat2,q15 + + add $cnt,$rounds,#2 + veor $tmp0,$tmp0,$dat0 + veor $tmp1,$tmp1,$dat1 + veor $dat2,$dat2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vorr $dat0,$in0,$in0 + vst1.8 {$tmp0},[$out],#16 + vorr $dat1,$in1,$in1 + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$dat2},[$out],#16 + vorr $dat2,$in2,$in2 + b.hs .Loop3x_cbc_dec + + cmn $len,#0x30 + b.eq .Lcbc_done + nop + +.Lcbc_dec_tail: + aesd 
$dat1,q8 + aesd $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + subs $cnt,$cnt,#2 + aesd $dat1,q9 + aesd $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + b.gt .Lcbc_dec_tail + + aesd $dat1,q8 + aesd $dat2,q8 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + aesd $dat1,q9 + aesd $dat2,q9 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + aesd $dat1,q12 + aesd $dat2,q12 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + cmn $len,#0x20 + aesd $dat1,q13 + aesd $dat2,q13 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp1,$ivec,$rndlast + aesd $dat1,q14 + aesd $dat2,q14 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp2,$in1,$rndlast + aesd $dat1,q15 + aesd $dat2,q15 + b.eq .Lcbc_dec_one + veor $tmp1,$tmp1,$dat1 + veor $tmp2,$tmp2,$dat2 + vorr $ivec,$in2,$in2 + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$tmp2},[$out],#16 + b .Lcbc_done + +.Lcbc_dec_one: + veor $tmp1,$tmp1,$dat2 + vorr $ivec,$in2,$in2 + vst1.8 {$tmp1},[$out],#16 + +.Lcbc_done: + vst1.8 {$ivec},[$ivp] +.Lcbc_abort: +___ +} +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r8,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_)=("w5","w6","x7"); +my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); +my $step="x12"; # aliases with $tctr2 + +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg +___ +$code.=<<___; + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] + vld1.32 {$dat0},[$ivp] + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#4 + mov $step,#16 + cmp $len,#2 + add $key_,$key,x5,lsl#4 // pointer to last 5 round keys + sub $rounds,$rounds,#2 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + add $key_,$key,#32 + mov $cnt,$rounds + cclr $step,lo +#ifndef __ARMEB__ + rev $ctr, $ctr +#endif + vorr $dat1,$dat0,$dat0 + add $tctr1, $ctr, #1 + vorr $dat2,$dat0,$dat0 + add $ctr, $ctr, #2 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${dat1}[3],$tctr1 + b.ls .Lctr32_tail + rev $tctr2, $ctr + sub $len,$len,#3 // bias + vmov.32 ${dat2}[3],$tctr2 + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese $dat0,q8 + aese $dat1,q8 + aese $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aesmc $dat2,$dat2 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + aese $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aesmc $dat2,$dat2 + b.gt .Loop3x_ctr32 + + aese $dat0,q8 + aese $dat1,q8 + aese $dat2,q8 + mov $key_,$key + aesmc $tmp0,$dat0 + vld1.8 {$in0},[$inp],#16 + aesmc $tmp1,$dat1 + aesmc $dat2,$dat2 + vorr $dat0,$ivec,$ivec + aese $tmp0,q9 + vld1.8 {$in1},[$inp],#16 + aese $tmp1,q9 + aese $dat2,q9 + vorr $dat1,$ivec,$ivec + aesmc $tmp0,$tmp0 + vld1.8 {$in2},[$inp],#16 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$dat2 + vorr $dat2,$ivec,$ivec + add $tctr0,$ctr,#1 + aese $tmp0,q12 + aese $tmp1,q12 + aese $tmp2,q12 + veor $in0,$in0,$rndlast + add $tctr1,$ctr,#2 + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + veor $in1,$in1,$rndlast + add $ctr,$ctr,#3 + aese $tmp0,q13 + aese $tmp1,q13 + aese $tmp2,q13 + veor $in2,$in2,$rndlast + rev $tctr0,$tctr0 + aesmc 
$tmp0,$tmp0 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + vmov.32 ${dat0}[3], $tctr0 + rev $tctr1,$tctr1 + aese $tmp0,q14 + aese $tmp1,q14 + aese $tmp2,q14 + vmov.32 ${dat1}[3], $tctr1 + rev $tctr2,$ctr + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + vmov.32 ${dat2}[3], $tctr2 + subs $len,$len,#3 + aese $tmp0,q15 + aese $tmp1,q15 + aese $tmp2,q15 + + mov $cnt,$rounds + veor $in0,$in0,$tmp0 + veor $in1,$in1,$tmp1 + veor $in2,$in2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$in0},[$out],#16 + vst1.8 {$in1},[$out],#16 + vst1.8 {$in2},[$out],#16 + b.hs .Loop3x_ctr32 + + adds $len,$len,#3 + b.eq .Lctr32_done + cmp $len,#1 + mov $step,#16 + cclr $step,eq + +.Lctr32_tail: + aese $dat0,q8 + aese $dat1,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + b.gt .Lctr32_tail + + aese $dat0,q8 + aese $dat1,q8 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q9 + aese $dat1,q9 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + vld1.8 {$in0},[$inp],$step + aese $dat0,q12 + aese $dat1,q12 + vld1.8 {$in1},[$inp] + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q13 + aese $dat1,q13 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q14 + aese $dat1,q14 + veor $in0,$in0,$rndlast + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + veor $in1,$in1,$rndlast + aese $dat0,q15 + aese $dat1,q15 + + cmp $len,#1 + veor $in0,$in0,$dat0 + veor $in1,$in1,$dat1 + vst1.8 {$in0},[$out],#16 + b.eq .Lctr32_done + vst1.8 {$in1},[$out] + +.Lctr32_done: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ +######################################## +if ($flavour =~ 
/64/) { ######## 64-bit code + my %opcode = ( + "aesd" => 0x4e285800, "aese" => 0x4e284800, + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5), + $mnemonic,$arg; + }; + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vext\.8/ext/o or + s/vrev32\.8/rev32/o or + s/vtst\.8/cmtst/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + # fix up remaining legacy suffixes + s/\.[ui]?8//o; + m/\],#8/o and s/\.16b/\.8b/go; + s/\.[ui]?32//o and s/\.16b/\.4s/go; + s/\.[ui]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + my %opcode = ( + "aesd" => 0xf3b00340, "aese" => 0xf3b00300, + "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); + + local *unaes = sub { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<1) |(($2&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + }; + + sub unvtbl { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && + sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; + } + + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + + sub unvmov32 { + my $arg=shift; + + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; + } + + foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remaining new-style suffixes + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or + s/\],#[0-9]+/]!/o; + + s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vtbl\.8\s+(.*)/unvtbl($1)/geo or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/vmov\.32\s+(.*)/unvmov32($1)/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)mov\./$1mov/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT or die "error closing STDOUT: $!"; # STDOUT is the arm-xlate.pl pipe; a failed close means the translator did not finish cleanly diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl new file mode 100644 index 0000000000..bfec664198 --- /dev/null +++ b/crypto/arm64cpuid.pl @@ -0,0 +1,68 @@ +#!/usr/bin/env perl + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$code.=<<___; +#include "arm_arch.h" + +.text +.arch armv8-a+crypto + +.align 5 +.globl _armv7_neon_probe +.type _armv7_neon_probe,%function +_armv7_neon_probe: + orr v15.16b, v15.16b, v15.16b + ret +.size _armv7_neon_probe,.-_armv7_neon_probe + +.globl _armv7_tick +.type _armv7_tick,%function +_armv7_tick: +#ifdef __APPLE__ + mrs x0, CNTPCT_EL0 +#else + mrs x0, CNTVCT_EL0 +#endif + ret +.size _armv7_tick,.-_armv7_tick + +.globl _armv8_aes_probe +.type _armv8_aes_probe,%function
+_armv8_aes_probe: + aese v0.16b, v0.16b + ret +.size _armv8_aes_probe,.-_armv8_aes_probe + +.globl _armv8_sha1_probe +.type _armv8_sha1_probe,%function +_armv8_sha1_probe: + sha1h s0, s0 + ret +.size _armv8_sha1_probe,.-_armv8_sha1_probe + +.globl _armv8_sha256_probe +.type _armv8_sha256_probe,%function +_armv8_sha256_probe: + sha256su0 v0.4s, v0.4s + ret +.size _armv8_sha256_probe,.-_armv8_sha256_probe +.globl _armv8_pmull_probe +.type _armv8_pmull_probe,%function +_armv8_pmull_probe: + pmull v0.1q, v0.1d, v0.1d + ret +.size _armv8_pmull_probe,.-_armv8_pmull_probe +___ + +print $code; +close STDOUT; diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index a50c366976..7a377758eb 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -10,13 +10,22 @@ # define __ARMEL__ # endif # elif defined(__GNUC__) +# if defined(__aarch64__) +# define __ARM_ARCH__ 8 +# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ +# define __ARMEB__ +# else +# define __ARMEL__ +# endif /* * Why doesn't gcc define __ARM_ARCH__? Instead it defines * bunch of below macros. See all_architectires[] table in * gcc/config/arm/arm.c. On a side note it defines * __ARMEL__/__ARMEB__ for little-/big-endian. 
*/ -# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ +# elif defined(__ARM_ARCH_8A__) +# define __ARM_ARCH__ 8 +# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ defined(__ARM_ARCH_7EM__) # define __ARM_ARCH__ 7 @@ -42,10 +51,14 @@ #if !__ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; +#endif #define ARMV7_NEON (1<<0) #define ARMV7_TICK (1<<1) -#endif +#define ARMV8_AES (1<<2) +#define ARMV8_SHA1 (1<<3) +#define ARMV8_SHA256 (1<<4) +#define ARMV8_PMULL (1<<5) #endif #endif diff --git a/crypto/armcap.c b/crypto/armcap.c index 8dbd741087..2579389ffd 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -20,6 +20,10 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } */ void _armv7_neon_probe(void); unsigned int _armv7_tick(void); +void _armv8_aes_probe(void); +void _armv8_sha1_probe(void); +void _armv8_sha256_probe(void); +void _armv8_pmull_probe(void); unsigned int OPENSSL_rdtsc(void) { @@ -30,7 +34,7 @@ unsigned int OPENSSL_rdtsc(void) } #if defined(__GNUC__) && __GNUC__>=2 -void OPENSSL_cpuid_setup(void) __attribute__((constructor)) +void OPENSSL_cpuid_setup(void) __attribute__((constructor)); #endif void OPENSSL_cpuid_setup(void) { @@ -68,6 +72,28 @@ void OPENSSL_cpuid_setup(void) { _armv7_neon_probe(); OPENSSL_armcap_P |= ARMV7_NEON; +#ifdef __aarch64__ + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_pmull_probe(); + OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES; + } + else if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_aes_probe(); + OPENSSL_armcap_P |= ARMV8_AES; + } + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_sha1_probe(); + OPENSSL_armcap_P |= ARMV8_SHA1; + } + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv8_sha256_probe(); + OPENSSL_armcap_P |= ARMV8_SHA256; + } +#endif } if (sigsetjmp(ill_jmp,1) == 0) { diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S index c9102ca2a5..2d618deaa4 100644 --- a/crypto/armv4cpuid.S +++ b/crypto/armv4cpuid.S @@ -44,7 +44,7 @@ 
OPENSSL_atomic_add: bne .Lspin ldr r2,[r4] - add r2,r5 + add r2,r2,r5 str r2,[r4] str r0,[r6] @ release spinlock ldmia sp!,{r4-r6,lr} @@ -59,26 +59,26 @@ OPENSSL_atomic_add: OPENSSL_cleanse: eor ip,ip,ip cmp r1,#7 - subhs r1,#4 + subhs r1,r1,#4 bhs .Lot cmp r1,#0 beq .Lcleanse_done .Little: strb ip,[r0],#1 - subs r1,#1 + subs r1,r1,#1 bhi .Little b .Lcleanse_done .Lot: tst r0,#3 beq .Laligned strb ip,[r0],#1 - sub r1,#1 + sub r1,r1,#1 b .Lot .Laligned: str ip,[r0],#4 - subs r1,#4 + subs r1,r1,#4 bhs .Laligned - adds r1,#4 + adds r1,r1,#4 bne .Little .Lcleanse_done: tst lr,#1 diff --git a/crypto/armv4cpuid_ios.S b/crypto/armv4cpuid_ios.S new file mode 100644 index 0000000000..cce9a7902b --- /dev/null +++ b/crypto/armv4cpuid_ios.S @@ -0,0 +1,210 @@ +#include "arm_arch.h" + +.text +.code 32 + +.align 5 +.globl _OPENSSL_atomic_add + +_OPENSSL_atomic_add: +#if __ARM_ARCH__>=6 +Ladd: ldrex r2,[r0] + add r3,r2,r1 + strex r2,r3,[r0] + cmp r2,#0 + bne Ladd + mov r0,r3 + bx lr +#else + stmdb sp!,{r4,r5,r6,lr} + ldr r2,Lspinlock + adr r3,Lspinlock + mov r4,r0 + mov r5,r1 + add r6,r3,r2 @ &spinlock + b .+8 +Lspin: bl sched_yield + mov r0,#-1 + swp r0,r0,[r6] + cmp r0,#0 + bne Lspin + + ldr r2,[r4] + add r2,r2,r5 + str r2,[r4] + str r0,[r6] @ release spinlock + ldmia sp!,{r4,r5,r6,lr} + tst lr,#1 + moveq pc,lr +.word 0xe12fff1e @ bx lr +#endif + + +.globl _OPENSSL_cleanse + +_OPENSSL_cleanse: + eor ip,ip,ip + cmp r1,#7 + subhs r1,r1,#4 + bhs Lot + cmp r1,#0 + beq Lcleanse_done +Little: + strb ip,[r0],#1 + subs r1,r1,#1 + bhi Little + b Lcleanse_done + +Lot: tst r0,#3 + beq Laligned + strb ip,[r0],#1 + sub r1,r1,#1 + b Lot +Laligned: + str ip,[r0],#4 + subs r1,r1,#4 + bhs Laligned + adds r1,r1,#4 + bne Little +Lcleanse_done: +#if __ARM_ARCH__>=5 + bx lr +#else + tst lr,#1 + moveq pc,lr +.word 0xe12fff1e @ bx lr +#endif + + + + +.align 5 +.globl __armv7_neon_probe + +__armv7_neon_probe: + vorr q0,q0,q0 + bx lr + + +.globl __armv7_tick + +__armv7_tick: +#ifdef __APPLE__ + mrrc 
p15,0,r0,r1,c14 @ CNTPCT +#else + mrrc p15,1,r0,r1,c14 @ CNTVCT +#endif + bx lr + + +.globl __armv8_aes_probe + +__armv8_aes_probe: +.byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0 + bx lr + + +.globl __armv8_sha1_probe + +__armv8_sha1_probe: +.byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0 + bx lr + + +.globl __armv8_sha256_probe + +__armv8_sha256_probe: +.byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0 + bx lr + +.globl __armv8_pmull_probe + +__armv8_pmull_probe: +.byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0 + bx lr + +.globl _OPENSSL_wipe_cpu + +_OPENSSL_wipe_cpu: + ldr r0,LOPENSSL_armcap + adr r1,LOPENSSL_armcap + ldr r0,[r1,r0] +#ifdef __APPLE__ + ldr r0,[r0] +#endif + eor r2,r2,r2 + eor r3,r3,r3 + eor ip,ip,ip + tst r0,#1 + beq Lwipe_done + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + veor q3, q3, q3 + veor q8, q8, q8 + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + veor q12, q12, q12 + veor q13, q13, q13 + veor q14, q14, q14 + veor q15, q15, q15 +Lwipe_done: + mov r0,sp +#if __ARM_ARCH__>=5 + bx lr +#else + tst lr,#1 + moveq pc,lr +.word 0xe12fff1e @ bx lr +#endif + + +.globl _OPENSSL_instrument_bus + +_OPENSSL_instrument_bus: + eor r0,r0,r0 +#if __ARM_ARCH__>=5 + bx lr +#else + tst lr,#1 + moveq pc,lr +.word 0xe12fff1e @ bx lr +#endif + + +.globl _OPENSSL_instrument_bus2 + +_OPENSSL_instrument_bus2: + eor r0,r0,r0 +#if __ARM_ARCH__>=5 + bx lr +#else + tst lr,#1 + moveq pc,lr +.word 0xe12fff1e @ bx lr +#endif + + +.align 5 +LOPENSSL_armcap: +.word OPENSSL_armcap_P-. 
+#if __ARM_ARCH__>=6 +.align 5 +#else +Lspinlock: +.word atomic_add_spinlock-Lspinlock +.align 5 + +.data +.align 2 +atomic_add_spinlock: +.word 0 @ spinlock storage, 0 = unlocked +#endif + +.comm _OPENSSL_armcap_P,4 +.non_lazy_symbol_pointer +OPENSSL_armcap_P: +.indirect_symbol _OPENSSL_armcap_P +.long 0 +.private_extern _OPENSSL_armcap_P diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index 9928dae872..737659f0db 100644 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -21,8 +21,20 @@ # runs in even less cycles, ~30, improvement is measurable only on # longer keys. One has to optimize code elsewhere to get NEON glow... -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } @@ -170,11 +182,18 @@ bn_GF2m_mul_2x2: #if __ARM_ARCH__>=7 ldr r12,.LOPENSSL_armcap .Lpic: ldr r12,[pc,r12] +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#1 beq .Lialu veor $A1,$A1 +#ifdef __APPLE__ + vmov $B1,r3,r3 @ two copies of b1 +#else vmov.32 $B1,r3,r3 @ two copies of b1 +#endif vmov.32 ${A1}[0],r1 @ a1 veor $A0,$A0 @@ -218,38 +237,38 @@ $code.=<<___; mov $b,r3 @ $b=b1 ldr r3,[sp,#32] @ load b0 mov $mask,#7<<2 - sub sp,#32 @ allocate tab[8] + sub sp,sp,#32 @ allocate tab[8] bl mul_1x1_ialu @ a1·b1 str $lo,[$ret,#8] str $hi,[$ret,#12] - eor $b,r3 @ flip b0 and b1 - eor $a,r2 @ flip a0 and a1 - eor r3,$b - eor r2,$a - eor $b,r3 - eor $a,r2 + eor $b,$b,r3 @ flip b0
and b1 + eor $a,$a,r2 @ flip a0 and a1 + eor r3,r3,$b + eor r2,r2,$a + eor $b,$b,r3 + eor $a,$a,r2 bl mul_1x1_ialu @ a0·b0 str $lo,[$ret] str $hi,[$ret,#4] - eor $a,r2 - eor $b,r3 + eor $a,$a,r2 + eor $b,$b,r3 bl mul_1x1_ialu @ (a1+a0)·(b1+b0) ___ @r=map("r$_",(6..9)); $code.=<<___; ldmia $ret,{@r[0]-@r[3]} - eor $lo,$hi - eor $hi,@r[1] - eor $lo,@r[0] - eor $hi,@r[2] - eor $lo,@r[3] - eor $hi,@r[3] + eor $lo,$lo,$hi + eor $hi,$hi,@r[1] + eor $lo,$lo,@r[0] + eor $hi,$hi,@r[2] + eor $lo,$lo,@r[3] + eor $hi,$hi,@r[3] str $hi,[$ret,#8] - eor $lo,$hi - add sp,#32 @ destroy tab[8] + eor $lo,$lo,$hi + add sp,sp,#32 @ destroy tab[8] str $lo,[$ret,#4] #if __ARM_ARCH__>=5 diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index f78a8b5f0f..aa00f38c2f 100644 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -23,8 +23,20 @@ # than 1/2KB. Windows CE port would be trivial, as it's exclusively # about decorations, ABI and instruction syntax are identical. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm new file mode 100644 index 0000000000..161547c3b0 --- /dev/null +++ b/crypto/bn/asm/bn-c64xplus.asm @@ -0,0 +1,333 @@ +;;==================================================================== +;; Written by Andy Polyakov for the OpenSSL +;; project. 
+;; +;; Rights for redistribution and usage in source and binary forms are +;; granted according to the OpenSSL license. Warranty of any kind is +;; disclaimed. +;;==================================================================== +;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n +;; being the number of 32-bit words, addition - 8*n. Corresponding 4x +;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler +;; SPLOOPs spin at ... 2*n cycles [plus epilogue]. +;;==================================================================== + .text + + .asg B3,RA + .asg A4,ARG0 + .asg B4,ARG1 + .asg A6,ARG2 + .asg B6,ARG3 + .asg A8,ARG4 + .asg B8,ARG5 + .asg A4,RET + .asg A15,FP + .asg B14,DP + .asg B15,SP + + .global _bn_mul_add_words +_bn_mul_add_words: + .asmfunc + MV ARG2,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A19 ; high part of accumulator +|| [B0] MV ARG0,A2 +|| [B0] MV ARG3,A3 + NOP 3 + + SPLOOP 2 ; 2*n+10 +;;==================================================================== + LDW *ARG1++,B7 ; ap[i] + NOP 3 + LDW *ARG0++,A7 ; rp[i] + MPY32U B7,A3,A17:A16 + NOP 3 ; [2,0] in epilogue + ADDU A16,A7,A21:A20 + ADDU A19,A21:A20,A19:A18 +|| MV.S A17,A23 + SPKERNEL 2,1 ; leave slot for "return value" +|| STW A18,*A2++ ; rp[i] +|| ADD A19,A23,A19 +;;==================================================================== + BNOP RA,4 + MV A19,RET ; return value + .endasmfunc + + .global _bn_mul_words +_bn_mul_words: + .asmfunc + MV ARG2,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A19 ; high part of accumulator + NOP 3 + + SPLOOP 2 ; 2*n+10 +;;==================================================================== + LDW *ARG1++,A7 ; ap[i] + NOP 4 + MPY32U A7,ARG3,A17:A16 + NOP 4 ; [2,0] in epilogue + ADDU A19,A16,A19:A18 +|| MV.S A17,A21 + SPKERNEL 2,1 ; leave slot for "return value" +|| STW A18,*ARG0++ ; rp[i] +|| ADD.L A19,A21,A19 +;;==================================================================== +
BNOP RA,4 + MV A19,RET ; return value + .endasmfunc + + .global _bn_sqr_words +_bn_sqr_words: + .asmfunc + MV ARG2,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] MV ARG0,B2 +|| [B0] ADD 4,ARG0,ARG0 + NOP 3 + + SPLOOP 2 ; 2*n+10 +;;==================================================================== + LDW *ARG1++,B7 ; ap[i] + NOP 4 + MPY32U B7,B7,B1:B0 + NOP 3 ; [2,0] in epilogue + STW B0,*B2++(8) ; rp[2*i] + MV B1,A1 + SPKERNEL 2,0 ; fully overlap BNOP RA,5 +|| STW A1,*ARG0++(8) ; rp[2*i+1] +;;==================================================================== + BNOP RA,5 + .endasmfunc + + .global _bn_add_words +_bn_add_words: + .asmfunc + MV ARG3,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A1 ; carry flag +|| [B0] MV ARG0,A3 + NOP 3 + + SPLOOP 2 ; 2*n+6 +;;==================================================================== + LDW *ARG2++,A7 ; bp[i] +|| LDW *ARG1++,B7 ; ap[i] + NOP 4 + ADDU A7,B7,A9:A8 + ADDU A1,A9:A8,A1:A0 + SPKERNEL 0,0 ; fully overlap BNOP RA,5 +|| STW A0,*A3++ ; write result +|| MV A1,RET ; keep carry flag in RET +;;==================================================================== + BNOP RA,5 + .endasmfunc + + .global _bn_sub_words +_bn_sub_words: + .asmfunc + MV ARG3,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A2 ; borrow flag +|| [B0] MV ARG0,A3 + NOP 3 + + SPLOOP 2 ; 2*n+6 +;;==================================================================== + LDW *ARG2++,A7 ; bp[i] +|| LDW *ARG1++,B7 ; ap[i] + NOP 4 + SUBU B7,A7,A1:A0 + [A2] SUB A1:A0,1,A1:A0 + SPKERNEL 0,1 ; leave slot for "return borrow flag" +|| STW A0,*A3++ ; write result +|| AND 1,A1,A2 ; pass on borrow flag +;;==================================================================== + BNOP RA,4 + AND 1,A1,RET ; return borrow flag + .endasmfunc + + .global _bn_div_words + .global __divull +_bn_div_words: + .asmfunc + CALLP __divull,A3 ; jump to rts64plus.lib +|| MV ARG0,A5 +|| MV ARG1,ARG0 +|| MV ARG2,ARG1 +|| ZERO B5 + 
.endasmfunc + +;;==================================================================== +;; Not really Comba algorithm, just straightforward NxM... Dedicated +;; fully unrolled real Comba implementations are asymptotically 2x +;; faster, but naturally larger undertaking. Purpose of this exercise +;; was rather to learn to master nested SPLOOPs... +;;==================================================================== + .global _bn_sqr_comba8 + .global _bn_mul_comba8 +_bn_sqr_comba8: + MV ARG1,ARG2 +_bn_mul_comba8: + .asmfunc + MVK 8,B0 ; N, RILC +|| MVK 8,A0 ; M, outer loop counter +|| MV ARG1,A5 ; copy ap +|| MV ARG0,B4 ; copy rp +|| ZERO B19 ; high part of accumulator + MVC B0,RILC +|| SUB B0,2,B1 ; N-2, initial ILC +|| SUB B0,1,B2 ; const B2=N-1 +|| LDW *A5++,B6 ; ap[0] +|| MV A0,A3 ; const A3=M +sploopNxM?: ; for best performance arrange M<=N + [A0] SPLOOPD 2 ; 2*n+10 +|| MVC B1,ILC +|| ADDAW B4,B0,B5 +|| ZERO B7 +|| LDW *A5++,A9 ; pre-fetch ap[1] +|| ZERO A1 +|| SUB A0,1,A0 +;;==================================================================== +;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files. +;; This is because of Advisory 15 from TI publication SPRZ247I. + LDW *ARG2++,A7 ; bp[i] + NOP 3 + [A1] LDW *B5++,B7 ; rp[i] + MPY32U A7,B6,B17:B16 + NOP 3 + ADDU B16,B7,B21:B20 + ADDU B19,B21:B20,B19:B18 +|| MV.S B17,B23 + SPKERNEL +|| STW B18,*B4++ ; rp[i] +|| ADD.S B19,B23,B19 +;;==================================================================== +outer?: ; m*2*(n+1)+10 + SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0] + SPMASKR +|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]? + MVD A9,B6 ; move through .M unit(*) + [A2] LDW *A5++,A9 ; pre-fetch ap[i+1] + SUBAW B5,B2,B5 ; rewind rp to rp[1] + MVK 1,A1 + [A0] BNOP.S1 outer?,4 +|| [A0] SUB.L A0,1,A0 + STW B19,*B4--[B2] ; rewind rp to rp[1] +|| ZERO.S B19 ; high part of accumulator +;; end of outer?
+ BNOP RA,5 ; return + .endasmfunc +;; (*) It should be noted that B6 is used as input to MPY32U in +;; chronologically next cycle in *preceding* SPLOOP iteration. +;; Normally such arrangement would require DINT, but at this +;; point SPLOOP is draining and interrupts are disabled +;; implicitly. + + .global _bn_sqr_comba4 + .global _bn_mul_comba4 +_bn_sqr_comba4: + MV ARG1,ARG2 +_bn_mul_comba4: + .asmfunc + .if 0 + BNOP sploopNxM?,3 + ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case, + ;; because of read-after-write penalties, it's rather + ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]... + MVK 4,B0 ; N, RILC +|| MVK 4,A0 ; M, outer loop counter +|| MV ARG1,A5 ; copy ap +|| MV ARG0,B4 ; copy rp +|| ZERO B19 ; high part of accumulator + MVC B0,RILC +|| SUB B0,2,B1 ; first ILC +|| SUB B0,1,B2 ; const B2=N-1 +|| LDW *A5++,B6 ; ap[0] +|| MV A0,A3 ; const A3=M + .else + ;; This alternative is exercise in fully unrolled Comba + ;; algorithm implementation that operates at n*(n+1)+12, or + ;; as little as 32 cycles... 
+ LDW *ARG1[0],B16 ; a[0] +|| LDW *ARG2[0],A16 ; b[0] + LDW *ARG1[1],B17 ; a[1] +|| LDW *ARG2[1],A17 ; b[1] + LDW *ARG1[2],B18 ; a[2] +|| LDW *ARG2[2],A18 ; b[2] + LDW *ARG1[3],B19 ; a[3] +|| LDW *ARG2[3],A19 ; b[3] + NOP + MPY32U A16,B16,A1:A0 ; a[0]*b[0] + MPY32U A17,B16,A23:A22 ; a[0]*b[1] + MPY32U A16,B17,A25:A24 ; a[1]*b[0] + MPY32U A16,B18,A27:A26 ; a[2]*b[0] + STW A0,*ARG0[0] +|| MPY32U A17,B17,A29:A28 ; a[1]*b[1] + MPY32U A18,B16,A31:A30 ; a[0]*b[2] +|| ADDU A22,A1,A1:A0 + MV A23,B0 +|| MPY32U A19,B16,A21:A20 ; a[3]*b[0] +|| ADDU A24,A1:A0,A1:A0 + ADDU A25,B0,B1:B0 +|| STW A0,*ARG0[1] +|| MPY32U A18,B17,A23:A22 ; a[2]*b[1] +|| ADDU A26,A1,A9:A8 + ADDU A27,B1,B9:B8 +|| MPY32U A17,B18,A25:A24 ; a[1]*b[2] +|| ADDU A28,A9:A8,A9:A8 + ADDU A29,B9:B8,B9:B8 +|| MPY32U A16,B19,A27:A26 ; a[0]*b[3] +|| ADDU A30,A9:A8,A9:A8 + ADDU A31,B9:B8,B9:B8 +|| ADDU B0,A9:A8,A9:A8 + STW A8,*ARG0[2] +|| ADDU A20,A9,A1:A0 + ADDU A21,B9,B1:B0 +|| MPY32U A19,B17,A21:A20 ; a[3]*b[1] +|| ADDU A22,A1:A0,A1:A0 + ADDU A23,B1:B0,B1:B0 +|| MPY32U A18,B18,A23:A22 ; a[2]*b[2] +|| ADDU A24,A1:A0,A1:A0 + ADDU A25,B1:B0,B1:B0 +|| MPY32U A17,B19,A25:A24 ; a[1]*b[3] +|| ADDU A26,A1:A0,A1:A0 + ADDU A27,B1:B0,B1:B0 +|| ADDU B8,A1:A0,A1:A0 + STW A0,*ARG0[3] +|| MPY32U A19,B18,A27:A26 ; a[3]*b[2] +|| ADDU A20,A1,A9:A8 + ADDU A21,B1,B9:B8 +|| MPY32U A18,B19,A29:A28 ; a[2]*b[3] +|| ADDU A22,A9:A8,A9:A8 + ADDU A23,B9:B8,B9:B8 +|| MPY32U A19,B19,A31:A30 ; a[3]*b[3] +|| ADDU A24,A9:A8,A9:A8 + ADDU A25,B9:B8,B9:B8 +|| ADDU B0,A9:A8,A9:A8 + STW A8,*ARG0[4] +|| ADDU A26,A9,A1:A0 + ADDU A27,B9,B1:B0 +|| ADDU A28,A1:A0,A1:A0 + ADDU A29,B1:B0,B1:B0 +|| BNOP RA +|| ADDU B8,A1:A0,A1:A0 + STW A0,*ARG0[5] +|| ADDU A30,A1,A9:A8 + ADD A31,B1,B8 + ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below + ADD B8,A9,A9 +|| STW A8,*ARG0[6] + STW A9,*ARG0[7] + .endif + .endasmfunc diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl new file mode 100644 index 0000000000..cef83942c9 --- 
/dev/null +++ b/crypto/bn/asm/c64xplus-gf2m.pl @@ -0,0 +1,146 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# February 2012 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication +# used in bn_gf2m.c. It's kind of low-hanging mechanical port from +# C for the time being... The subroutine runs in 37 cycles, which is +# 4.5x faster than compiler-generated code. Though comparison is +# totally unfair, because this module utilizes Galois Field Multiply +# instruction. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector + +($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20)); +($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20)); +($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7"); +($A,$B)=($Alo,$B_1); +$xFF="B1"; + +sub mul_1x1_upper { +my ($A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication +|| XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 + XORMPY $Alo,$B_0,$Alox0 +|| XORMPY $Ahi,$B_0,$Ahix0 + XORMPY $Alo,$B_3,$Alox3 +|| XORMPY $Ahi,$B_3,$Ahix3 + XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +___ +} +sub mul_1x1_merged { +my ($OUTlo,$OUThi,$A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi +|| XORMPY $Alo,$B_2,$Alox2 + XORMPY
$Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 +|| XORMPY $Alo,$B_0,A1 ; $Alox0 + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| XORMPY $Ahi,$B_0,$Ahix0 +|| XORMPY $Alo,$B_3,$Alox3 +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| XORMPY $Ahi,$B_3,$Ahix3 +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +|| XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +|| MV A1,$Alox0 +___ +} +sub mul_1x1_lower { +my ($OUTlo,$OUThi)=@_; +$code.=<<___; + ;NOP + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi + NOP + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +___ +} +$code.=<<___; + .text + + .global _bn_GF2m_mul_2x2 +_bn_GF2m_mul_2x2: + .asmfunc + MVK 0xFF,$xFF +___ + &mul_1x1_upper($a0,$b0); # a0·b0 +$code.=<<___; +|| MV $b1,$B + MV $a1,$A +___ + &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 +$code.=<<___; +|| XOR $b0,$b1,$B + XOR $a0,$a1,$A +___ + &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) +$code.=<<___; + XOR A28,A31,A29 +|| XOR B28,B31,B29 ; a0·b0+a1·b1 +___ + &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) +$code.=<<___; +|| BNOP B3 + XOR A29,A30,A30 +|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 + XOR B28,A30,A30 +|| STW A28,*${rp}[0] + XOR B30,A31,A31 +|| STW A30,*${rp}[1] + STW A31,*${rp}[2] + STW B31,*${rp}[3] + .endasmfunc +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl index b944a12b8e..a33cdf4111 100644 --- a/crypto/bn/asm/mips-mont.pl +++ b/crypto/bn/asm/mips-mont.pl @@ -46,7 +46,7 @@ # 
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 @@ -133,7 +133,7 @@ $code.=<<___; bnez $at,1f li $t0,0 slt $at,$num,17 # on in-order CPU - bnezl $at,bn_mul_mont_internal + bnez $at,bn_mul_mont_internal nop 1: jr $ra li $a0,0 diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl index acfd35968e..acafde5e56 100644 --- a/crypto/bn/asm/mips.pl +++ b/crypto/bn/asm/mips.pl @@ -48,7 +48,7 @@ # has to content with 40-85% improvement depending on benchmark and # key length, more for longer keys. -$flavour = shift; +$flavour = shift || "o32"; while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -140,10 +140,10 @@ $code.=<<___; .set reorder li $minus4,-4 and $ta0,$a2,$minus4 - $LD $t0,0($a1) beqz $ta0,.L_bn_mul_add_words_tail .L_bn_mul_add_words_loop: + $LD $t0,0($a1) $MULTU $t0,$a3 $LD $t1,0($a0) $LD $t2,$BNSZ($a1) @@ -200,10 +200,9 @@ $code.=<<___; $ADDU $v0,$ta2 sltu $at,$ta3,$at $ST $ta3,-$BNSZ($a0) - $ADDU $v0,$at .set noreorder - bgtzl $ta0,.L_bn_mul_add_words_loop - $LD $t0,0($a1) + bgtz $ta0,.L_bn_mul_add_words_loop + $ADDU $v0,$at beqz $a2,.L_bn_mul_add_words_return nop @@ -267,7 +266,7 @@ ___ $code.=<<___; jr $ra move $a0,$v0 -.end bn_mul_add_words +.end bn_mul_add_words_internal .align 5 .globl bn_mul_words @@ -300,10 +299,10 @@ $code.=<<___; .set reorder li $minus4,-4 and $ta0,$a2,$minus4 - $LD $t0,0($a1) beqz $ta0,.L_bn_mul_words_tail .L_bn_mul_words_loop: + $LD $t0,0($a1) $MULTU $t0,$a3 $LD $t2,$BNSZ($a1) $LD $ta0,2*$BNSZ($a1) @@ -341,10 +340,9 @@ $code.=<<___; $ADDU $v0,$at sltu $ta3,$v0,$at $ST $v0,-$BNSZ($a0) - $ADDU $v0,$ta3,$ta2 .set noreorder - bgtzl $ta0,.L_bn_mul_words_loop - $LD $t0,0($a1) + bgtz $ta0,.L_bn_mul_words_loop + $ADDU 
$v0,$ta3,$ta2 beqz $a2,.L_bn_mul_words_return nop @@ -429,10 +427,10 @@ $code.=<<___; .set reorder li $minus4,-4 and $ta0,$a2,$minus4 - $LD $t0,0($a1) beqz $ta0,.L_bn_sqr_words_tail .L_bn_sqr_words_loop: + $LD $t0,0($a1) $MULTU $t0,$t0 $LD $t2,$BNSZ($a1) $LD $ta0,2*$BNSZ($a1) @@ -463,11 +461,10 @@ $code.=<<___; mflo $ta3 mfhi $ta2 $ST $ta3,-2*$BNSZ($a0) - $ST $ta2,-$BNSZ($a0) .set noreorder - bgtzl $ta0,.L_bn_sqr_words_loop - $LD $t0,0($a1) + bgtz $ta0,.L_bn_sqr_words_loop + $ST $ta2,-$BNSZ($a0) beqz $a2,.L_bn_sqr_words_return nop @@ -547,10 +544,10 @@ $code.=<<___; .set reorder li $minus4,-4 and $at,$a3,$minus4 - $LD $t0,0($a1) beqz $at,.L_bn_add_words_tail .L_bn_add_words_loop: + $LD $t0,0($a1) $LD $ta0,0($a2) subu $a3,4 $LD $t1,$BNSZ($a1) @@ -589,11 +586,10 @@ $code.=<<___; $ADDU $t3,$ta3,$v0 sltu $v0,$t3,$ta3 $ST $t3,-$BNSZ($a0) - $ADDU $v0,$t9 .set noreorder - bgtzl $at,.L_bn_add_words_loop - $LD $t0,0($a1) + bgtz $at,.L_bn_add_words_loop + $ADDU $v0,$t9 beqz $a3,.L_bn_add_words_return nop @@ -679,10 +675,10 @@ $code.=<<___; .set reorder li $minus4,-4 and $at,$a3,$minus4 - $LD $t0,0($a1) beqz $at,.L_bn_sub_words_tail .L_bn_sub_words_loop: + $LD $t0,0($a1) $LD $ta0,0($a2) subu $a3,4 $LD $t1,$BNSZ($a1) @@ -722,11 +718,10 @@ $code.=<<___; $SUBU $t3,$ta3,$v0 sgtu $v0,$t3,$ta3 $ST $t3,-$BNSZ($a0) - $ADDU $v0,$t9 .set noreorder - bgtzl $at,.L_bn_sub_words_loop - $LD $t0,0($a1) + bgtz $at,.L_bn_sub_words_loop + $ADDU $v0,$t9 beqz $a3,.L_bn_sub_words_return nop @@ -778,7 +773,7 @@ ___ $code.=<<___; jr $ra move $a0,$v0 -.end bn_sub_words +.end bn_sub_words_internal .align 5 .globl bn_div_3_words @@ -819,7 +814,7 @@ ___ $code.=<<___; .set reorder move $ta3,$ra - bal bn_div_words + bal bn_div_words_internal move $ra,$ta3 $MULTU $ta2,$v0 $LD $t2,-2*$BNSZ($a3) @@ -840,8 +835,9 @@ $code.=<<___; sltu $ta0,$a1,$a2 or $t8,$ta0 .set noreorder - beqzl $at,.L_bn_div_3_words_inner_loop + beqz $at,.L_bn_div_3_words_inner_loop $SUBU $v0,1 + $ADDU $v0,1 .set reorder 
.L_bn_div_3_words_inner_loop_done: .set noreorder @@ -902,7 +898,8 @@ $code.=<<___; and $t2,$a0 $SRL $at,$a1,$t1 .set noreorder - bnezl $t2,.+8 + beqz $t2,.+12 + nop break 6 # signal overflow .set reorder $SLL $a0,$t9 @@ -917,7 +914,8 @@ $code.=<<___; $SRL $DH,$a2,4*$BNSZ # bits sgeu $at,$a0,$a2 .set noreorder - bnezl $at,.+8 + beqz $at,.+12 + nop $SUBU $a0,$a2 .set reorder @@ -1874,6 +1872,41 @@ ___ ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); +sub add_c2 () { +my ($hi,$lo,$c0,$c1,$c2, + $warm, # !$warm denotes first call with specific sequence of + # $c_[XYZ] when there is no Z-carry to accumulate yet; + $an,$bn # these two are arguments for multiplication which + # result is used in *next* step [which is why it's + # commented as "forward multiplication" below]; + )=@_; +$code.=<<___; + mflo $lo + mfhi $hi + $ADDU $c0,$lo + sltu $at,$c0,$lo + $MULTU $an,$bn # forward multiplication + $ADDU $c0,$lo + $ADDU $at,$hi + sltu $lo,$c0,$lo + $ADDU $c1,$at + $ADDU $hi,$lo +___ +$code.=<<___ if (!$warm); + sltu $c2,$c1,$at + $ADDU $c1,$hi + sltu $hi,$c1,$hi + $ADDU $c2,$hi +___ +$code.=<<___ if ($warm); + sltu $at,$c1,$at + $ADDU $c1,$hi + $ADDU $c2,$at + sltu $hi,$c1,$hi + $ADDU $c2,$hi +___ +} + $code.=<<___; .align 5 @@ -1922,21 +1955,10 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_2,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at +___ + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, + $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 @@ -1947,67 +1969,19 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_3,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - 
$SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_3,$at - $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at +___ + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, + $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, + $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1); +$code.=<<___; $ST $c_1,3*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_1,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_1,$at - $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at +___ + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, + $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, + $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 @@ -2018,97 +1992,23 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_2,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_2,$at - $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU 
$c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); - $ADDU $c_2,$at - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at +___ + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, + $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, + $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2); + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, + $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3); +$code.=<<___; $ST $c_3,5*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_3,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_3,$at - $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_3,$at - $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at +___ + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, + $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3); + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, + $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3); + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, + $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_1,$t_1 @@ -2119,112 +2019,25 @@ $code.=<<___; sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - 
slt $c_1,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_1,$at - $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_1,$at - $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_1,$at - $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at +___ + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, + $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, + $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1); + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, + $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1); + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, + $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2); +$code.=<<___; $ST $c_2,7*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_2,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_2,$at - $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - 
sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_2,$at - $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at +___ + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, + $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2); + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, + $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2); + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, + $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 @@ -2235,82 +2048,21 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_3,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_3,$at - $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_3,$at - $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at +___ + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, + $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, + $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3); + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, + $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1); +$code.=<<___; $ST $c_1,9*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt 
$c_1,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_1,$at - $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at +___ + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, + $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1); + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, + $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 @@ -2321,52 +2073,17 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_2,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_2,$at - $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at +___ + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, + $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, + $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3); +$code.=<<___; $ST $c_3,11*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_3,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at +___ + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, + $a_6,$a_6); # 
mul_add_c(a[6],b[6],c1,c2,c3); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_1,$t_1 @@ -2377,21 +2094,10 @@ $code.=<<___; sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_1,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at +___ + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, + $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); +$code.=<<___; $ST $c_2,13*$BNSZ($a0) mflo $t_1 @@ -2459,21 +2165,10 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_2,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at +___ + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, + $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_3,$t_1 @@ -2484,52 +2179,17 @@ $code.=<<___; sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_3,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 - slt $at,$t_2,$zero - $ADDU $c_3,$at - $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); - $SLL $t_2,1 - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_1,$t_1 - sltu $at,$c_1,$t_1 - $ADDU $t_2,$at - $ADDU $c_2,$t_2 - sltu $at,$c_2,$t_2 - $ADDU $c_3,$at +___ + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, + $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3); + &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, + $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); +$code.=<<___; $ST $c_1,3*$BNSZ($a0) - - mflo 
$t_1 - mfhi $t_2 - slt $c_1,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_2,$t_1 - sltu $at,$c_2,$t_1 - $ADDU $t_2,$at - $ADDU $c_3,$t_2 - sltu $at,$c_3,$t_2 - $ADDU $c_1,$at +___ + &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, + $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); +$code.=<<___; mflo $t_1 mfhi $t_2 $ADDU $c_2,$t_1 @@ -2540,21 +2200,10 @@ $code.=<<___; sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) - - mflo $t_1 - mfhi $t_2 - slt $c_2,$t_2,$zero - $SLL $t_2,1 - $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); - slt $a2,$t_1,$zero - $ADDU $t_2,$a2 - $SLL $t_1,1 - $ADDU $c_3,$t_1 - sltu $at,$c_3,$t_1 - $ADDU $t_2,$at - $ADDU $c_1,$t_2 - sltu $at,$c_1,$t_2 - $ADDU $c_2,$at +___ + &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, + $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); +$code.=<<___; $ST $c_3,5*$BNSZ($a0) mflo $t_1 diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl index f9b6992ccc..420f4d5807 100644 --- a/crypto/bn/asm/ppc-mont.pl +++ b/crypto/bn/asm/ppc-mont.pl @@ -191,7 +191,7 @@ L1st: addi $j,$j,$BNSZ ; j++ addi $tp,$tp,$BNSZ ; tp++ - bdnz- L1st + bdnz L1st ;L1st addc $lo0,$alo,$hi0 addze $hi0,$ahi @@ -253,7 +253,7 @@ Linner: addze $hi1,$hi1 $ST $lo1,0($tp) ; tp[j-1] addi $tp,$tp,$BNSZ ; tp++ - bdnz- Linner + bdnz Linner ;Linner $LD $tj,$BNSZ($tp) ; tp[j] addc $lo0,$alo,$hi0 @@ -276,7 +276,7 @@ Linner: slwi $tj,$num,`log($BNSZ)/log(2)` $UCMP $i,$tj addi $i,$i,$BNSZ - ble- Louter + ble Louter addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] @@ -289,7 +289,7 @@ Lsub: $LDX $tj,$tp,$j subfe $aj,$nj,$tj ; tp[j]-np[j] $STX $aj,$rp,$j addi $j,$j,$BNSZ - bdnz- Lsub + bdnz Lsub li $j,0 mtctr $num @@ -304,7 +304,7 @@ Lcopy: ; copy or in-place refresh $STX $tj,$rp,$j $STX $j,$tp,$j ; zap at once addi $j,$j,$BNSZ - bdnz- Lcopy + bdnz Lcopy $POP $tj,0($sp) li r3,1 diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl index 
aaf669a5b3..5e22cd8fc6 100644 --- a/crypto/bn/asm/ppc.pl +++ b/crypto/bn/asm/ppc.pl @@ -952,7 +952,7 @@ $data=<>BN_NIST_521_LSHIFT) @@ -1113,6 +1122,10 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, return 1; } +#ifdef _WIN32_WCE +#pragma optimize( "", on ) +#endif + int (*BN_nist_mod_func(const BIGNUM *p))(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx) { if (BN_ucmp(&_bignum_nist_p_192, p) == 0) diff --git a/crypto/c64xcpuid.pl b/crypto/c64xcpuid.pl new file mode 100644 index 0000000000..88fd153b98 --- /dev/null +++ b/crypto/c64xcpuid.pl @@ -0,0 +1,326 @@ +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg OPENSSL_rdtsc,_OPENSSL_rdtsc + .asg OPENSSL_cleanse,_OPENSSL_cleanse + .asg CRYPTO_memcmp,_CRYPTO_memcmp + .asg OPENSSL_atomic_add,_OPENSSL_atomic_add + .asg OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu + .asg OPENSSL_instrument_bus,_OPENSSL_instrument_bus + .asg OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2 + .endif + + .asg B3,RA + .asg 0x01AC0000,TIMER_BASE ; Timer 2 + + .global _OPENSSL_rdtsc +_OPENSSL_rdtsc: + .asmfunc + MVKL TIMER_BASE,A5 + MVKH TIMER_BASE,A5 + LDW *A5[0],A2 ; load CTL + LDW *A5[2],A4 ; load CTN + NOP 2 + .if .BIG_ENDIAN + MVK 0x2c0,A7 ; internal clock source, don't hold, go +|| MVK -1,A6 ; maximum period + .else + MVK 0x2c0,A6 ; internal clock source, don't hold, go +|| MVK -1,A7 ; maximum period + .endif + [!A2] STDW A7:A6,*A5[0] ; fire it up +|| BNOP RA,5 + .endasmfunc + + .global _OPENSSL_cleanse +_OPENSSL_cleanse: + 
.asmfunc + ZERO A3:A2 +|| ZERO B2 +|| SHRU B4,3,B0 ; is length >= 8 +|| ADD 1,A4,B6 + [!B0] BNOP RA +|| [B0] SUB B0,1,B2 +|| ZERO A1 +|| ZERO B1 + [B2] BDEC cleanse_loop?,B2 +||[!B0] CMPLT 0,B4,A1 +||[!B0] CMPLT 1,B4,B1 +|| ZERO B5 + [A1] STB A2,*A4++[2] +|| [B1] STB B5,*B6++[2] +|| [B2] BDEC cleanse_loop?,B2 +||[!B0] CMPLT 2,B4,A1 +||[!B0] CMPLT 3,B4,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B5,*B6++[2] +|| [B2] BDEC cleanse_loop?,B2 +||[!B0] CMPLT 4,B4,A1 +||[!B0] CMPLT 5,B4,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B5,*B6++[2] +|| [B2] BDEC cleanse_loop?,B2 +||[!B0] CMPLT 6,B4,A1 + [A1] STB A2,*A4++[2] +|| [B2] BDEC cleanse_loop?,B2 + +cleanse_loop?: + STNDW A3:A2,*A4++ +|| SUB B4,8,B4 +|| [B2] BDEC cleanse_loop?,B2 + + MV B4,B0 ; remaining bytes +|| ADD 1,A4,B6 +|| BNOP RA + [B0] CMPLT 0,B0,A1 +|| [B0] CMPLT 1,B0,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B5,*B6++[2] +|| [B0] CMPLT 2,B0,A1 +|| [B0] CMPLT 3,B0,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B5,*B6++[2] +|| [B0] CMPLT 4,B0,A1 +|| [B0] CMPLT 5,B0,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B5,*B6++[2] +|| [B0] CMPLT 6,B0,A1 + [A1] STB A2,*A4++[2] + .endasmfunc + + .if 0 + .global _CRYPTO_memcmp +_CRYPTO_memcmp: + .asmfunc + MV A6,B0 + [!B0] BNOP RA +||[!B0] ZERO A4 +|| [B0] ZERO A1:A0 + [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + XOR A5,B5,A1 +|| [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + +memcmp_loop?: + OR A1,A0,A0 +|| XOR A5,B5,A1 +|| [B0] LDBU *A4++,A5 +|| [B0] LDBU *B4++,B5 +|| [B0] BDEC memcmp_loop?,B0 + + BNOP RA,3 + ZERO A4 + [A0] MVK 1,A4 + .endasmfunc + .endif + + .global _OPENSSL_atomic_add +_OPENSSL_atomic_add: + .asmfunc + BNOP atomic_store? 
; pre-C64x+ systems are uni-processor, it's +|| LDW *A4,B5 ; enough to hold interrupts off through + ; the load-update-store cycle to achieve + ; atomicity + NOP + BNOP RA,3 ; and this branch stretches even over store + ADD B4,B5,B5 +atomic_store?: + STW B5,*A4 +|| MV B5,A4 + .endasmfunc + + .global _OPENSSL_wipe_cpu +_OPENSSL_wipe_cpu: + .asmfunc + ZERO A0 +|| ZERO B0 +|| ZERO A1 +|| ZERO B1 + ZERO A3:A2 +|| MVD B0,B2 +|| ZERO A4 +|| ZERO B4 +|| ZERO A5 +|| ZERO B5 +|| BNOP RA + ZERO A7:A6 +|| ZERO B7:B6 +|| ZERO A8 +|| ZERO B8 +|| ZERO A9 +|| ZERO B9 + ZERO A17:A16 +|| ZERO B17:B16 +|| ZERO A18 +|| ZERO B18 +|| ZERO A19 +|| ZERO B19 + ZERO A21:A20 +|| ZERO B21:B20 +|| ZERO A22 +|| ZERO B22 +|| ZERO A23 +|| ZERO B23 + ZERO A25:A24 +|| ZERO B25:B24 +|| ZERO A26 +|| ZERO B26 +|| ZERO A27 +|| ZERO B27 + ZERO A29:A28 +|| ZERO B29:B28 +|| ZERO A30 +|| ZERO B30 +|| ZERO A31 +|| ZERO B31 + .endasmfunc + +CLFLUSH .macro CONTROL,ADDR,LEN + B passthrough? +|| STW ADDR,*CONTROL[0] + STW LEN,*CONTROL[1] +spinlock?: + LDW *CONTROL[1],A0 + NOP 3 +passthrough?: + NOP + [A0] BNOP spinlock?,5 + .endm + + .global _OPENSSL_instrument_bus +_OPENSSL_instrument_bus: + .asmfunc + MV B4,B0 ; reassign sizeof(output) +|| MV A4,B4 ; reassign output +|| MVK 0x00004030,A3 +|| MVKL TIMER_BASE,B16 + MV B0,A4 ; return value +|| MVK 1,A1 +|| MVKH 0x01840000,A3 ; L1DWIBAR +|| MVKH TIMER_BASE,B16 + LDW *B16[2],B8 ; collect 1st tick +|| MVK 0x00004010,A5 + NOP 4 + MV B8,B9 ; lasttick = tick +|| MVK 0,B7 ; lastdiff = 0 +|| MVKH 0x01840000,A5 ; L2WIBAR + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LDW *B4,B5 + NOP 4 + ADD B7,B5,B5 + STW B5,*B4 +bus_loop1?: + LDW *B16[2],B8 +|| [B0] SUB B0,1,B0 + NOP 4 + SUB B8,B9,B7 ; lastdiff = tick - lasttick +|| MV B8,B9 ; lasttick = tick + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LDW *B4,B5 + NOP 4 + ADD B7,B5,B5 + STW B5,*B4 ; 
[!B1] is removed to flatten samples +|| ADDK 4,B4 +|| [B0] BNOP bus_loop1?,5 + + BNOP RA,5 + .endasmfunc + + .global _OPENSSL_instrument_bus2 +_OPENSSL_instrument_bus2: + .asmfunc + MV A6,B0 ; reassign max +|| MV B4,A6 ; reassing sizeof(output) +|| MVK 0x00004030,A3 +|| MVKL TIMER_BASE,B16 + MV A4,B4 ; reassign output +|| MVK 0,A4 ; return value +|| MVK 1,A1 +|| MVKH 0x01840000,A3 ; L1DWIBAR +|| MVKH TIMER_BASE,B16 + + LDW *B16[2],B8 ; collect 1st tick +|| MVK 0x00004010,A5 + NOP 4 + MV B8,B9 ; lasttick = tick +|| MVK 0,B7 ; lastdiff = 0 +|| MVKH 0x01840000,A5 ; L2WIBAR + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LDW *B4,B5 + NOP 4 + ADD B7,B5,B5 + STW B5,*B4 + + LDW *B16[2],B8 ; collect 1st diff + NOP 4 + SUB B8,B9,B7 ; lastdiff = tick - lasttick +|| MV B8,B9 ; lasttick = tick +|| SUB B0,1,B0 +bus_loop2?: + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LDW *B4,B5 + NOP 4 + ADD B7,B5,B5 + STW B5,*B4 ; [!B1] is removed to flatten samples +||[!B0] BNOP bus_loop2_done?,2 +|| SUB B0,1,B0 + LDW *B16[2],B8 + NOP 4 + SUB B8,B9,B8 +|| MV B8,B9 + CMPEQ B8,B7,B2 +|| MV B8,B7 + [!B2] ADDAW B4,1,B4 +||[!B2] ADDK 1,A4 + CMPEQ A4,A6,A2 + [!A2] BNOP bus_loop2?,5 + +bus_loop2_done?: + BNOP RA,5 + .endasmfunc + + .if __TI_EABI__ + .sect ".init_array" + .else + .sect ".pinit" + .endif + .align 4 + .long _OPENSSL_rdtsc ; auto-start timer +___ + +print $code; +close STDOUT; diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xpluscpuid.pl new file mode 100644 index 0000000000..067b693d5c --- /dev/null +++ b/crypto/c64xpluscpuid.pl @@ -0,0 +1,246 @@ +#!/usr/bin/env perl +# + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$code.=<<___; + .text + + .asg B3,RA + + .global _OPENSSL_rdtsc +_OPENSSL_rdtsc: + .asmfunc + B RA + MVC TSCL,B0 + MVC TSCH,B1 + [!B0] MVC B0,TSCL ; start TSC + MV B0,A4 + MV B1,A5 + .endasmfunc + + 
.global _OPENSSL_cleanse +_OPENSSL_cleanse: + .asmfunc + ZERO A3:A2 +|| ZERO B2 +|| SHRU B4,3,B0 ; is length >= 8 +|| ADD 1,A4,B6 + [!B0] BNOP RA +|| ZERO A1 +|| ZERO B1 + [B0] MVC B0,ILC +||[!B0] CMPLT 0,B4,A1 +||[!B0] CMPLT 1,B4,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B2,*B6++[2] +||[!B0] CMPLT 2,B4,A1 +||[!B0] CMPLT 3,B4,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B2,*B6++[2] +||[!B0] CMPLT 4,B4,A1 +||[!B0] CMPLT 5,B4,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B2,*B6++[2] +||[!B0] CMPLT 6,B4,A1 + [A1] STB A2,*A4++[2] + + SPLOOP 1 + STNDW A3:A2,*A4++ +|| SUB B4,8,B4 + SPKERNEL + + MV B4,B0 ; remaining bytes +|| ADD 1,A4,B6 +|| BNOP RA + [B0] CMPLT 0,B0,A1 +|| [B0] CMPLT 1,B0,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B2,*B6++[2] +|| [B0] CMPLT 2,B0,A1 +|| [B0] CMPLT 3,B0,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B2,*B6++[2] +|| [B0] CMPLT 4,B0,A1 +|| [B0] CMPLT 5,B0,B1 + [A1] STB A2,*A4++[2] +|| [B1] STB B2,*B6++[2] +|| [B0] CMPLT 6,B0,A1 + [A1] STB A2,*A4++[2] + .endasmfunc + + .global _OPENSSL_atomic_add +_OPENSSL_atomic_add: + .asmfunc + MV A4,B0 +atomic_add?: + LL *B0,B5 + NOP 4 + ADD B4,B5,B5 + SL B5,*B0 + CMTL *B0,B1 + NOP 4 + [!B1] B atomic_add? + [B1] BNOP RA,4 + MV B5,A4 + .endasmfunc + + .global _OPENSSL_wipe_cpu +_OPENSSL_wipe_cpu: + .asmfunc + ZERO A0 +|| ZERO B0 +|| ZERO A1 +|| ZERO B1 + ZERO A3:A2 +|| MVD B0,B2 +|| ZERO A4 +|| ZERO B4 +|| ZERO A5 +|| ZERO B5 +|| BNOP RA + ZERO A7:A6 +|| ZERO B7:B6 +|| ZERO A8 +|| ZERO B8 +|| ZERO A9 +|| ZERO B9 + ZERO A17:A16 +|| ZERO B17:B16 +|| ZERO A18 +|| ZERO B18 +|| ZERO A19 +|| ZERO B19 + ZERO A21:A20 +|| ZERO B21:B20 +|| ZERO A22 +|| ZERO B22 +|| ZERO A23 +|| ZERO B23 + ZERO A25:A24 +|| ZERO B25:B24 +|| ZERO A26 +|| ZERO B26 +|| ZERO A27 +|| ZERO B27 + ZERO A29:A28 +|| ZERO B29:B28 +|| ZERO A30 +|| ZERO B30 +|| ZERO A31 +|| ZERO B31 + .endasmfunc + +CLFLUSH .macro CONTROL,ADDR,LEN + B passthrough? 
+|| STW ADDR,*CONTROL[0] + STW LEN,*CONTROL[1] +spinlock?: + LDW *CONTROL[1],A0 + NOP 3 +passthrough?: + NOP + [A0] BNOP spinlock?,5 + .endm + + .global _OPENSSL_instrument_bus +_OPENSSL_instrument_bus: + .asmfunc + MV B4,B0 ; reassign sizeof(output) +|| MV A4,B4 ; reassign output +|| MVK 0x00004030,A3 + MV B0,A4 ; return value +|| MVK 1,A1 +|| MVKH 0x01840000,A3 ; L1DWIBAR + MVC TSCL,B8 ; collect 1st tick +|| MVK 0x00004010,A5 + MV B8,B9 ; lasttick = tick +|| MVK 0,B7 ; lastdiff = 0 +|| MVKH 0x01840000,A5 ; L2WIBAR + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LL *B4,B5 + NOP 4 + ADD B7,B5,B5 + SL B5,*B4 + CMTL *B4,B1 + NOP 4 + STW B5,*B4 +bus_loop1?: + MVC TSCL,B8 +|| [B0] SUB B0,1,B0 + SUB B8,B9,B7 ; lastdiff = tick - lasttick +|| MV B8,B9 ; lasttick = tick + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LL *B4,B5 + NOP 4 + ADD B7,B5,B5 + SL B5,*B4 + CMTL *B4,B1 + STW B5,*B4 ; [!B1] is removed to flatten samples +|| ADDK 4,B4 +|| [B0] BNOP bus_loop1?,5 + + BNOP RA,5 + .endasmfunc + + .global _OPENSSL_instrument_bus2 +_OPENSSL_instrument_bus2: + .asmfunc + MV A6,B0 ; reassign max +|| MV B4,A6 ; reassing sizeof(output) +|| MVK 0x00004030,A3 + MV A4,B4 ; reassign output +|| MVK 0,A4 ; return value +|| MVK 1,A1 +|| MVKH 0x01840000,A3 ; L1DWIBAR + + MVC TSCL,B8 ; collect 1st tick +|| MVK 0x00004010,A5 + MV B8,B9 ; lasttick = tick +|| MVK 0,B7 ; lastdiff = 0 +|| MVKH 0x01840000,A5 ; L2WIBAR + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LL *B4,B5 + NOP 4 + ADD B7,B5,B5 + SL B5,*B4 + CMTL *B4,B1 + NOP 4 + STW B5,*B4 + + MVC TSCL,B8 ; collect 1st diff + SUB B8,B9,B7 ; lastdiff = tick - lasttick +|| MV B8,B9 ; lasttick = tick +|| SUB B0,1,B0 +bus_loop2?: + CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line + CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line + LL *B4,B5 + 
NOP 4 + ADD B7,B5,B5 + SL B5,*B4 + CMTL *B4,B1 + STW B5,*B4 ; [!B1] is removed to flatten samples +||[!B0] BNOP bus_loop2_done?,2 +|| SUB B0,1,B0 + MVC TSCL,B8 + SUB B8,B9,B8 +|| MV B8,B9 + CMPEQ B8,B7,B2 +|| MV B8,B7 + [!B2] ADDAW B4,1,B4 +||[!B2] ADDK 1,A4 + CMPEQ A4,A6,A2 + [!A2] BNOP bus_loop2?,5 + +bus_loop2_done?: + BNOP RA,5 + .endasmfunc +___ + +print $code; +close STDOUT; diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c index 5807e30ddd..5ff0fa7028 100644 --- a/crypto/cmac/cmac.c +++ b/crypto/cmac/cmac.c @@ -77,19 +77,17 @@ struct CMAC_CTX_st /* Make temporary keys K1 and K2 */ -static void make_kn(unsigned char *k1, unsigned char *l, int bl) +static void make_kn(unsigned char *k1, const unsigned char *l, int bl) { int i; + unsigned char c = l[0], carry = c>>7, cnext; + /* Shift block to left, including carry */ - for (i = 0; i < bl; i++) - { - k1[i] = l[i] << 1; - if (i < bl - 1 && l[i + 1] & 0x80) - k1[i] |= 1; - } + for (i = 0; i < bl-1; i++, c = cnext) + k1[i] = (c << 1) | ((cnext=l[i+1]) >> 7); + /* If MSB set fixup with R */ - if (l[0] & 0x80) - k1[bl - 1] ^= bl == 16 ? 
0x87 : 0x1b; + k1[i] = (c << 1) ^ ((0-carry)&(bl==16?0x87:0x1b)); } CMAC_CTX *CMAC_CTX_new(void) @@ -143,7 +141,8 @@ int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in) int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, const EVP_CIPHER *cipher, ENGINE *impl) { - static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH]; + __fips_constseg + static const unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH] = {0}; /* All zeros means restart */ if (!key && !cipher && !impl && keylen == 0) { @@ -152,6 +151,8 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, return 0; if (!M_EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) return 0; + memset(ctx->tbl, 0, M_EVP_CIPHER_CTX_block_size(&ctx->cctx)); + ctx->nlast_block = 0; return 1; } /* Initialiase context */ diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c index 524daf037d..cf96011cc5 100644 --- a/crypto/cryptlib.c +++ b/crypto/cryptlib.c @@ -297,7 +297,7 @@ void OPENSSL_showfatal (const char *fmta,...) DWORD out; va_start (ap,fmta); - len=_vsnprintf((char *)buf,sizeof(buf),fmt,ap); + len=_vsnprintf((char *)buf,sizeof(buf),fmta,ap); WriteFile(h,buf,len<0?sizeof(buf):(DWORD)len,&out,NULL); va_end (ap); return; @@ -359,7 +359,15 @@ void OPENSSL_showfatal (const char *fmta,...) { va_list ap; va_start (ap,fmta); +#if defined(OPENSSL_SYS_VXWORKS) + { + char buf[256]; + vsnprintf(buf,sizeof(buf),fmta,ap); + printf("%s",buf); + } +#else vfprintf (stderr,fmta,ap); +#endif va_end (ap); } int OPENSSL_isservice (void) { return 0; } @@ -374,7 +382,9 @@ void OpenSSLDie(const char *file,int line,const char *assertion) abort(); #else /* Win32 abort() customarily shows a dialog, but we just did that... */ +#ifdef SIGABRT raise(SIGABRT); +#endif _exit(3); #endif } diff --git a/crypto/des/spr.h b/crypto/des/spr.h index 9be0dce9f6..35e71a5118 100644 --- a/crypto/des/spr.h +++ b/crypto/des/spr.h @@ -56,6 +56,9 @@ * [including the GNU Public Licence.] 
*/ +#ifdef _TMS320C6X +# pragma DATA_SECTION(DES_SPtrans,".const:des_sptrans") +#endif __fips_constseg OPENSSL_GLOBAL const DES_LONG DES_SPtrans[8][64]={ { diff --git a/crypto/dsa/dsa.h b/crypto/dsa/dsa.h index 86766dacb4..408ee11b72 100644 --- a/crypto/dsa/dsa.h +++ b/crypto/dsa/dsa.h @@ -215,6 +215,11 @@ DSA_SIG * FIPS_dsa_sign_ctx(DSA *dsa, EVP_MD_CTX *ctx); int FIPS_dsa_verify_digest(DSA *dsa, const unsigned char *dig, int dlen, DSA_SIG *s); int FIPS_dsa_verify_ctx(DSA *dsa, EVP_MD_CTX *ctx, DSA_SIG *s); +int FIPS_dsa_verify(DSA *dsa, const unsigned char *msg, size_t msglen, + const EVP_MD *mhash, DSA_SIG *s); +DSA_SIG * FIPS_dsa_sign(DSA *dsa, const unsigned char *msg, size_t msglen, + const EVP_MD *mhash); + #endif DSA * DSA_new(void); diff --git a/crypto/dsa/dsa_gen.c b/crypto/dsa/dsa_gen.c index d5f4debc92..9e3e57a828 100644 --- a/crypto/dsa/dsa_gen.c +++ b/crypto/dsa/dsa_gen.c @@ -666,7 +666,13 @@ int dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N, /* "offset = offset + n + 1" */ /* step 14 */ - if (counter >= 4096) break; + if (counter >= (int)(4 * L)) break; + } + if (seed_in) + { + ok = 0; + DSAerr(DSA_F_DSA_BUILTIN_PARAMGEN2, DSA_R_INVALID_PARAMETERS); + goto err; } } end: diff --git a/crypto/ec/ec2_smpl.c b/crypto/ec/ec2_smpl.c index f37347b5e1..9a9476f0c1 100644 --- a/crypto/ec/ec2_smpl.c +++ b/crypto/ec/ec2_smpl.c @@ -556,7 +556,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_ field_sqr = group->meth->field_sqr; /* only support affine coordinates */ - if (!point->Z_is_one) goto err; + if (!point->Z_is_one) return -1; if (ctx == NULL) { diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c index f3331e1ce5..24ae707560 100644 --- a/crypto/ec/ec_key.c +++ b/crypto/ec/ec_key.c @@ -511,10 +511,12 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y) tx, ty, ctx)) goto err; } - /* Check if retrieved coordinates match originals: if not values - * are out of range. 
+ /* Check if retrieved coordinates match originals and are less than + * field order: if not values are out of range. */ - if (BN_cmp(x, tx) || BN_cmp(y, ty)) + if (BN_cmp(x, tx) || BN_cmp(y, ty) + || (BN_cmp(x, &key->group->field) >= 0) + || (BN_cmp(y, &key->group->field) >= 0)) { ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, EC_R_COORDINATES_OUT_OF_RANGE); diff --git a/crypto/ecdh/ecdh.h b/crypto/ecdh/ecdh.h index b4b58ee65b..8ac82b8cbd 100644 --- a/crypto/ecdh/ecdh.h +++ b/crypto/ecdh/ecdh.h @@ -85,6 +85,8 @@ extern "C" { #endif +#define EC_FLAG_COFACTOR_ECDH 0x1000 + const ECDH_METHOD *ECDH_OpenSSL(void); void ECDH_set_default_method(const ECDH_METHOD *); diff --git a/crypto/ecdh/ech_ossl.c b/crypto/ecdh/ech_ossl.c index 94a8f4b696..2656797449 100644 --- a/crypto/ecdh/ech_ossl.c +++ b/crypto/ecdh/ech_ossl.c @@ -146,6 +146,18 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key, } group = EC_KEY_get0_group(ecdh); + + if (EC_KEY_get_flags(ecdh) & EC_FLAG_COFACTOR_ECDH) + { + if (!EC_GROUP_get_cofactor(group, x, ctx) || + !BN_mul(x, x, priv_key, ctx)) + { + ECDHerr(ECDH_F_ECDH_COMPUTE_KEY, ERR_R_MALLOC_FAILURE); + goto err; + } + priv_key = x; + } + if ((tmp=EC_POINT_new(group)) == NULL) { ECDHerr(ECDH_F_ECDH_COMPUTE_KEY,ERR_R_MALLOC_FAILURE); diff --git a/crypto/ecdsa/ecdsa.h b/crypto/ecdsa/ecdsa.h index c3275b0839..cd6d19ccde 100644 --- a/crypto/ecdsa/ecdsa.h +++ b/crypto/ecdsa/ecdsa.h @@ -236,6 +236,11 @@ ECDSA_SIG * FIPS_ecdsa_sign_ctx(EC_KEY *key, EVP_MD_CTX *ctx); int FIPS_ecdsa_verify_digest(EC_KEY *key, const unsigned char *dig, int dlen, ECDSA_SIG *s); int FIPS_ecdsa_verify_ctx(EC_KEY *key, EVP_MD_CTX *ctx, ECDSA_SIG *s); +int FIPS_ecdsa_verify(EC_KEY *key, const unsigned char *msg, size_t msglen, + const EVP_MD *mhash, ECDSA_SIG *s); +ECDSA_SIG * FIPS_ecdsa_sign(EC_KEY *key, + const unsigned char *msg, size_t msglen, + const EVP_MD *mhash); #endif diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c index 
429255d215..6f77e7e4b9 100644 --- a/crypto/evp/e_aes.c +++ b/crypto/evp/e_aes.c @@ -89,6 +89,10 @@ typedef struct { AES_KEY ks1, ks2; /* AES key schedules to use */ XTS128_CONTEXT xts; + void (*stream)(const unsigned char *in, + unsigned char *out, size_t length, + const AES_KEY *key1, const AES_KEY *key2, + const unsigned char iv[16]); } EVP_AES_XTS_CTX; typedef struct @@ -123,6 +127,9 @@ void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *ivec, int enc); #endif #ifdef BSAES_ASM +void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, + unsigned char ivec[16], int enc); void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t len, const AES_KEY *key, const unsigned char ivec[16]); @@ -133,6 +140,19 @@ void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, const unsigned char ivec[AES_BLOCK_SIZE]); #endif +#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) +extern int OPENSSL_ppccap_P; +# define HWAES_CAPABLE (OPENSSL_ppccap_P & (1<<2)) +# define HWAES_set_encrypt_key aes_p8_set_encrypt_key +# define HWAES_set_decrypt_key aes_p8_set_decrypt_key +# define HWAES_encrypt aes_p8_encrypt +# define HWAES_decrypt aes_p8_decrypt +# define HWAES_cbc_encrypt aes_p8_cbc_encrypt +# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks +# define HWAES_xts_encrypt aes_p8_xts_encrypt +# define HWAES_xts_decrypt aes_p8_xts_decrypt +#endif + #if defined(AES_ASM) && !defined(I386_ONLY) && ( \ ((defined(__i386) || defined(__i386__) || \ defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \ @@ -337,11 +357,13 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, { aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); xctx->xts.block1 = (block128_f)aesni_encrypt; + xctx->stream = aesni_xts_encrypt; } else { aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); xctx->xts.block1 = 
(block128_f)aesni_decrypt; + xctx->stream = aesni_xts_decrypt; } aesni_set_encrypt_key(key + ctx->key_len/2, @@ -360,32 +382,9 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; } +#define aesni_xts_cipher aes_xts_cipher static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t len) - { - EVP_AES_XTS_CTX *xctx = ctx->cipher_data; - if (!xctx->xts.key1 || !xctx->xts.key2) - return -1; - if (!out || !in) - return -1; -#ifdef OPENSSL_FIPS - /* Requirement of SP800-38E */ - if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) && - (len > (1L<<20)*16)) - { - EVPerr(EVP_F_AESNI_XTS_CIPHER, EVP_R_TOO_LARGE); - return -1; - } -#endif - if (ctx->encrypt) - aesni_xts_encrypt(in, out, len, - xctx->xts.key1, xctx->xts.key2, ctx->iv); - else - aesni_xts_decrypt(in, out, len, - xctx->xts.key1, xctx->xts.key2, ctx->iv); - - return len; - } + const unsigned char *in, size_t len); static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc) @@ -485,6 +484,42 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ { return &aes_##keylen##_##mode; } #endif +#if defined(OPENSSL_CPUID_OBJ) && defined(__aarch64__) +#include "arm_arch.h" +#if __ARM_ARCH__>=7 +# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# define HWAES_set_encrypt_key aes_v8_set_encrypt_key +# define HWAES_set_decrypt_key aes_v8_set_decrypt_key +# define HWAES_encrypt aes_v8_encrypt +# define HWAES_decrypt aes_v8_decrypt +# define HWAES_cbc_encrypt aes_v8_cbc_encrypt +# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks +#endif +#endif + +#if defined(HWAES_CAPABLE) +int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +void HWAES_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void HWAES_decrypt(const 
unsigned char *in, unsigned char *out, + const AES_KEY *key); +void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, + unsigned char *ivec, const int enc); +void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *key, const unsigned char ivec[16]); +void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out, + size_t len, const AES_KEY *key1, + const AES_KEY *key2, const unsigned char iv[16]); +void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out, + size_t len, const AES_KEY *key1, + const AES_KEY *key2, const unsigned char iv[16]); + +#endif + #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ @@ -503,6 +538,28 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, mode = ctx->cipher->flags & EVP_CIPH_MODE; if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)HWAES_decrypt; + dat->stream.cbc = NULL; +#ifdef HWAES_cbc_encrypt + if (mode==EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; +#endif + } + else +#endif +#ifdef BSAES_CAPABLE + if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE) + { + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)AES_decrypt; + dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt; + } + else +#endif #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { @@ -522,6 +579,26 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, NULL; } else +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)HWAES_encrypt; + dat->stream.cbc = NULL; +#ifdef 
HWAES_cbc_encrypt + if (mode==EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; + else +#endif +#ifdef HWAES_ctr32_encrypt_blocks + if (mode==EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; + else +#endif + (void)0; /* terminate potentially open 'else' */ + } + else +#endif #ifdef BSAES_CAPABLE if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE) { @@ -800,6 +877,28 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) /* Extra padding: tag appended to record */ return EVP_GCM_TLS_TAG_LEN; + case EVP_CTRL_COPY: + { + EVP_CIPHER_CTX *out = ptr; + EVP_AES_GCM_CTX *gctx_out = out->cipher_data; + if (gctx->gcm.key) + { + if (gctx->gcm.key != &gctx->ks) + return 0; + gctx_out->gcm.key = &gctx_out->ks; + } + if (gctx->iv == c->iv) + gctx_out->iv = out->iv; + else + { + gctx_out->iv = OPENSSL_malloc(gctx->ivlen); + if (!gctx_out->iv) + return 0; + memcpy(gctx_out->iv, gctx->iv, gctx->ivlen); + } + return 1; + } + default: return -1; @@ -814,6 +913,21 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; if (key) { do { +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, + (block128_f)HWAES_encrypt); +#ifdef HWAES_ctr32_encrypt_blocks + gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; +#else + gctx->ctr = NULL; +#endif + break; + } + else +#endif #ifdef BSAES_CAPABLE if (BSAES_CAPABLE) { @@ -961,8 +1075,6 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, if (!gctx->iv_set) return -1; - if (!ctx->encrypt && gctx->taglen < 0) - return -1; if (in) { if (out == NULL) @@ -1004,6 +1116,8 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, { if (!ctx->encrypt) { + if (gctx->taglen < 0) + return -1; if (CRYPTO_gcm128_finish(&gctx->gcm, ctx->buf, gctx->taglen) != 0) return -1; @@ -1021,7 +1135,8 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char 
*out, #define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \ - | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT) + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \ + | EVP_CIPH_CUSTOM_COPY) BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM, EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS) @@ -1033,7 +1148,25 @@ BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM, static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) { EVP_AES_XTS_CTX *xctx = c->cipher_data; - if (type != EVP_CTRL_INIT) + if (type == EVP_CTRL_COPY) + { + EVP_CIPHER_CTX *out = ptr; + EVP_AES_XTS_CTX *xctx_out = out->cipher_data; + if (xctx->xts.key1) + { + if (xctx->xts.key1 != &xctx->ks1) + return 0; + xctx_out->xts.key1 = &xctx_out->ks1; + } + if (xctx->xts.key2) + { + if (xctx->xts.key2 != &xctx->ks2) + return 0; + xctx_out->xts.key2 = &xctx_out->ks2; + } + return 1; + } + else if (type != EVP_CTRL_INIT) return -1; /* key1 and key2 are used as an indicator both key and IV are set */ xctx->xts.key1 = NULL; @@ -1050,7 +1183,37 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, if (key) do { + xctx->stream = NULL; /* key_len is two AES keys */ +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + if (enc) + { + HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)HWAES_encrypt; +#ifdef HWAES_xts_encrypt + xctx->stream = HWAES_xts_encrypt; +#endif + } + else + { + HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)HWAES_decrypt; +#ifdef HWAES_xts_decrypt + xctx->stream = HWAES_xts_decrypt; +#endif + } + + HWAES_set_encrypt_key(key + ctx->key_len/2, + ctx->key_len * 4, &xctx->ks2); + xctx->xts.block2 = (block128_f)HWAES_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } + else +#endif #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { @@ -1105,28 +1268,32 @@ static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, { EVP_AES_XTS_CTX 
*xctx = ctx->cipher_data; if (!xctx->xts.key1 || !xctx->xts.key2) - return -1; + return 0; if (!out || !in) - return -1; + return 0; #ifdef OPENSSL_FIPS /* Requirement of SP800-38E */ if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) && - (len > (1L<<20)*16)) + (len > (1UL<<20)*16)) { EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE); - return -1; + return 0; } #endif - if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len, + if (xctx->stream) + (*xctx->stream)(in, out, len, + xctx->xts.key1, xctx->xts.key2, ctx->iv); + else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len, ctx->encrypt)) - return -1; - return len; + return 0; + return 1; } #define aes_xts_cleanup NULL #define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \ - | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT) + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \ + | EVP_CIPH_CUSTOM_COPY) BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS) BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS) @@ -1176,6 +1343,19 @@ static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) cctx->len_set = 0; return 1; + case EVP_CTRL_COPY: + { + EVP_CIPHER_CTX *out = ptr; + EVP_AES_CCM_CTX *cctx_out = out->cipher_data; + if (cctx->ccm.key) + { + if (cctx->ccm.key != &cctx->ks) + return 0; + cctx_out->ccm.key = &cctx_out->ks; + } + return 1; + } + default: return -1; @@ -1190,12 +1370,26 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; if (key) do { +#ifdef HWAES_CAPABLE + if (HWAES_CAPABLE) + { + HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks); + + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)HWAES_encrypt); + cctx->str = NULL; + cctx->key_set = 1; + break; + } + else +#endif #ifdef VPAES_CAPABLE if (VPAES_CAPABLE) { vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks); CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, &cctx->ks, 
(block128_f)vpaes_encrypt); + cctx->str = NULL; cctx->key_set = 1; break; } diff --git a/crypto/evp/evp_locl.h b/crypto/evp/evp_locl.h index 94162d6419..6d1753522a 100644 --- a/crypto/evp/evp_locl.h +++ b/crypto/evp/evp_locl.h @@ -75,7 +75,7 @@ static int cname##_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const uns return 1;\ } -#define EVP_MAXCHUNK ((size_t)1<<(sizeof(long)*8-2)) +#define EVP_MAXCHUNK ((size_t)1<<(sizeof(int)*8-2)) #define BLOCK_CIPHER_func_ofb(cname, cprefix, cbits, kstruct, ksched) \ static int cname##_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) \ diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile index 811969304f..5a170498bf 100644 --- a/crypto/modes/Makefile +++ b/crypto/modes/Makefile @@ -56,11 +56,16 @@ ghash-alpha.s: asm/ghash-alpha.pl $(PERL) $< | $(CC) -E - | tee $@ > /dev/null ghash-parisc.s: asm/ghash-parisc.pl $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ +ghashv8-armx.S: asm/ghashv8-armx.pl + $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@ +ghashp8-ppc.s: asm/ghashp8-ppc.pl + $(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@ # GNU make "catch all" ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ ghash-armv4.o: ghash-armv4.S +ghashv8-armx.o: ghashv8-armx.S files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl index d91586ee29..3799b2b559 100644 --- a/crypto/modes/asm/ghash-armv4.pl +++ b/crypto/modes/asm/ghash-armv4.pl @@ -57,8 +57,20 @@ # *native* byte order on current platform. See gcm128.c for working # example... 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $Xi="r0"; # argument block $Htbl="r1"; @@ -112,6 +124,11 @@ $code=<<___; .text .code 32 +#ifdef __APPLE__ +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif + .type rem_4bit,%object .align 5 rem_4bit: @@ -326,9 +343,9 @@ $code.=<<___; .align 4 gcm_gmult_neon: sub $Htbl,#16 @ point at H in GCM128_CTX - vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi + vld1.64 `&Dhi("$IN")`,[$Xi]! @ load Xi vmov.i32 $mod,#0xe1 @ our irreducible polynomial - vld1.64 `&Dlo("$IN")`,[$Xi,:64]! + vld1.64 `&Dlo("$IN")`,[$Xi]! vshr.u64 $mod,#32 vldmia $Htbl,{$Hhi-$Hlo} @ load H veor $zero,$zero @@ -349,9 +366,9 @@ gcm_gmult_neon: .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: - vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi + vld1.64 `&Dhi("$Z")`,[$Xi]! @ load Xi vmov.i32 $mod,#0xe1 @ our irreducible polynomial - vld1.64 `&Dlo("$Z")`,[$Xi,:64]! + vld1.64 `&Dlo("$Z")`,[$Xi]! vshr.u64 $mod,#32 vldmia $Xi,{$Hhi-$Hlo} @ load H veor $zero,$zero @@ -410,8 +427,8 @@ gcm_ghash_neon: vrev64.8 $Z,$Z #endif sub $Xi,#16 - vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi - vst1.64 `&Dlo("$Z")`,[$Xi,:64] + vst1.64 `&Dhi("$Z")`,[$Xi]! 
@ write out Xi + vst1.64 `&Dlo("$Z")`,[$Xi] bx lr .size gcm_ghash_neon,.-gcm_ghash_neon diff --git a/crypto/modes/asm/ghash-c64xplus.pl b/crypto/modes/asm/ghash-c64xplus.pl new file mode 100644 index 0000000000..1ac4d927d0 --- /dev/null +++ b/crypto/modes/asm/ghash-c64xplus.pl @@ -0,0 +1,231 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# December 2011 +# +# The module implements GCM GHASH function and underlying single +# multiplication operation in GF(2^128). Even though subroutines +# have _4bit suffix, they are not using any tables, but rely on +# hardware Galois Field Multiply support. Streamed GHASH processes +# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven +# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are +# comparing apples vs. oranges, but compiler surely could have done +# better, because theoretical [though not necessarily achievable] +# estimate for "4-bit" table-driven implementation is ~12 cycles. 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments + +($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3, + $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27)); +($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y, + $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27)); +($FF000000,$E10000)=("B30","B31"); +($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len + $xia="A9"; +($rem,$res)=("B4","B5"); # $rem zaps $Htable + +$code.=<<___; + .text + + .asg B3,RA + + .if 0 + .global _gcm_gmult_1bit +_gcm_gmult_1bit: + ADDAD $Htable,2,$Htable + .endif + .global _gcm_gmult_4bit +_gcm_gmult_4bit: + .asmfunc + LDDW *${Htable}[-1],$H1:$H0 ; H.lo + LDDW *${Htable}[-2],$H3:$H2 ; H.hi +|| MV $Xip,${xip} ; reassign Xi +|| MVK 15,B1 ; SPLOOPD constant + + MVK 0xE1,$E10000 +|| LDBU *++${xip}[15],$x1 ; Xi[15] + MVK 0xFF,$FF000000 +|| LDBU *--${xip},$x0 ; Xi[14] + SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial + SHL $FF000000,24,$FF000000 ; upper byte mask +|| BNOP ghash_loop? 
+|| MVK 1,B0 ; take a single spin + + PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes + AND $H2,$FF000000,$H2u ; H2's upper byte + AND $H3,$FF000000,$H3u ; H3's upper byte +|| SHRU $H2u,8,$H2u + SHRU $H3u,8,$H3u +|| ZERO $Z1:$Z0 + SHRU2 $xia,8,$H01u +|| ZERO $Z3:$Z2 + .endasmfunc + + .global _gcm_ghash_4bit +_gcm_ghash_4bit: + .asmfunc + LDDW *${Htable}[-1],$H1:$H0 ; H.lo +|| SHRU $len,4,B0 ; reassign len + LDDW *${Htable}[-2],$H3:$H2 ; H.hi +|| MV $Xip,${xip} ; reassign Xi +|| MVK 15,B1 ; SPLOOPD constant + + MVK 0xE1,$E10000 +|| [B0] LDNDW *${inp}[1],$H1x:$H0x + MVK 0xFF,$FF000000 +|| [B0] LDNDW *${inp}++[2],$H3x:$H2x + SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial +|| LDDW *${xip}[1],$Z1:$Z0 + SHL $FF000000,24,$FF000000 ; upper byte mask +|| LDDW *${xip}[0],$Z3:$Z2 + + PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes + AND $H2,$FF000000,$H2u ; H2's upper byte + AND $H3,$FF000000,$H3u ; H3's upper byte +|| SHRU $H2u,8,$H2u + SHRU $H3u,8,$H3u + SHRU2 $xia,8,$H01u + +|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + .if .LITTLE_ENDIAN + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z1,16,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .else + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z0,8,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .endif + STDW $Z3:$Z2,*${xip}[0] +|| [B0] ZERO $Z3:$Z2 +|| [B0] MV $xia,$x1 + [B0] ADDK 14,${xip} + +ghash_loop?: + SPLOOPD 6 ; 6*16+7 +|| MVC B1,ILC +|| [B0] SUB B0,1,B0 +|| ZERO A0 +|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib +|| SHL $x1,1,$xia +___ + +########____________________________ +# 0 D2. M1 M2 | +# 1 M1 | +# 2 M1 M2 | +# 3 D1. M1 M2 | +# 4 S1. L1 | +# 5 S2 S1x L1 D2 L2 |____________________________ +# 6/0 L1 S1 L2 S2x |D2. M1 M2 | +# 7/1 L1 S1 D1x S2 M2 | M1 | +# 8/2 S1 L1x S2 | M1 M2 | +# 9/3 S1 L1x | D1. 
M1 M2 | +# 10/4 D1x | S1. L1 | +# 11/5 |S2 S1x L1 D2 L2 |____________ +# 12/6/0 D1x __| L1 S1 L2 S2x |D2. .... +# 7/1 L1 S1 D1x S2 M2 | .... +# 8/2 S1 L1x S2 | .... +#####... ................|............ +$code.=<<___; + XORMPY $H0,$xia,$H0x ; 0 ; H·Xi[i] +|| XORMPY $H01u,$xib,$H01y +|| [A0] LDBU *--${xip},$x0 + XORMPY $H1,$xia,$H1x ; 1 + XORMPY $H2,$xia,$H2x ; 2 +|| XORMPY $H2u,$xib,$H2y + XORMPY $H3,$xia,$H3x ; 3 +|| XORMPY $H3u,$xib,$H3y +||[!A0] MVK.D 15,A0 ; *--${xip} counter + XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·Xi[i] +|| [A0] SUB.S A0,1,A0 + XOR.L $H1x,$Z1,$Z1 ; 5 +|| AND.D $H01y,$FF000000,$H0z +|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y +|| SHL $x0,1,$xib +|| SHL $x0,1,$xia + + XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue +|| SHL $Z0,1,$rem ; ; rem=Z<<1 +|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8 +|| AND.L $H1y,$FF000000,$H1z + XOR.L $H3x,$Z3,$Z3 ; 7/1 +|| SHRMB.S $Z2,$Z1,$Z1 +|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products +|| AND.S $H2y,$FF000000,$H2z +|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE + XOR.L $H1z,$Z1,$Z1 ; 8/2 +|| SHRMB.S $Z3,$Z2,$Z2 +|| AND.S $H3y,$FF000000,$H3z + XOR.L $H2z,$Z2,$Z2 ; 9/3 +|| SHRU $Z3,8,$Z3 + XOR.D $H3z,$Z3,$Z3 ; 10/4 + NOP ; 11/5 + + SPKERNEL 0,2 +|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res + + ; input pre-fetch is possible where D1 slot is available... + [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/- + [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/- + NOP ; 10/- + .if .LITTLE_ENDIAN + SWAP2 $Z0,$Z1 ; 11/- +|| SWAP4 $Z1,$Z0 + SWAP4 $Z1,$Z1 ; 12/- +|| SWAP2 $Z0,$Z0 + SWAP2 $Z2,$Z3 +|| SWAP4 $Z3,$Z2 +||[!B0] BNOP RA + SWAP4 $Z3,$Z3 +|| SWAP2 $Z2,$Z2 +|| [B0] BNOP ghash_loop? + [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z1,16,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .else + [!B0] BNOP RA ; 11/- + [B0] BNOP ghash_loop? 
; 12/- + [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z0,8,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .endif + STDW $Z3:$Z2,*${xip}[0] +|| [B0] ZERO $Z3:$Z2 +|| [B0] MV $xia,$x1 + [B0] ADDK 14,${xip} + .endasmfunc + + .sect .const + .cstring "GHASH for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl new file mode 100755 index 0000000000..82bf125eb1 --- /dev/null +++ b/crypto/modes/asm/ghashp8-ppc.pl @@ -0,0 +1,663 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for further improvement. + +# May 2016 +# +# 2x aggregated reduction improves performance by 50% (resulting +# performance on POWER8 is 1 cycle per processed byte), and 4x +# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). 
+ +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; + $UCMP="cmpld"; + $SHRI="srdi"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; + $UCMP="cmplw"; + $SHRI="srwi"; +} else { die "nonsense $flavour"; } + +$sp="r1"; +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; # "or", not "||": "||" would bind to the command string and die could never run + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); +my $vrsave="r12"; + +$code=<<___; +.machine "any" + +.text + +.globl .gcm_init_p8 +.align 5 +.gcm_init_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $IN,$H,$t1 # twisted H + + vsldoi $H,$IN,$IN,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... 
and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + li r8,0x40 + stvx_u $H, r9,r3 + li r9,0x50 + stvx_u $Hh,r10,r3 + li r10,0x60 + + vpmsumd $Xl,$IN,$Hl # H.lo·H.lo + vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi + vpmsumd $Xh,$IN,$Hh # H.hi·H.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $IN1,$Xl,$t1 + + vsldoi $H2,$IN1,$IN1,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $H2l,r8,r3 # save H^2 + li r8,0x70 + stvx_u $H2,r9,r3 + li r9,0x80 + stvx_u $H2h,r10,r3 + li r10,0x90 +___ +{ +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); +$code.=<<___; + vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo + vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo + vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi + vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi + vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi + vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vsldoi $t4,$Xm1,$zero,8 + vsldoi $t5,$zero,$Xm1,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + vxor $Xl1,$Xl1,$t4 + vxor $Xh1,$Xh1,$t5 + + vsldoi $Xl,$Xl,$Xl,8 + vsldoi $Xl1,$Xl1,$Xl1,8 + vxor $Xl,$Xl,$t2 + vxor $Xl1,$Xl1,$t6 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vpmsumd $Xl1,$Xl1,$xC2 + vxor $t1,$t1,$Xh + vxor $t5,$t5,$Xh1 + vxor $Xl,$Xl,$t1 + vxor $Xl1,$Xl1,$t5 + + vsldoi $H,$Xl,$Xl,8 + vsldoi $H2,$Xl1,$Xl1,8 + vsldoi $Hl,$zero,$H,8 + vsldoi $Hh,$H,$zero,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $Hl,r8,r3 # save H^3 + li r8,0xa0 + stvx_u $H,r9,r3 + li r9,0xb0 + stvx_u $Hh,r10,r3 + li r10,0xc0 + stvx_u $H2l,r8,r3 # save H^4 + stvx_u $H2,r9,r3 + stvx_u 
$H2h,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_p8,.-.gcm_init_p8 +___ +} +$code.=<<___; +.globl .gcm_gmult_p8 +.align 5 +.gcm_gmult_p8: + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $IN,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $IN,$IN,$IN,$lemask + vxor $zero,$zero,$zero + + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl .gcm_ghash_p8 +.align 5 +.gcm_ghash_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $Xl,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + li r8,0x40 + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + li r9,0x50 + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + li r10,0x60 + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $Xl,$Xl,$Xl,$lemask + vxor $zero,$zero,$zero + + ${UCMP}i $len,64 + bge Lgcm_ghash_p8_4x + + lvx_u $IN,0,$inp + addi $inp,$inp,16 + subic. 
$len,$len,16 + le?vperm $IN,$IN,$IN,$lemask + vxor $IN,$IN,$Xl + beq Lshort + + lvx_u $H2l,r8,$Htbl # load H^2 + li r8,16 + lvx_u $H2, r9,$Htbl + add r9,$inp,$len # end of input + lvx_u $H2h,r10,$Htbl + be?b Loop_2x + +.align 5 +Loop_2x: + lvx_u $IN1,0,$inp + le?vperm $IN1,$IN1,$IN1,$lemask + + subic $len,$len,32 + vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo + vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo + subfe r0,r0,r0 # borrow?-1:0 + vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi + vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi + and r0,r0,$len + vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi + vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi + add $inp,$inp,r0 + + vxor $Xl,$Xl,$Xl1 + vxor $Xm,$Xm,$Xm1 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh1 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + lvx_u $IN,r8,$inp + addi $inp,$inp,32 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + le?vperm $IN,$IN,$IN,$lemask + vxor $t1,$t1,$Xh + vxor $IN,$IN,$t1 + vxor $IN,$IN,$Xl + $UCMP r9,$inp + bgt Loop_2x # done yet? 
+ + cmplwi $len,0 + bne Leven + +Lshort: + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + +Leven: + vxor $Xl,$Xl,$t1 + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +___ +{ +my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h, + $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31)); +my $IN0=$IN; +my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h); + +$code.=<<___; +.align 5 +.gcm_ghash_p8_4x: +Lgcm_ghash_p8_4x: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + li r10,0x60 + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME-4`($sp) # save vrsave + mtspr 256,r0 # preserve all AltiVec registers + + lvsl $t0,0,r8 # 0x0001..0e0f + #lvx_u $H2l,r8,$Htbl # load H^2 + li r8,0x70 + lvx_u $H2, r9,$Htbl + li r9,0x80 + vspltisb $t1,8 # 0x0808..0808 + #lvx_u $H2h,r10,$Htbl + li r10,0x90 + lvx_u $H3l,r8,$Htbl # load H^3 + li r8,0xa0 + lvx_u $H3, r9,$Htbl + li r9,0xb0 + lvx_u $H3h,r10,$Htbl + li r10,0xc0 + lvx_u $H4l,r8,$Htbl # load H^4 + li r8,0x10 + lvx_u $H4, r9,$Htbl + li r9,0x20 + lvx_u $H4h,r10,$Htbl + li r10,0x30 + + vsldoi $t2,$zero,$t1,8 # 0x0000..0808 + vaddubm $hiperm,$t0,$t2 # 0x0001..1617 + vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f + + $SHRI $len,$len,4 
# this allows to use sign bit + # as carry + lvx_u $IN0,0,$inp # load input + lvx_u $IN1,r8,$inp + subic. $len,$len,8 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + + vxor $Xh,$IN0,$Xl + + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vperm $H21l,$H2,$H,$hiperm + vperm $t0,$IN2,$IN3,$loperm + vperm $H21h,$H2,$H,$loperm + vperm $t1,$IN2,$IN3,$hiperm + vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi + + vxor $Xm2,$Xm2,$Xm1 + vxor $Xl3,$Xl3,$Xl1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh3,$Xh3,$Xh1 + + blt Ltail_4x + +Loop_4x: + lvx_u $IN0,0,$inp + lvx_u $IN1,r8,$inp + subic. $len,$len,4 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + le?vperm $IN0,$IN0,$IN0,$lemask + + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + vxor $Xh,$Xh,$Xh3 + vperm $t0,$IN2,$IN3,$loperm + vperm $t1,$IN2,$IN3,$hiperm + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xl,$Xl,$xC2 + + vxor $Xl3,$Xl3,$Xl1 + vxor $Xh3,$Xh3,$Xh1 + vxor $Xh,$Xh,$IN0 + 
vxor $Xm2,$Xm2,$Xm1 + vxor $Xh,$Xh,$t1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh,$Xh,$Xl + bge Loop_4x + +Ltail_4x: + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh3 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + addic. $len,$len,4 + beq Ldone_4x + + lvx_u $IN0,0,$inp + ${UCMP}i $len,2 + li $len,-4 + blt Lone + lvx_u $IN1,r8,$inp + beq Ltwo + +Lthree: + lvx_u $IN2,r9,$inp + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + + vxor $Xh,$IN0,$Xl + vmr $H4l,$H3l + vmr $H4, $H3 + vmr $H4h,$H3h + + vperm $t0,$IN1,$IN2,$loperm + vperm $t1,$IN1,$IN2,$hiperm + vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo + vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi + + vxor $Xm3,$Xm3,$Xm2 + b Ltail_4x + +.align 4 +Ltwo: + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + + vxor $Xh,$IN0,$Xl + vperm $t0,$zero,$IN1,$loperm + vperm $t1,$zero,$IN1,$hiperm + + vsldoi $H4l,$zero,$H2,8 + vmr $H4, $H2 + vsldoi $H4h,$H2,$zero,8 + + vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo + vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi + vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi + + b Ltail_4x + +.align 4 +Lone: + le?vperm $IN0,$IN0,$IN0,$lemask + + vsldoi $H4l,$zero,$H,8 + vmr $H4, $H + vsldoi $H4h,$H,$zero,8 + + vxor $Xh,$IN0,$Xl + vxor $Xl3,$Xl3,$Xl3 + vxor $Xm3,$Xm3,$Xm3 + vxor $Xh3,$Xh3,$Xh3 + + b Ltail_4x + +Ldone_4x: + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtspr 256,$vrsave 
+ lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,0,0x80,0,4,0 + .long 0 +___ +} +$code.=<<___; +.size .gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o; + } else { + s/le\?/#le#/o or + s/be\?//o; + } + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl new file mode 100644 index 0000000000..300e8d56cb --- /dev/null +++ b/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,376 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel from bits-n-pieces from +# other assembly modules. Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# July 2014 +# +# Implement 2x aggregated reduction [see ghash-x86.pl for background +# information]. 
+# +# Current performance in cycles per processed byte: +# +# PMULL[2] 32-bit NEON(*) +# Apple A7 0.92 5.62 +# Cortex-A53 1.01 8.39 +# Cortex-A57 1.17 7.61 +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # fail loudly if the translator pipe cannot be started +*STDOUT=*OUT; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +$code.=<<___; +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + vld1.64 {$t1},[x1] @ load H + vmov.i8 $xC2,#0xe1 + vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 + vext.8 $IN,$t1,$t1,#8 + vshr.u64 $t2,$xC2,#63 + vdup.32 $t1,${t1}[1] + vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 + vshr.u64 $t2,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t2,$t2,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t2,$t2,$t2,#8 + vand $t0,$t0,$t1 + vorr $IN,$IN,$t2 @ H<<<=1 + veor $H,$IN,$t0 @ twisted H + vst1.64 {$H},[x0],#16 + + @ calculate H^2 + vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing + vpmull.p64 $Xl,$H,$H + veor $t0,$t0,$H + vpmull2.p64 $Xh,$H,$H + vpmull.p64 $Xm,$t0,$t0 + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $H2,$Xl,$t2 + + vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing + veor $t1,$t1,$H2 + vext.8 $Hhl,$t0,$t1,#8 @ pack 
Karatsuba pre-processed + vst1.64 {$Hhl-$H2},[x0] + + ret +.size gcm_init_v8,.-gcm_init_v8 + +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $xC2,#0xe1 + vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... + vshl.u64 $xC2,$xC2,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $IN,$t1,$t1,#8 + + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_gmult_v8,.-gcm_gmult_v8 + +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: +___ +$code.=<<___ if ($flavour !~ /64/); + vstmdb sp!,{d8-d15} +___ +$code.=<<___; + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + subs $len,$len,#32 + vmov.i8 $xC2,#0xe1 + mov $inc,#16 + vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2 + vld1.64 {$H2},[$Htbl] + cclr $inc,eq + vext.8 $Xl,$Xl,$Xl,#8 + vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0] + vshl.u64 $xC2,$xC2,#57 @ 0xc2.0 +#ifndef __ARMEB__ + vrev64.8 $t0,$t0 + vrev64.8 $Xl,$Xl +#endif + vext.8 $IN,$t0,$t0,#8 + b.lo .Lodd_tail_v8 +___ +{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7)); + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # +$code.=<<___; + vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1] +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $In,$t1,$t1,#8 + veor $IN,$IN,$Xl @ I[i]^=Xi + vpmull.p64 $Xln,$H,$In @ 
H·Ii+1 + veor $t1,$t1,$In @ Karatsuba pre-processing + vpmull2.p64 $Xhn,$H,$In + b .Loop_mod2x_v8 + +.align 4 +.Loop_mod2x_v8: + vext.8 $t2,$IN,$IN,#8 + subs $len,$len,#32 + vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo + cclr $inc,lo + + vpmull.p64 $Xmn,$Hhl,$t1 + veor $t2,$t2,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi + veor $Xl,$Xl,$Xln @ accumulate + vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i] + + veor $Xh,$Xh,$Xhn + cclr $inc,eq + veor $Xm,$Xm,$Xmn + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+1] +#ifndef __ARMEB__ + vrev64.8 $t0,$t0 +#endif + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + vext.8 $In,$t1,$t1,#8 + vext.8 $IN,$t0,$t0,#8 + veor $Xl,$Xm,$t2 + vpmull.p64 $Xln,$H,$In @ H·Ii+1 + veor $IN,$IN,$Xh @ accumulate $IN early + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $IN,$IN,$t2 + veor $t1,$t1,$In @ Karatsuba pre-processing + veor $IN,$IN,$Xl + vpmull2.p64 $Xhn,$H,$In + b.hs .Loop_mod2x_v8 + + veor $Xh,$Xh,$t2 + vext.8 $IN,$t0,$t0,#8 @ re-construct $IN + adds $len,$len,#32 + veor $Xl,$Xl,$Xh @ re-construct $Xl + b.eq .Ldone_v8 +___ +} +$code.=<<___; +.Lodd_tail_v8: + vext.8 $t2,$Xl,$Xl,#8 + veor $IN,$IN,$Xl @ inp^=Xi + veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi + + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$xC2 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl + veor $Xl,$Xm,$t2 + + vext.8 $t2,$Xl,$Xl,#8 @ 
2nd phase + vpmull.p64 $Xl,$Xl,$xC2 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + +.Ldone_v8: +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} +___ +$code.=<<___; + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + # fix up remainig legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. 
+ # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; # enforce flush diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c index e638e42be8..a46ec61135 100644 --- a/crypto/modes/gcm128.c +++ b/crypto/modes/gcm128.c @@ -645,7 +645,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) #endif -#if TABLE_BITS==4 && defined(GHASH_ASM) +#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ)) # if !defined(I386_ONLY) && \ (defined(__i386) || defined(__i386__) || \ defined(__x86_64) || defined(__x86_64__) || \ @@ -666,14 +666,33 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); # endif -# elif defined(__arm__) || defined(__arm) +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__) # include "arm_arch.h" # if __ARM_ARCH__>=7 # define GHASH_ASM_ARM # define GCM_FUNCREF_4BIT +# if defined(__aarch64__) +# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL) +# endif +# if defined(__arm__) || defined(__arm) +# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) +# endif void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); void 
gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +void gcm_init_v8(u128 Htable[16],const u64 Xi[2]); +void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); # endif +# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) +# define GHASH_ASM_PPC +# define GCM_FUNCREF_4BIT +extern int OPENSSL_ppccap_P; +void gcm_init_p8(u128 Htable[16], const u64 Xi[2]); +void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]); +void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp, + size_t len); +# elif defined(_TMS320C6400_PLUS) +# define GHASH_ASM_C64Xplus # endif #endif @@ -738,14 +757,38 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) ctx->ghash = gcm_ghash_4bit; # endif # elif defined(GHASH_ASM_ARM) - if (OPENSSL_armcap_P & ARMV7_NEON) { +# ifdef PMULL_CAPABLE + if (PMULL_CAPABLE) { + gcm_init_v8(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_v8; + ctx->ghash = gcm_ghash_v8; + } else +# endif +# ifdef NEON_CAPABLE + if (NEON_CAPABLE) { ctx->gmult = gcm_gmult_neon; ctx->ghash = gcm_ghash_neon; - } else { + } else +# endif + { gcm_init_4bit(ctx->Htable,ctx->H.u); ctx->gmult = gcm_gmult_4bit; ctx->ghash = gcm_ghash_4bit; } +# elif defined(GHASH_ASM_PPC) + if (OPENSSL_ppccap_P & (1<<2)) { + gcm_init_p8(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_p8; + ctx->ghash = gcm_ghash_p8; + } else { + gcm_init_4bit(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; + } +# elif defined(GHASH_ASM_C64Xplus) + /* C64x+ assembler doesn't use tables, skip gcm_init_4bit. + * This is likely to trigger "function never referenced" + * warning and code being eliminated. 
*/ # else gcm_init_4bit(ctx->Htable,ctx->H.u); # endif @@ -1397,7 +1440,7 @@ int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; #endif - if (ctx->mres) + if (ctx->mres || ctx->ares) GCM_MUL(ctx,Xi); if (is_endian.little) { diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h index 4dab6a67fe..fa5d3b02f6 100644 --- a/crypto/modes/modes_lcl.h +++ b/crypto/modes/modes_lcl.h @@ -29,10 +29,7 @@ typedef unsigned char u8; #if defined(__i386) || defined(__i386__) || \ defined(__x86_64) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) || \ - ( (defined(__arm__) || defined(__arm)) && \ - (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ - defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) + defined(__s390__) || defined(__s390x__) # undef STRICT_ALIGNMENT #endif diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl new file mode 100644 index 0000000000..22dc7e4ecc --- /dev/null +++ b/crypto/perlasm/arm-xlate.pl @@ -0,0 +1,165 @@ +#!/usr/bin/env perl + +# ARM assembler distiller by . 
+ +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" or die "can't open $output: $!" if (defined($output) && length($output)); # "or", not "||": "||" bound to the always-true filename string, so a failed open was never reported; with no output argument STDOUT is left as inherited + +$flavour = "linux32" if (!$flavour or $flavour eq "void"); + +my %GLOBALS; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { + if ($flavour =~ /linux/) { ".arch\t".join(',',@_); } + else { ""; } +}; +my $fpu = sub { + if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); } + else { ""; } +}; +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } + else { ".hidden\t".join(',',@_); } +}; +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0"; + $name = "_$name"; + } else { $ret = ".comm\t".join(',',@args); } + + $$global = $name; + $ret; +}; +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; + last; + }; + } + + $ret = ".globl $name" if (!$ret); + $$global = $name; + $ret; +}; +my $global = $globl; +my $extern = sub { + &$globl(@_); + return; # return nothing +}; +my $type = sub { + if ($flavour =~ /linux/) { ".type\t".join(',',@_); } + else { ""; } +}; +my $size = sub { + if ($flavour =~ /linux/) { ".size\t".join(',',@_); } + else { ""; } +}; +my $inst = sub { + if ($flavour =~ /linux/) { ".inst\t".join(',',@_); } + else { ".long\t".join(',',@_); } +}; +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; } + else + { ""; } +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + return $line; +} + +while($line=<>) { + + if ($line =~ m/^\s*(#|@|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + printf "%s:",($GLOBALS{$label} or $label); + } + } + + if ($line !~ m/^[#@]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg); + } + } + + print $line if ($line); + print "\n"; +} + +close STDOUT; diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl index a3edd982b6..0f46cf06bc 100755 --- a/crypto/perlasm/ppc-xlate.pl +++ b/crypto/perlasm/ppc-xlate.pl @@ -27,7 +27,8 @@ my $globl = sub { /osx/ && do { $name = "_$name"; last; }; - /linux.*32/ && do { $ret .= ".globl $name\n"; + /linux.*(32|64le)/ + && do { $ret .= ".globl $name\n"; $ret .= 
".type $name,\@function"; last; }; @@ -37,7 +38,6 @@ my $globl = sub { $ret .= ".align 3\n"; $ret .= "$name:\n"; $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; - $ret .= ".size $name,24\n"; $ret .= ".previous\n"; $name = ".$name"; @@ -50,7 +50,9 @@ my $globl = sub { $ret; }; my $text = sub { - ($flavour =~ /aix/) ? ".csect" : ".text"; + my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; + $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/); + $ret; }; my $machine = sub { my $junk = shift; @@ -62,9 +64,12 @@ my $machine = sub { ".machine $arch"; }; my $size = sub { - if ($flavour =~ /linux.*32/) + if ($flavour =~ /linux/) { shift; - ".size " . join(",",@_); + my $name = shift; $name =~ s|^[\.\_]||; + my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name; + $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/); + $ret; } else { ""; } @@ -77,6 +82,25 @@ my $asciz = sub { else { ""; } }; +my $quad = sub { + shift; + my @ret; + my ($hi,$lo); + for (@_) { + if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) + { $hi=$1?"0x$1":"0"; $lo="0x$2"; } + elsif (/^([0-9]+)$/o) + { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl + else + { $hi=undef; $lo=$_; } + + if (defined($hi)) + { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); } + else + { push(@ret,".quad $lo"); } + } + join("\n",@ret); +}; ################################################################ # simplified mnemonics not handled by at least one assembler @@ -122,6 +146,66 @@ my $extrdi = sub { $b = ($b+$n)&63; $n = 64-$n; " rldicl $ra,$rs,$b,$n"; }; +my $vmr = sub { + my ($f,$vx,$vy) = @_; + " vor $vx,$vy,$vy"; +}; + +# Some ABIs specify vrsave, special-purpose register #256, as reserved +# for system use. 
+my $no_vrsave = ($flavour =~ /aix|linux64le/); +my $mtspr = sub { + my ($f,$idx,$ra) = @_; + if ($idx == 256 && $no_vrsave) { + " or $ra,$ra,$ra"; + } else { + " mtspr $idx,$ra"; + } +}; +my $mfspr = sub { + my ($f,$rd,$idx) = @_; + if ($idx == 256 && $no_vrsave) { + " li $rd,-1"; + } else { + " mfspr $rd,$idx"; + } +}; + +# PowerISA 2.06 stuff +sub vsxmem_op { + my ($f, $vrt, $ra, $rb, $op) = @_; + " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1); +} +# made-up unaligned memory reference AltiVec/VMX instructions +my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x +my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x +my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx +my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx +my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x +my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x + +# PowerISA 2.07 stuff +sub vcrypto_op { + my ($f, $vrt, $vra, $vrb, $op) = @_; + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; +} +my $vcipher = sub { vcrypto_op(@_, 1288); }; +my $vcipherlast = sub { vcrypto_op(@_, 1289); }; +my $vncipher = sub { vcrypto_op(@_, 1352); }; +my $vncipherlast= sub { vcrypto_op(@_, 1353); }; +my $vsbox = sub { vcrypto_op(@_, 0, 1480); }; +my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); }; +my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); }; +my $vpmsumb = sub { vcrypto_op(@_, 1032); }; +my $vpmsumd = sub { vcrypto_op(@_, 1224); }; +my $vpmsubh = sub { vcrypto_op(@_, 1096); }; +my $vpmsumw = sub { vcrypto_op(@_, 1160); }; +my $vaddudm = sub { vcrypto_op(@_, 192); }; + +my $mtsle = sub { + my ($f, $arg) = @_; + " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2); +}; while($line=<>) { @@ -138,7 +222,10 @@ while($line=<>) { { $line =~ s|(^[\.\w]+)\:\s*||; my $label = $1; - printf "%s:",($GLOBALS{$label} or $label) if ($label); + if ($label) { + printf "%s:",($GLOBALS{$label} or $label); + 
printf "\n.localentry\t$GLOBALS{$label},0" if ($GLOBALS{$label} && $flavour =~ /linux.*64le/); + } } { @@ -147,7 +234,7 @@ while($line=<>) { my $mnemonic = $2; my $f = $3; my $opcode = eval("\$$mnemonic"); - $line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/); + $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/); if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } } diff --git a/crypto/perlasm/x86gas.pl b/crypto/perlasm/x86gas.pl index d0b7ae27ae..263182b985 100644 --- a/crypto/perlasm/x86gas.pl +++ b/crypto/perlasm/x86gas.pl @@ -45,10 +45,8 @@ sub ::generic undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o); if ($#_==0) { &::emit($opcode); } - elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); } - elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); } - elsif ($opcode eq "clflush" && $#_==1){ &::emit($opcode,@arg); } - elsif ($opcode =~ m/^set/&& $#_==1) { &::emit($opcode,@arg); } + elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o) + { &::emit($opcode,@arg); } else { &::emit($opcode.$suffix,@arg);} 1; diff --git a/crypto/ppccap.c b/crypto/ppccap.c index ab89ccaa12..675630e41b 100644 --- a/crypto/ppccap.c +++ b/crypto/ppccap.c @@ -3,13 +3,24 @@ #include #include #include +#include +#if defined(__linux) || defined(_AIX) +# include +#endif +#if defined(_AIX53) /* defined even on post-5.3 */ +# include +# if !defined(__power_set) +# define __power_set(a) (_system_configuration.implementation & (a)) +# endif +#endif #include #include #define PPC_FPU64 (1<<0) #define PPC_ALTIVEC (1<<1) +#define PPC_CRYPTO207 (1<<2) -static int OPENSSL_ppccap_P = 0; +int OPENSSL_ppccap_P = 0; static sigset_t all_masked; @@ -49,10 +60,28 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U } #endif +void sha256_block_p8(void *ctx, const void *inp, size_t len); +void sha256_block_ppc(void *ctx, const 
void *inp, size_t len); +void sha256_block_data_order(void *ctx, const void *inp, size_t len) +{ + OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha256_block_p8(ctx, inp, len) : + sha256_block_ppc(ctx, inp, len); +} + +void sha512_block_p8(void *ctx, const void *inp, size_t len); +void sha512_block_ppc(void *ctx, const void *inp, size_t len); +void sha512_block_data_order(void *ctx, const void *inp, size_t len) +{ + OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha512_block_p8(ctx, inp, len) : + sha512_block_ppc(ctx, inp, len); +} + static sigjmp_buf ill_jmp; static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } void OPENSSL_ppc64_probe(void); +void OPENSSL_altivec_probe(void); +void OPENSSL_crypto207_probe(void); void OPENSSL_cpuid_setup(void) { @@ -82,6 +111,45 @@ void OPENSSL_cpuid_setup(void) OPENSSL_ppccap_P = 0; +#if defined(_AIX) + if (sizeof(size_t) == 4) { + struct utsname uts; +# if defined(_SC_AIX_KERNEL_BITMODE) + if (sysconf(_SC_AIX_KERNEL_BITMODE) != 64) + return; +# endif + if (uname(&uts) != 0 || atoi(uts.version) < 6) + return; + } + +# if defined(__power_set) + /* + * Value used in __power_set is a single-bit 1<n, ret); - if (BN_cmp(ret, f)) + if (BN_cmp(ret, f) > 0) res = f; else res = ret; diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile index b1582f2cff..78c9315c31 100644 --- a/crypto/sha/Makefile +++ b/crypto/sha/Makefile @@ -73,6 +73,8 @@ sha512-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAG sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@ sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ +sha256p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@ +sha512p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@ sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@ sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) 
asm/sha512-parisc.pl $(PERLASM_SCHEME) $@ @@ -90,6 +92,9 @@ sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ sha1-armv4-large.o: sha1-armv4-large.S sha256-armv4.o: sha256-armv4.S sha512-armv4.o: sha512-armv4.S +sha1-armv8.o: sha1-armv8.S +sha256-armv8.o: sha256-armv8.S +sha512-armv8.o: sha512-armv8.S files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index fe8207f77f..6c0adb9911 100644 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -52,8 +52,20 @@ # Profiler-assisted and platform-specific optimization resulted in 10% # improvement on Cortex A8 core and 12.2 cycles per byte. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $inp="r1"; @@ -177,6 +189,7 @@ for($i=0;$i<5;$i++) { $code.=<<___; teq $Xi,sp bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 ___ &BODY_00_15(@V); unshift(@V,pop(@V)); &BODY_16_19(@V); unshift(@V,pop(@V)); @@ -186,7 +199,6 @@ ___ $code.=<<___; ldr $K,.LK_20_39 @ [+15+16*4] - sub sp,sp,#25*4 cmn sp,#0 @ [+3], clear carry to denote 20_39 .L_20_39_or_60_79: ___ diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl new file mode 100644 index 0000000000..6be8624342 --- /dev/null +++ b/crypto/sha/asm/sha1-armv8.pl @@ -0,0 +1,343 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the 
OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# hardware-assisted software(*) +# Apple A7 2.31 4.13 (+14%) +# Cortex-A53 2.19 8.73 (+108%) +# Cortex-A57 2.35 7.88 (+74%) +# +# (*) Software results are presented mostly for reference purposes. + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +($ctx,$inp,$num)=("x0","x1","x2"); +@Xw=map("w$_",(3..17,19)); +@Xx=map("x$_",(3..17,19)); +@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); +($t0,$t1,$t2,$K)=map("w$_",(25..28)); + + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i<15 && !($i&1)); + lsr @Xx[$i+1],@Xx[$i],#32 +___ +$code.=<<___ if ($i<14 && !($i&1)); + ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`] +___ +$code.=<<___ if ($i<14 && ($i&1)); +#ifdef __ARMEB__ + ror @Xx[$i+1],@Xx[$i+1],#32 +#else + rev32 @Xx[$i+1],@Xx[$i+1] +#endif +___ +$code.=<<___ if ($i<14); + bic $t0,$d,$b + and $t1,$c,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + orr $t0,$t0,$t1 + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) +___ +$code.=<<___ if ($i==19); + movz $K,#0xeba1 + movk $K,#0x6ed9,lsl#16 +___ +$code.=<<___ if ($i>=14); + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + bic $t0,$d,$b + and $t1,$c,$b + ror $t2,$a,#27 + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + add $d,$d,$K // future e+=K + orr $t0,$t0,$t1 + add $e,$e,$t2 // e+=rot(a,5) + eor 
@Xw[$j],@Xw[$j],@Xw[($j+13)&15] + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==59); + movz $K,#0xc1d6 + movk $K,#0xca62,lsl#16 +___ +$code.=<<___; + orr $t0,$b,$c + and $t1,$b,$c + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + ror $t2,$a,#27 + and $t0,$t0,$d + add $d,$d,$K // future e+=K + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + add $e,$e,$t2 // e+=rot(a,5) + orr $t0,$t0,$t1 + ror $b,$b,#2 + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=($i+2)&15; + +$code.=<<___ if ($i==39); + movz $K,#0xbcdc + movk $K,#0x8f1b,lsl#16 +___ +$code.=<<___ if ($i<78); + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] + eor $t0,$d,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) + ror @Xw[$j],@Xw[$j],#31 +___ +$code.=<<___ if ($i==78); + ldp @Xw[1],@Xw[2],[$ctx] + eor $t0,$d,$b + ror $t2,$a,#27 + add $d,$d,$K // future e+=K + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] + add $e,$e,$t0 // e+=F(b,c,d) +___ +$code.=<<___ if ($i==79); + ldp @Xw[3],@Xw[4],[$ctx,#8] + eor $t0,$d,$b + ror $t2,$a,#27 + eor $t0,$t0,$c + add $e,$e,$t2 // e+=rot(a,5) + ror $b,$b,#2 + ldr @Xw[5],[$ctx,#16] + add $e,$e,$t0 // e+=F(b,c,d) +___ +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.extern OPENSSL_armcap_P +.globl sha1_block_data_order +.type sha1_block_data_order,%function +.align 6 +sha1_block_data_order: + ldr x16,.LOPENSSL_armcap_P + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA1 + b.ne .Lv8_entry 
+ + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldr $E,[$ctx,#16] + +.Loop: + ldr @Xx[0],[$inp],#64 + movz $K,#0x7999 + sub $num,$num,#1 + movk $K,#0x5a82,lsl#16 +#ifdef __ARMEB__ + ror $Xx[0],@Xx[0],#32 +#else + rev32 @Xx[0],@Xx[0] +#endif + add $E,$E,$K // warm it up + add $E,$E,@Xw[0] +___ +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + add $B,$B,@Xw[2] + add $C,$C,@Xw[3] + add $A,$A,@Xw[1] + add $D,$D,@Xw[4] + add $E,$E,@Xw[5] + stp $A,$B,[$ctx] + stp $C,$D,[$ctx,#8] + str $E,[$ctx,#16] + cbnz $num,.Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret +.size sha1_block_data_order,.-sha1_block_data_order +___ +{{{ +my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); +my @MSG=map("v$_.16b",(4..7)); +my @Kxx=map("v$_.4s",(16..19)); +my ($W0,$W1)=("v20.4s","v21.4s"); +my $ABCD_SAVE="v22.16b"; + +$code.=<<___; +.type sha1_block_armv8,%function +.align 6 +sha1_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adr x4,.Lconst + eor $E,$E,$E + ld1.32 {$ABCD},[$ctx],#16 + ld1.32 {$E}[0],[$ctx] + sub $ctx,$ctx,#16 + ld1.32 {@Kxx[0]-@Kxx[3]},[x4] + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + + add.i32 $W0,@Kxx[0],@MSG[0] + rev32 @MSG[2],@MSG[2] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + + add.i32 $W1,@Kxx[0],@MSG[1] + rev32 @MSG[3],@MSG[3] + sha1h $E1,$ABCD + sha1c $ABCD,$E,$W0 // 0 + add.i32 $W0,@Kxx[$j],@MSG[2] + sha1su0 @MSG[0],@MSG[1],@MSG[2] +___ +for ($j=0,$i=1;$i<20-3;$i++) { +my $f=("c","p","m","p")[$i/5]; +$code.=<<___; + sha1h $E0,$ABCD // $i + sha1$f $ABCD,$E1,$W1 + add.i32 $W1,@Kxx[$j],@MSG[3] + sha1su1 @MSG[0],@MSG[3] +___ +$code.=<<___ if ($i<20-4); + sha1su0 @MSG[1],@MSG[2],@MSG[3] +___ + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); +} +$code.=<<___; + sha1h $E0,$ABCD // $i + sha1p $ABCD,$E1,$W1 + add.i32 $W1,@Kxx[$j],@MSG[3] + + sha1h $E1,$ABCD // 18 + sha1p $ABCD,$E0,$W0 + + sha1h $E0,$ABCD // 19 + sha1p $ABCD,$E1,$W1 + + add.i32 $E,$E,$E0 + add.i32 $ABCD,$ABCD,$ABCD_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD},[$ctx],#16 + st1.32 {$E}[0],[$ctx] + + ldr x29,[sp],#16 + ret +.size sha1_block_armv8,.-sha1_block_armv8 +.align 6 +.Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.LOPENSSL_armcap_P: +.quad OPENSSL_armcap_P-. 
+.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by " +.align 2 +.comm OPENSSL_armcap_P,4,4 +___ +}}} + +{ my %opcode = ( + "sha1c" => 0x5e000000, "sha1p" => 0x5e001000, + "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000, + "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 ); + + sub unsha1 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo; + + s/\.\w?32\b//o and s/\.16b/\.4s/go; + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/sha/asm/sha1-c64x-large.pl b/crypto/sha/asm/sha1-c64x-large.pl new file mode 100644 index 0000000000..3916ff3a3f --- /dev/null +++ b/crypto/sha/asm/sha1-c64x-large.pl @@ -0,0 +1,230 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x. +# +# November 2016 +# +# This is fully-unrolled SHA1 implementation. It's 25% faster than +# one with compact loops, doesn't use in-memory ring buffer, as +# everything is accommodated in registers, and has "perfect" interrupt +# agility. Drawback is obviously the code size... 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24)); +@V = ($A,$B,$C,$D,$E); +@X = map("B$_",(16..31)); +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e) = @_; +my $j = ($i+1)&15; + +$code.=<<___ if ($i<14); + ROTL $a,5,$Arot ;; $i +|| AND $c,$b,$F +|| ANDN $d,$b,$F0 +|| ADD $K,$e,$e ; E+=K +|| LDNW *${INP}++,@X[$i+2] + OR $F0,$F,$F ; F_00_19(B,C,D) +|| ROTL $b,30,$b +|| SWAP2 @X[$i+1],@X[$i+1] +|| ADD @X[$i],$e,$e ; E+=X[i] + ADD $Arot,$e,$e ; E+=rot(A,5) +|| SWAP4 @X[$i+1],@X[$i+1] + ADD $F,$e,$e ; E+=F_00_19(B,C,D) +___ +$code.=<<___ if ($i==14); + ROTL $a,5,$Arot ;; $i +|| AND $c,$b,$F +|| ANDN $d,$b,$F0 +|| ADD $K,$e,$e ; E+=K + OR $F0,$F,$F ; F_00_19(B,C,D) +|| ROTL $b,30,$b +|| ADD @X[$i],$e,$e ; E+=X[i] +|| SWAP2 @X[$i+1],@X[$i+1] + ADD $Arot,$e,$e ; E+=rot(A,5) +|| SWAP4 @X[$i+1],@X[$i+1] + ADD $F,$e,$e ; E+=F_00_19(B,C,D) +___ +$code.=<<___ if ($i==15); +|| XOR @X[($j+2)&15],@X[$j],@X[$j] + ROTL $a,5,$Arot ;; $i +|| AND $c,$b,$F +|| ANDN $d,$b,$F0 +|| ADD $K,$e,$e ; E+=K +|| XOR @X[($j+8)&15],@X[$j],@X[$j] + OR $F0,$F,$F ; F_00_19(B,C,D) +|| ROTL $b,30,$b +|| ADD @X[$i],$e,$e ; E+=X[i] +|| XOR @X[($j+13)&15],@X[$j],@X[$j] + ADD $Arot,$e,$e ; E+=rot(A,5) +|| ROTL @X[$j],1,@X[$j] + ADD $F,$e,$e ; E+=F_00_19(B,C,D) +___ +$code.=<<___ if ($i>15); +|| XOR @X[($j+2)&15],@X[$j],@X[$j] + ROTL $a,5,$Arot ;; $i +|| AND $c,$b,$F +|| ANDN $d,$b,$F0 +|| ADD $K,$e,$e ; E+=K +|| XOR @X[($j+8)&15],@X[$j],@X[$j] + OR $F0,$F,$F ; F_00_19(B,C,D) +|| ROTL $b,30,$b +|| ADD @X[$i&15],$e,$e ; E+=X[i] +|| XOR @X[($j+13)&15],@X[$j],@X[$j] + ADD $Arot,$e,$e ; E+=rot(A,5) +|| ROTL @X[$j],1,@X[$j] + ADD $F,$e,$e ; E+=F_00_19(B,C,D) +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e) = @_; +my $j = ($i+1)&15; + +$code.=<<___ if ($i<79); +|| XOR @X[($j+2)&15],@X[$j],@X[$j] 
+ ROTL $a,5,$Arot ;; $i +|| XOR $c,$b,$F +|| ADD $K,$e,$e ; E+=K +|| XOR @X[($j+8)&15],@X[$j],@X[$j] + XOR $d,$F,$F ; F_20_39(B,C,D) +|| ROTL $b,30,$b +|| ADD @X[$i&15],$e,$e ; E+=X[i] +|| XOR @X[($j+13)&15],@X[$j],@X[$j] + ADD $Arot,$e,$e ; E+=rot(A,5) +|| ROTL @X[$j],1,@X[$j] + ADD $F,$e,$e ; E+=F_20_39(B,C,D) +___ +$code.=<<___ if ($i==79); +|| [A0] B loop? +|| [A0] LDNW *${INP}++,@X[0] ; pre-fetch input + ROTL $a,5,$Arot ;; $i +|| XOR $c,$b,$F +|| ADD $K,$e,$e ; E+=K +|| [A0] LDNW *${INP}++,@X[1] + XOR $d,$F,$F ; F_20_39(B,C,D) +|| ROTL $b,30,$b +|| ADD @X[$i&15],$e,$e ; E+=X[i] + ADD $Arot,$e,$e ; E+=rot(A,5) + ADD $F,$e,$e ; E+=F_20_39(B,C,D) +|| ADD $Bctx,$a,$a ; accumulate context +|| ADD $Cctx,$b,$b + ADD $Dctx,$c,$c +|| ADD $Ectx,$d,$d +|| ADD $Actx,$e,$e +;;===== branch to loop? is taken here +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e) = @_; +my $j = ($i+1)&15; + +$code.=<<___; +|| XOR @X[($j+2)&15],@X[$j],@X[$j] + ROTL $a,5,$Arot ;; $i +|| AND $c,$b,$F +|| AND $d,$b,$F0 +|| ADD $K,$e,$e ; E+=K +|| XOR @X[($j+8)&15],@X[$j],@X[$j] + XOR $F0,$F,$F +|| AND $c,$d,$F0 +|| ROTL $b,30,$b +|| XOR @X[($j+13)&15],@X[$j],@X[$j] +|| ADD @X[$i&15],$e,$e ; E+=X[i] + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| ADD $Arot,$e,$e ; E+=rot(A,5) +|| ROTL @X[$j],1,@X[$j] + ADD $F,$e,$e ; E+=F_20_39(B,C,D) +___ +} + +$code=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg sha1_block_data_order,_sha1_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc + MV $NUM,A0 ; reassign $NUM + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] LDW *${CTX}[0],$A ; load A-E... 
+ [A0] LDW *${CTX}[1],$B + [A0] LDW *${CTX}[2],$C + [A0] LDW *${CTX}[3],$D + [A0] LDW *${CTX}[4],$E + [A0] LDNW *${INP}++,@X[0] ; pre-fetch input + [A0] LDNW *${INP}++,@X[1] + NOP 3 + +loop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MVD $B,$Bctx +|| SWAP2 @X[0],@X[0] +|| MVKL 0x5a827999,$K + MVKH 0x5a827999,$K ; K_00_19 +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MVD $E,$Ectx +|| SWAP4 @X[0],@X[0] +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +|| MVKL 0x6ed9eba1,$K + MVKH 0x6ed9eba1,$K ; K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +|| MVKL 0x8f1bbcdc,$K + MVKH 0x8f1bbcdc,$K ; K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +|| MVKL 0xca62c1d6,$K + MVKH 0xca62c1d6,$K ; K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + BNOP RA ; return + STW $A,*${CTX}[0] ; emit A-E... + STW $B,*${CTX}[1] + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-c64x.pl b/crypto/sha/asm/sha1-c64x.pl new file mode 100644 index 0000000000..d7a9dd1d05 --- /dev/null +++ b/crypto/sha/asm/sha1-c64x.pl @@ -0,0 +1,330 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x. +# +# November 2016 +# +# If compared to compiler-generated code with similar characteristics, +# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs, +# this implementation is 25% smaller and >2x faster. 
In absolute terms +# performance is (quite impressive) ~6.5 cycles per processed byte. +# Unlike its predecessor, sha1-c64xplus module, this module has worse +# interrupt agility. While original added up to 5 cycles delay to +# response to interrupt, this module adds up to 100. Fully unrolled +# implementation doesn't add any delay and even 25% faster, but is +# almost 5x larger... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25)); +($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); +($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); +($XPA,$XPB) = ("A5","B5"); # X circular buffer +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +$code=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg sha1_block_data_order,_sha1_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] LDW *${CTX}[0],$A ; load A-E... 
+|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] LDW *${CTX}[1],$B +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + [A0] LDW *${CTX}[2],$C +|| [A0] MVK 0x00404,B0 + [A0] LDW *${CTX}[3],$D +|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB] + [A0] LDW *${CTX}[4],$E +|| [A0] MVC B0,AMR ; setup circular addressing + LDNW *${INP}++,$TX1 ; pre-fetch input + NOP 1 + +loop?: + MVKL 0x5a827999,$K +|| ADDAW SP,2,$XPB +|| SUB A0,1,A0 + MVKH 0x5a827999,$K ; K_00_19 +|| MV $A,$Actx +|| MV $B,$Bctx +;;================================================== + B body_00_13? ; BODY_00_13 +|| MVK 11,B0 +|| MV $XPB,$XPA +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MVD $E,$Ectx + +body_00_13?: + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX3 ; byte swap + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A + + ADD $TX3,$T,$A ; A=T+Xi +|| STW $TX3,*${XPB}++ +|| BDEC body_00_13?,B0 +;;================================================== + ROTL $A,5,$Arot ; BODY_14 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are +|| LDW *${XPB}[4],$X2 ; 2 iterations ahead + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +;;================================================== + ROTL $A,5,$Arot ; BODY_15 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 
+ + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +;;================================================== +|| B body_16_19? ; BODY_16_19 +|| MVK 1,B0 + +body_16_19?: + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| BDEC body_16_19?,B0 + + MVKL 0x6ed9eba1,$K +|| MVK 17,B0 + MVKH 0x6ed9eba1,$K ; K_20_39 +___ +sub BODY_20_39 { +my $label = shift; +$code.=<<___; +;;================================================== +|| B $label ; BODY_20_39 + +$label: + ROTL $A,5,$Arot +|| XOR $B,$C,$F +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $D,$F,$F ; F_20_39(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ ; last one is redundant +|| XOR $TX0,$TX1,$TX1 +|| BDEC $label,B0 +___ +} &BODY_20_39("body_20_39?"); +$code.=<<___; 
+;;================================================== + MVKL 0x8f1bbcdc,$K +|| MVK 17,B0 + MVKH 0x8f1bbcdc,$K ; K_40_59 +|| B body_40_59? ; BODY_40_59 +|| AND $B,$C,$F +|| AND $B,$D,$F0 + +body_40_59?: + ROTL $A,5,$Arot +|| XOR $F0,$F,$F +|| AND $C,$D,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_40_59(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| AND $B,$C,$F +|| AND $B,$D,$F0 +|| BDEC body_40_59?,B0 + + MVKL 0xca62c1d6,$K +|| MVK 16,B0 + MVKH 0xca62c1d6,$K ; K_60_79 +___ + &BODY_20_39("body_60_78?"); # BODY_60_78 +$code.=<<___; +;;================================================== + [A0] B loop? +|| ROTL $A,5,$Arot ; BODY_79 +|| XOR $B,$C,$F +|| ROTL $TX1,1,$TX2 ; Xupdate output + + [A0] LDNW *${INP}++,$TX1 ; pre-fetch input +|| ADD $K,$E,$T ; T=E+K +|| XOR $D,$F,$F ; F_20_39(B,C,D) + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ADD $Ectx,$D,$E ; E=D,E+=Ectx +|| ADD $Dctx,$C,$D ; D=C,D+=Dctx +|| ROTL $B,30,$C ; C=ROL(B,30) + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| ADD $Bctx,$A,$B ; B=A,B+=Bctx + + ADD $TX2,$T,$A ; A=T+Xi + + ADD $Actx,$A,$A ; A+=Actx +|| ADD $Cctx,$C,$C ; C+=Cctx +;; end of loop? + + BNOP RA ; return +|| MV FP,SP ; restore stack pointer +|| LDW *FP[0],FP ; restore frame pointer + STW $A,*${CTX}[0] ; emit A-E... 
+|| MVK 0,B0 + STW $B,*${CTX}[1] +|| MVC B0,AMR ; clear AMR + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl new file mode 100644 index 0000000000..87000d1e8f --- /dev/null +++ b/crypto/sha/asm/sha1-c64xplus.pl @@ -0,0 +1,323 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x+. +# +# November 2011 +# +# If compared to compiler-generated code with similar characteristics, +# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs, +# this implementation is 25% smaller and >2x faster. In absolute terms +# performance is (quite impressive) ~6.5 cycles per processed byte. +# Fully unrolled assembler would be ~5x larger and is likely to be +# ~15% faster. It would be free from references to intermediate ring +# buffer, but put more pressure on L1P [both because the code would be +# larger and won't be using SPLOOP buffer]. There are no plans to +# realize fully unrolled variant though... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25)); +($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); +($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); +($XPA,$XPB) = ("A5","B5"); # X circular buffer +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +$code=<<___; + .text + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] LDW *${CTX}[0],$A ; load A-E... +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] LDW *${CTX}[1],$B +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + [A0] LDW *${CTX}[2],$C +|| [A0] MVK 0x00404,B0 + [A0] LDW *${CTX}[3],$D +|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB] + [A0] LDW *${CTX}[4],$E +|| [A0] MVC B0,AMR ; setup circular addressing + LDNW *${INP}++,$TX1 ; pre-fetch input + NOP 1 + +loop?: + MVK 0x00007999,$K +|| ADDAW SP,2,$XPA +|| SUB A0,1,A0 +|| MVK 13,B0 + MVKH 0x5a820000,$K ; K_00_19 +|| ADDAW SP,2,$XPB +|| MV $A,$Actx +|| MV $B,$Bctx +;;================================================== + SPLOOPD 5 ; BODY_00_13 +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MV $E,$Ectx +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX3 ; byte swap + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A + + ADD $TX3,$T,$A ; A=T+Xi +|| STW $TX3,*${XPB}++ + SPKERNEL +;;================================================== + ROTL 
$A,5,$Arot ; BODY_14 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are +|| LDW *${XPB}[4],$X2 ; 2 iterations ahead + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +;;================================================== + ROTL $A,5,$Arot ; BODY_15 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| MVK 3,B0 +;;================================================== + SPLOOPD 5 ; BODY_16_19 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 + SPKERNEL + + MVK 0xffffeba1,$K +|| MVK 19,B0 + MVKH 0x6ed90000,$K ; 
K_20_39 +___ +sub BODY_20_39 { +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_20_39 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| XOR $B,$C,$F +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $D,$F,$F ; F_20_39(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ ; last one is redundant +|| XOR $TX0,$TX1,$TX1 + SPKERNEL +___ +$code.=<<___ if (!shift); + MVK 0xffffbcdc,$K + MVKH 0x8f1b0000,$K ; K_40_59 +___ +} &BODY_20_39(); +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_40_59 +|| MVC B0,ILC +|| AND $B,$C,$F +|| AND $B,$D,$F0 + + ROTL $A,5,$Arot +|| XOR $F0,$F,$F +|| AND $C,$D,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_40_59(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| AND $B,$C,$F +|| AND $B,$D,$F0 + SPKERNEL + + MVK 0xffffc1d6,$K +|| MVK 18,B0 + MVKH 0xca620000,$K ; K_60_79 +___ + &BODY_20_39(-1); # BODY_60_78 +$code.=<<___; +;;================================================== + [A0] B loop? 
+|| ROTL $A,5,$Arot ; BODY_79 +|| XOR $B,$C,$F +|| ROTL $TX1,1,$TX2 ; Xupdate output + + [A0] LDNW *${INP}++,$TX1 ; pre-fetch input +|| ADD $K,$E,$T ; T=E+K +|| XOR $D,$F,$F ; F_20_39(B,C,D) + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ADD $Ectx,$D,$E ; E=D,E+=Ectx +|| ADD $Dctx,$C,$D ; D=C,D+=Dctx +|| ROTL $B,30,$C ; C=ROL(B,30) + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| ADD $Bctx,$A,$B ; B=A,B+=Bctx + + ADD $TX2,$T,$A ; A=T+Xi + + ADD $Actx,$A,$A ; A+=Actx +|| ADD $Cctx,$C,$C ; C+=Cctx +;; end of loop? + + BNOP RA ; return +|| MV FP,SP ; restore stack pointer +|| LDW *FP[0],FP ; restore frame pointer + STW $A,*${CTX}[0] ; emit A-E... +|| MVK 0,B0 + STW $B,*${CTX}[1] +|| MVC B0,AMR ; clear AMR + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl index f1a702f38f..ca50e1b1ee 100644 --- a/crypto/sha/asm/sha1-mips.pl +++ b/crypto/sha/asm/sha1-mips.pl @@ -42,7 +42,7 @@ # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 @@ -64,7 +64,7 @@ if ($flavour =~ /64|n32/i) { # ###################################################################### -$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; +$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0; for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl index 2140dd2f8d..7b86e2bc11 100755 --- a/crypto/sha/asm/sha1-ppc.pl +++ b/crypto/sha/asm/sha1-ppc.pl @@ -210,7 +210,7 @@ Lunaligned: srwi. 
$t1,$t1,6 ; t1/=64 beq Lcross_page $UCMP $num,$t1 - ble- Laligned ; didn't cross the page boundary + ble Laligned ; didn't cross the page boundary mtctr $t1 subfc $num,$t1,$num bl Lsha1_block_private @@ -238,7 +238,7 @@ Lmemcpy: bl Lsha1_block_private $POP $inp,`$FRAME-$SIZE_T*18`($sp) addic. $num,$num,-1 - bne- Lunaligned + bne Lunaligned Ldone: $POP r0,`$FRAME+$LRSAVE`($sp) @@ -312,7 +312,7 @@ $code.=<<___; stw r20,16($ctx) mr $E,r20 addi $inp,$inp,`16*4` - bdnz- Lsha1_block_private + bdnz Lsha1_block_private blr .long 0 .byte 0,12,0x14,0,0,0,0,0 diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index 9c84e8d93c..252a583d06 100644 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -23,8 +23,20 @@ # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~17 cycles per processed byte. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $t0="r0"; $inp="r1"; $t3="r1"; diff --git a/crypto/sha/asm/sha256-c64x.pl b/crypto/sha/asm/sha256-c64x.pl new file mode 100644 index 0000000000..fbe99c0b7f --- /dev/null +++ b/crypto/sha/asm/sha256-c64x.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. 
For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 for C64x. +# +# November 2016 +# +# Performance is just below 10 cycles per processed byte, which is +# almost 40% faster than compiler-generated code. Unroll is unlikely +# to give more than ~8% improvement... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and, for their own +# well-being, zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K256="A3"; + +($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) + =map("A$_",(16..31)); +($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) + =map("B$_",(16..31)); + +($Xia,$Xib)=("A5","B5"); # circular/ring buffer + $CTXB=$t2e; + +($Xn,$X0,$K)=("B7","B8","B9"); +($Maj,$Ch)=($T2,"B6"); + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha256_block_data_order,_sha256_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN ; input needs no byte swap, so SWAPn becomes a plain move + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha256_block_data_order +_sha256_block_data_order: +__sha256_block: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] ADDKPC _sha256_block_data_order,B2 +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + .if __TI_EABI__ + [A0] MVK 0x00404,B1 +|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256 + .else + [A0] MVK 0x00404,B1 +|| [A0] MVKL (K256-__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH (K256-__sha256_block),$K256 + .endif + [A0] MVC B1,AMR ; setup
circular addressing +|| [A0] MV SP,$Xia + [A0] MV SP,$Xib +|| [A0] ADD B2,$K256,$K256 +|| [A0] MV $CTXA,$CTXB +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + LDW *${CTXA}[0],$A ; load ctx +|| LDW *${CTXB}[4],$E + LDW *${CTXA}[1],$B +|| LDW *${CTXB}[5],$F + LDW *${CTXA}[2],$C +|| LDW *${CTXB}[6],$G + LDW *${CTXA}[3],$D +|| LDW *${CTXB}[7],$H + + LDNW *$INP++,$Xn ; pre-fetch input + LDW *$K256++,$K ; pre-fetch K256[0] + NOP + ADDAW $Xia,9,$Xia +outerloop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MV $E,$Ectx +|| MVD $B,$Bctx +|| MVD $F,$Fctx + MV $C,$Cctx +|| MV $G,$Gctx +|| MVD $D,$Dctx +|| MVD $H,$Hctx +|| SWAP4 $Xn,$X0 + + MVK 14,B0 ; loop counter +|| SWAP2 $X0,$X0 + +loop_00_14?: ; BODY_00_14 + LDNW *$INP++,$Xn +|| ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] +|| [B0] BDEC loop_00_14?,B0 + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X14 +|| SWAP4 $Xn,$X0 + SWAP2 $X0,$X0 +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c + MV $B,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +;;===== branch to loop00_14? 
is taken here + + ROTL $A,30,$S0 ; BODY_15 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| LDW *${Xib}[1],$Xn ; modulo-scheduled + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X15 + MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +|| ROTL $B,0,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + + MVK 47,B1 ; loop counter +|| ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled + +loop_16_63?: ; BODY_16_63 + XOR $t0e,$s0,$s0 +|| XOR $t0a,$s1,$s1 +|| MV $X15,$X14 +|| MV $X1,$Xn + XOR $t1e,$s0,$s0 ; sigma0(X[i+1]) +|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14]) +|| LDW *${Xib}[2],$X1 ; module-scheduled + ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| ADD $X9,$X0,$X0 ; X[i] += X[i+9] + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1]) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $H,$K,$T1 ; T1 = h + K256[i] +|| ADD $s1,$X0,$X0 ; 
X[i] += sigma1(X[i+14]) +|| [B1] BDEC loop_16_63?,B1 + XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 +|| ADD $X0,$T1,$T1 ; T1 += X[i] +|| STW $X0,*$Xib++ + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) +|| MV $X0,$X15 +|| ROTL $G,0,$H ; h = g +|| LDW *$K256++,$K ; pre-fetch K256[i+1] + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| MV $F,$G ; g = f +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *++$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; module-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $B,$C ; c = b + MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +|| SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +;;===== branch to loop16_63? is taken here + + [A0] B outerloop? +|| [A0] LDNW *$INP++,$Xn ; pre-fetch input +|| [A0] ADDK -260,$K256 ; rewind K256 +|| ADD $Actx,$A,$A ; accumulate ctx +|| ADD $Ectx,$E,$E +|| ADD $Bctx,$B,$B + ADD $Fctx,$F,$F +|| ADD $Cctx,$C,$C +|| ADD $Gctx,$G,$G +|| ADD $Dctx,$D,$D +|| ADD $Hctx,$H,$H +|| [A0] LDW *$K256++,$K ; pre-fetch K256[0] + + [!A0] BNOP RA +||[!A0] MV $CTXA,$CTXB + [!A0] MV FP,SP ; restore stack pointer +||[!A0] LDW *FP[0],FP ; restore frame pointer + [!A0] STW $A,*${CTXA}[0] ; save ctx +||[!A0] STW $E,*${CTXB}[4] +||[!A0] MVK 0,B0 + [!A0] STW $B,*${CTXA}[1] +||[!A0] STW $F,*${CTXB}[5] +||[!A0] MVC B0,AMR ; clear AMR + STW $C,*${CTXA}[2] +|| STW $G,*${CTXB}[6] + STW $D,*${CTXA}[3] +|| STW $H,*${CTXB}[7] + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K256: + .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 
0xc19bf174 + .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .cstring "SHA256 block transform for C64x, CRYPTOGAMS by " + .align 4 + +___ + +print $code; diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl new file mode 100644 index 0000000000..8b92c84555 --- /dev/null +++ b/crypto/sha/asm/sha256-c64xplus.pl @@ -0,0 +1,292 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 for C64x+. +# +# January 2012 +# +# Performance is just below 10 cycles per processed byte, which is +# almost 40% faster than compiler-generated code. Unroll is unlikely +# to give more than ~8% improvement... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K256="A3"; + +($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) + =map("A$_",(16..31)); +($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) + =map("B$_",(16..31)); + +($Xia,$Xib)=("A5","B5"); # circular/ring buffer + $CTXB=$t2e; + +($Xn,$X0,$K)=("B7","B8","B9"); +($Maj,$Ch)=($T2,"B6"); + +$code.=<<___; + .text + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN ; input needs no byte swap, so SWAPn becomes a plain move + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha256_block_data_order +_sha256_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] ADDKPC _sha256_block_data_order,B2 +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] MVK 0x00404,B1 +|| [A0] MVKL (K256-_sha256_block_data_order),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH (K256-_sha256_block_data_order),$K256 + [A0] MVC B1,AMR ; setup circular addressing +|| [A0] MV SP,$Xia + [A0] MV SP,$Xib +|| [A0] ADD B2,$K256,$K256 +|| [A0] MV $CTXA,$CTXB +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + LDW *${CTXA}[0],$A ; load ctx +|| LDW *${CTXB}[4],$E + LDW *${CTXA}[1],$B +|| LDW *${CTXB}[5],$F + LDW *${CTXA}[2],$C +|| LDW *${CTXB}[6],$G + LDW *${CTXA}[3],$D +|| LDW *${CTXB}[7],$H + + LDNW *$INP++,$Xn ; pre-fetch input + LDW *$K256++,$K ; pre-fetch K256[0] + MVK 14,B0 ; loop counters + MVK 47,B1 +|| ADDAW $Xia,9,$Xia +outerloop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MV $E,$Ectx +|| MVD $B,$Bctx +|| MVD $F,$Fctx + MV $C,$Cctx +|| MV $G,$Gctx +|| MVD $D,$Dctx +|| MVD $H,$Hctx +|| SWAP4 $Xn,$X0 + + SPLOOPD 8 ; BODY_00_14 +|| MVC B0,ILC +|| SWAP2 $X0,$X0 + + LDNW *$INP++,$Xn +|| ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e + ROTL
$A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X14 +|| SWAP4 $Xn,$X0 + SWAP2 $X0,$X0 +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c + MV $B,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + SPKERNEL + + ROTL $A,30,$S0 ; BODY_15 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| LDW *${Xib}[1],$Xn ; modulo-scheduled + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X15 + MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +|| ROTL $B,0,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; 
a = T1 + T2 + + SPLOOPD 10 ; BODY_16_63 +|| MVC B1,ILC +|| ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled + + XOR $t0e,$s0,$s0 +|| XOR $t0a,$s1,$s1 +|| MV $X15,$X14 +|| MV $X1,$Xn + XOR $t1e,$s0,$s0 ; sigma0(X[i+1]) +|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14]) +|| LDW *${Xib}[2],$X1 ; module-scheduled + ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| ADD $X9,$X0,$X0 ; X[i] += X[i+9] + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1]) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $H,$K,$T1 ; T1 = h + K256[i] +|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14]) + XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 +|| ADD $X0,$T1,$T1 ; T1 += X[i] +|| STW $X0,*$Xib++ + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) +|| MV $X0,$X15 +|| ROTL $G,0,$H ; h = g +|| LDW *$K256++,$K ; pre-fetch K256[i+1] + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| MV $F,$G ; g = f +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *++$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; module-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $B,$C ; c = b + MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +|| SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled + SPKERNEL + + [A0] B outerloop? 
+|| [A0] LDNW *$INP++,$Xn ; pre-fetch input +|| [A0] ADDK -260,$K256 ; rewind K256 +|| ADD $Actx,$A,$A ; accumulate ctx +|| ADD $Ectx,$E,$E +|| ADD $Bctx,$B,$B + ADD $Fctx,$F,$F +|| ADD $Cctx,$C,$C +|| ADD $Gctx,$G,$G +|| ADD $Dctx,$D,$D +|| ADD $Hctx,$H,$H +|| [A0] LDW *$K256++,$K ; pre-fetch K256[0] + + [!A0] BNOP RA +||[!A0] MV $CTXA,$CTXB + [!A0] MV FP,SP ; restore stack pointer +||[!A0] LDW *FP[0],FP ; restore frame pointer + [!A0] STW $A,*${CTXA}[0] ; save ctx +||[!A0] STW $E,*${CTXB}[4] +||[!A0] MVK 0,B0 + [!A0] STW $B,*${CTXA}[1] +||[!A0] STW $F,*${CTXB}[5] +||[!A0] MVC B0,AMR ; clear AMR + STW $C,*${CTXA}[2] +|| STW $G,*${CTXB}[6] + STW $D,*${CTXA}[3] +|| STW $H,*${CTXB}[7] + .endasmfunc + + .sect ".const:sha_asm" + .align 128 +K256: + .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by " + .align 4 + +___ + +print $code; diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index 7faf37b147..c032afdbca 100644 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -38,8 +38,20 @@ $hi="HI"; $lo="LO"; # 
==================================================================== -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; # parameter block $inp="r1"; @@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order +.word OPENSSL_armcap_P-.Lsha512_block_data_order .skip 32-4 .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: sub r3,pc,#8 @ sha512_block_data_order add $len,$inp,$len,lsl#7 @ len to point at the end of inp #if __ARM_ARCH__>=7 ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#1 bne .LNEON #endif diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl new file mode 100644 index 0000000000..45eb719fe5 --- /dev/null +++ b/crypto/sha/asm/sha512-armv8.pl @@ -0,0 +1,428 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. 
+# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and lags behind assembly only by 50-90%. + +$flavour=shift; +$output=shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __ARMEB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if 
($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. +$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + 
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#include "arm_arch.h" + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); + ldr x16,.LOPENSSL_armcap_P + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp 
$inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 
0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +.align 3 +.LOPENSSL_armcap_P: + .quad OPENSSL_armcap_P-. +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +___ +} + +$code.=<<___; +.comm OPENSSL_armcap_P,4,4 +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/geo; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; + + s/\.\w?32\b//o and s/\.16b/\.4s/go; + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + + print 
$_,"\n"; +} + +close STDOUT; diff --git a/crypto/sha/asm/sha512-c64x.pl b/crypto/sha/asm/sha512-c64x.pl new file mode 100644 index 0000000000..e35a72ade5 --- /dev/null +++ b/crypto/sha/asm/sha512-c64x.pl @@ -0,0 +1,437 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 for C64x. +# +# November 2016 +# +# Performance is ~19 cycles per processed byte. Compared to block +# transform function from sha512.c compiled with cl6x with -mv6400+ +# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller. +# Loop unroll won't make it, this implementation, any faster, because +# it's effectively dominated by SHRU||SHL pairs and you can't schedule +# more of them. +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K512="A3"; + +($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi, + $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31)); +($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo, + $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31)); + +($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13)); +($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13)); +($T1hi, $T2hi)= ("A6","A7"); +($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); +($Khi,$Klo)=("A9","A8"); +($MAJhi,$MAJlo)=($T2hi,$T2lo); +($t1hi,$t1lo)=($Khi,"B2"); + $CTXB=$t1lo; + +($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha512_block_data_order,_sha512_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg $Khi,KHI + .asg $Klo,KLO + .else + .asg $Khi,KLO + .asg $Klo,KHI + .endif + + .global _sha512_block_data_order +_sha512_block_data_order: +__sha512_block: + .asmfunc stack_usage(40+128) + MV $NUM,A0 ; reassign $NUM +|| MVK -128,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--(40) ; save frame pointer +|| [A0] MV SP,FP + [A0] STDW B13:B12,*SP[4] +|| [A0] MVK 0x00404,B1 + [A0] STDW B11:B10,*SP[3] +|| [A0] STDW A13:A12,*FP[-3] +|| [A0] MVKH 0x60000,B1 + [A0] STDW A11:A10,*SP[1] +|| [A0] MVC B1,AMR ; setup circular addressing +|| [A0] ADD B0,SP,SP ; alloca(128) + .if __TI_EABI__ + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512 + [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .else + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL (K512-__sha512_block),$K512 + [A0] MVKH 
(K512-__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .endif + ADDAW SP,3,$Xilo + ADD SP,4*2,$Xihi ; ADDAW SP,2,$Xihi + +|| MV $CTXA,$CTXB + LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx +|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo +|| ADD B1,$K512,$K512 + LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi +|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo + LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi +|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo + LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi +|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo + LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi +|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo + LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi +|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo + LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi +|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo + LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi +|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo + + LDNDW *$INP++,B11:B10 ; pre-fetch input + LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] +outerloop?: + MVK 15,B0 ; loop counters +|| MVK 64,B1 +|| SUB A0,1,A0 + MV $Ahi,$Actxhi +|| MV $Alo,$Actxlo +|| MV $Bhi,$Bctxhi +|| MV $Blo,$Bctxlo +|| MV $Chi,$Cctxhi +|| MV $Clo,$Cctxlo +|| MVD $Dhi,$Dctxhi +|| MVD $Dlo,$Dctxlo + MV $Ehi,$Ectxhi +|| MV $Elo,$Ectxlo +|| MV $Fhi,$Fctxhi +|| MV $Flo,$Fctxlo +|| MV $Ghi,$Gctxhi +|| MV $Glo,$Gctxlo +|| MVD $Hhi,$Hctxhi +|| MVD $Hlo,$Hctxlo +loop0_15?: + .if .BIG_ENDIAN + MV B11,$T1hi +|| MV B10,$T1lo + .else + SWAP4 B10,$T1hi +|| SWAP4 B11,$T1lo + SWAP2 $T1hi,$T1hi +|| SWAP2 $T1lo,$T1lo + .endif + STW $T1hi,*$Xihi++[2] ; original loop16_79? 
+|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo +loop16_79?: + XOR $Fhi,$Ghi,$CHhi +|| XOR $Flo,$Glo,$CHlo +|| ADD KHI,$T1hi,$T1hi +|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i] +|| SHRU $Elo,14,$t0lo +|| SHL $Elo,32-14,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Ehi,$CHhi,$CHhi +|| AND $Elo,$CHlo,$CHlo +|| ROTL $Ghi,0,$Hhi +|| ROTL $Glo,0,$Hlo ; h = g +|| SHRU $Ehi,18,$t0hi +|| SHL $Ehi,32-18,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| XOR $Ghi,$CHhi,$CHhi +|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g +|| ROTL $Fhi,0,$Ghi +|| ROTL $Flo,0,$Glo ; g = f +|| SHRU $Elo,18,$t0lo +|| SHL $Elo,32-18,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| OR $Ahi,$Bhi,$MAJhi +|| OR $Alo,$Blo,$MAJlo +|| ROTL $Ehi,0,$Fhi +|| ROTL $Elo,0,$Flo ; f = e +|| SHRU $Ehi,41-32,$t0lo +|| SHL $Ehi,64-41,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Chi,$MAJhi,$MAJhi +|| AND $Clo,$MAJlo,$MAJlo +|| ROTL $Dhi,0,$Ehi +|| ROTL $Dlo,0,$Elo ; e = d +|| SHRU $Elo,41-32,$t0hi +|| SHL $Elo,64-41,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e) +|| AND $Ahi,$Bhi,$t1hi +|| AND $Alo,$Blo,$t1lo +|| ROTL $Chi,0,$Dhi +|| ROTL $Clo,0,$Dlo ; d = c +|| SHRU $Ahi,28,$S0hi +|| SHL $Ahi,32-28,$S0lo + OR $t1hi,$MAJhi,$MAJhi +|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g) +|| ROTL $Bhi,0,$Chi +|| ROTL $Blo,0,$Clo ; c = b +|| SHRU $Alo,28,$t0lo +|| SHL $Alo,32-28,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e) +|| ROTL $Ahi,0,$Bhi +|| ROTL $Alo,0,$Blo ; b = a +|| SHRU $Ahi,34-32,$t0lo +|| SHL $Ahi,64-34,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $MAJhi,$T1hi,$T2hi +|| ADDU 
$MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c) +|| SHRU $Alo,34-32,$t0hi +|| SHL $Alo,64-34,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $Ehi,$T1hi,$T1hi +|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e +|| SHRU $Ahi,39-32,$t0lo +|| SHL $Ahi,64-39,$t0hi + [B0] BNOP loop0_15? +|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $Alo,39-32,$t0hi +|| SHL $Alo,64-39,$t0lo +||[!B0] LDW *${Xihi}[28],$T1hi +||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14] + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a) +|| ADD $T1carry,$T1hi,$Ehi +|| ROTL $T1lo,0,$Elo ; e = T1, "ghost" value +||[!B1] BNOP break? + ADD $S0hi,$T2hi,$T2hi +|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a) +|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i] + NOP ; avoid cross-path stall + ADD $T2carry,$T2hi,$Ahi +|| MV $T2lo,$Alo ; a = T2 +|| [B0] SUB B0,1,B0 +;;===== branch to loop0_15? is taken here + [B1] LDW *${Xihi}[2],$T2hi +|| [B1] LDW *${Xilo}[2],$T2lo ; X[i+1] +|| [B1] SHRU $T1hi,19,$S1hi +|| [B1] SHL $T1hi,32-19,$S1lo + [B1] SHRU $T1lo,19,$t0lo +|| [B1] SHL $T1lo,32-19,$t0hi +;;===== branch to break? 
is taken here + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,61-32,$t0lo +|| SHL $T1hi,64-61,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,61-32,$t0hi +|| SHL $T1lo,64-61,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,6,$t0hi +|| SHL $T1hi,32-6,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,6,$t0lo +|| LDW *${Xihi}[18],$T1hi +|| LDW *${Xilo}[18],$T1lo ; X[i+9] + XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14]) + +|| LDW *${Xihi}[0],$CHhi +|| LDW *${Xilo}[0],$CHlo ; X[i] +|| SHRU $T2hi,1,$S0hi +|| SHL $T2hi,32-1,$S0lo + SHRU $T2lo,1,$t0lo +|| SHL $T2lo,32-1,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2hi,8,$t0hi +|| SHL $T2hi,32-8,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2lo,8,$t0lo +|| SHL $T2lo,32-8,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1() +|| SHRU $T2hi,7,$t0hi +|| SHL $T2hi,32-7,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i] +|| SHRU $T2lo,7,$t0lo +|| [B1] BNOP loop16_79? + XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1]) + + ADD $S0hi,$T1hi,$T1hi +|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0() +|| [B1] SUB B1,1,B1 + NOP ; avoid cross-path stall + ADD $T1carry,$T1hi,$T1hi + + STW $T1hi,*$Xihi++[2] ; copied "top" bundle +|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo +;;===== branch to loop16_79? 
is taken here + +break?: + ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx +|| ADDU $Alo,$Actxlo,$Actxlo:$Alo +|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input +|| [A0] ADDK -640,$K512 ; rewind pointer to K512 + ADD $Bhi,$Bctxhi,$Bhi +|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo +|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] + ADD $Chi,$Cctxhi,$Chi +|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo +|| ADD $Actxlo,$Ahi,$Ahi +||[!A0] MV $CTXA,$CTXB + ADD $Dhi,$Dctxhi,$Dhi +|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo +|| ADD $Bctxlo,$Bhi,$Bhi +||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx +||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN] + ADD $Ehi,$Ectxhi,$Ehi +|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo +|| ADD $Cctxlo,$Chi,$Chi +|| [A0] BNOP outerloop? +||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN] +||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN] + ADD $Fhi,$Fctxhi,$Fhi +|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo +|| ADD $Dctxlo,$Dhi,$Dhi +||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN] +||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN] + ADD $Ghi,$Gctxhi,$Ghi +|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo +|| ADD $Ectxlo,$Ehi,$Ehi +||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN] +||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN] + ADD $Hhi,$Hctxhi,$Hhi +|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo +|| ADD $Fctxlo,$Fhi,$Fhi +||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN] +||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN] + ADD $Gctxlo,$Ghi,$Ghi +||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN] +||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN] + ADD $Hctxlo,$Hhi,$Hhi +||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN] +||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN] +;;===== branch to outerloop? 
is taken here + + STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] +|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN] +|| MVK -40,B0 + ADD FP,B0,SP ; destroy circular buffer +|| LDDW *FP[-4],A11:A10 + LDDW *SP[2],A13:A12 +|| LDDW *FP[-2],B11:B10 + LDDW *SP[4],B13:B12 +|| BNOP RA + LDW *++SP(40),FP ; restore frame pointer + MVK 0,B0 + MVC B0,AMR ; clear AMR + NOP 2 ; wait till FP is committed + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K512: + .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd + .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc + .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 + .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 + .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe + .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 + .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 + .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 + .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 + .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 + .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 + .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 + .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 + .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 + .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 + .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 + .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 + .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df + .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 + .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b + .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 + .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 + .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 + .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 + .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 + .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 + .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb + .uword 0x5b9cca4f,0x7763e373, 
0x682e6ff3,0xd6b2b8a3 + .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 + .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec + .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 + .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b + .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 + .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 + .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 + .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b + .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 + .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c + .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a + .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 + .cstring "SHA512 block transform for C64x, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl new file mode 100644 index 0000000000..56c8583bf3 --- /dev/null +++ b/crypto/sha/asm/sha512-c64xplus.pl @@ -0,0 +1,410 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 for C64x+. +# +# January 2012 +# +# Performance is 19 cycles per processed byte. Compared to block +# transform function from sha512.c compiled with cl6x with -mv6400+ +# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller. +# Loop unroll won't make it, this implementation, any faster, because +# it's effectively dominated by SHRU||SHL pairs and you can't schedule +# more of them. +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K512="A3"; + +($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi, + $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31)); +($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo, + $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31)); + +($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13)); +($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13)); +($T1hi, $T2hi)= ("A6","A7"); +($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); +($Khi,$Klo)=("A9","A8"); +($MAJhi,$MAJlo)=($T2hi,$T2lo); +($t1hi,$t1lo)=($Khi,"B2"); + $CTXB=$t1lo; + +($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer + +$code.=<<___; + .text + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg $Khi,KHI + .asg $Klo,KLO + .else + .asg $Khi,KLO + .asg $Klo,KHI + .endif + + .global _sha512_block_data_order +_sha512_block_data_order: + .asmfunc stack_usage(40+128) + MV $NUM,A0 ; reassign $NUM +|| MVK -128,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--(40) ; save frame pointer +|| [A0] MV SP,FP + [A0] STDW B13:B12,*SP[4] +|| [A0] MVK 0x00404,B1 + [A0] STDW B11:B10,*SP[3] +|| [A0] STDW A13:A12,*FP[-3] +|| [A0] MVKH 0x60000,B1 + [A0] STDW A11:A10,*SP[1] +|| [A0] MVC B1,AMR ; setup circular addressing +|| [A0] ADD B0,SP,SP ; alloca(128) + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC _sha512_block_data_order,B1 +|| [A0] MVKL (K512-_sha512_block_data_order),$K512 + [A0] MVKH (K512-_sha512_block_data_order),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + ADDAW SP,3,$Xilo + ADDAW SP,2,$Xihi + +|| MV $CTXA,$CTXB + LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx +|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo +|| ADD B1,$K512,$K512 + LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi +|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo + LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi +|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo + LDW 
*${CTXA}[6^.LITTLE_ENDIAN],$Dhi +|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo + LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi +|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo + LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi +|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo + LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi +|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo + LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi +|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo + + LDNDW *$INP++,B11:B10 ; pre-fetch input + LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] +outerloop?: + MVK 15,B0 ; loop counters +|| MVK 64,B1 +|| SUB A0,1,A0 + MV $Ahi,$Actxhi +|| MV $Alo,$Actxlo +|| MV $Bhi,$Bctxhi +|| MV $Blo,$Bctxlo +|| MV $Chi,$Cctxhi +|| MV $Clo,$Cctxlo +|| MVD $Dhi,$Dctxhi +|| MVD $Dlo,$Dctxlo + MV $Ehi,$Ectxhi +|| MV $Elo,$Ectxlo +|| MV $Fhi,$Fctxhi +|| MV $Flo,$Fctxlo +|| MV $Ghi,$Gctxhi +|| MV $Glo,$Gctxlo +|| MVD $Hhi,$Hctxhi +|| MVD $Hlo,$Hctxlo +loop0_15?: + .if .BIG_ENDIAN + MV B11,$T1hi +|| MV B10,$T1lo + .else + SWAP4 B10,$T1hi +|| SWAP4 B11,$T1lo + SWAP2 $T1hi,$T1hi +|| SWAP2 $T1lo,$T1lo + .endif +loop16_79?: + STW $T1hi,*$Xihi++[2] +|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo + XOR $Fhi,$Ghi,$CHhi +|| XOR $Flo,$Glo,$CHlo +|| ADD KHI,$T1hi,$T1hi +|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i] +|| SHRU $Elo,14,$t0lo +|| SHL $Elo,32-14,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Ehi,$CHhi,$CHhi +|| AND $Elo,$CHlo,$CHlo +|| ROTL $Ghi,0,$Hhi +|| ROTL $Glo,0,$Hlo ; h = g +|| SHRU $Ehi,18,$t0hi +|| SHL $Ehi,32-18,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| XOR $Ghi,$CHhi,$CHhi +|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g +|| ROTL $Fhi,0,$Ghi +|| ROTL $Flo,0,$Glo ; g = f +|| SHRU $Elo,18,$t0lo +|| SHL $Elo,32-18,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| OR $Ahi,$Bhi,$MAJhi +|| OR $Alo,$Blo,$MAJlo +|| ROTL $Ehi,0,$Fhi +|| ROTL $Elo,0,$Flo ; f = e +|| SHRU $Ehi,41-32,$t0lo +|| 
SHL $Ehi,64-41,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Chi,$MAJhi,$MAJhi +|| AND $Clo,$MAJlo,$MAJlo +|| ROTL $Dhi,0,$Ehi +|| ROTL $Dlo,0,$Elo ; e = d +|| SHRU $Elo,41-32,$t0hi +|| SHL $Elo,64-41,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e) +|| AND $Ahi,$Bhi,$t1hi +|| AND $Alo,$Blo,$t1lo +|| ROTL $Chi,0,$Dhi +|| ROTL $Clo,0,$Dlo ; d = c +|| SHRU $Ahi,28,$S0hi +|| SHL $Ahi,32-28,$S0lo + OR $t1hi,$MAJhi,$MAJhi +|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g) +|| ROTL $Bhi,0,$Chi +|| ROTL $Blo,0,$Clo ; c = b +|| SHRU $Alo,28,$t0lo +|| SHL $Alo,32-28,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e) +|| ROTL $Ahi,0,$Bhi +|| ROTL $Alo,0,$Blo ; b = a +|| SHRU $Ahi,34-32,$t0lo +|| SHL $Ahi,64-34,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $MAJhi,$T1hi,$T2hi +|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c) +|| SHRU $Alo,34-32,$t0hi +|| SHL $Alo,64-34,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $Ehi,$T1hi,$T1hi +|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e +|| [B0] BNOP loop0_15? +|| SHRU $Ahi,39-32,$t0lo +|| SHL $Ahi,64-39,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input +||[!B1] BNOP break? +|| SHRU $Alo,39-32,$t0hi +|| SHL $Alo,64-39,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a) +|| ADD $T1carry,$T1hi,$Ehi +|| MV $T1lo,$Elo ; e = T1 +||[!B0] LDW *${Xihi}[28],$T1hi +||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14] + ADD $S0hi,$T2hi,$T2hi +|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a) +|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i] + NOP ; avoid cross-path stall + ADD $T2carry,$T2hi,$Ahi +|| MV $T2lo,$Alo ; a = T2 +|| [B0] SUB B0,1,B0 +;;===== branch to loop00_15? 
is taken here + NOP +;;===== branch to break? is taken here + LDW *${Xihi}[2],$T2hi +|| LDW *${Xilo}[2],$T2lo ; X[i+1] +|| SHRU $T1hi,19,$S1hi +|| SHL $T1hi,32-19,$S1lo + SHRU $T1lo,19,$t0lo +|| SHL $T1lo,32-19,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,61-32,$t0lo +|| SHL $T1hi,64-61,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,61-32,$t0hi +|| SHL $T1lo,64-61,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,6,$t0hi +|| SHL $T1hi,32-6,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,6,$t0lo +|| LDW *${Xihi}[18],$T1hi +|| LDW *${Xilo}[18],$T1lo ; X[i+9] + XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14]) + +|| LDW *${Xihi}[0],$CHhi +|| LDW *${Xilo}[0],$CHlo ; X[i] +|| SHRU $T2hi,1,$S0hi +|| SHL $T2hi,32-1,$S0lo + SHRU $T2lo,1,$t0lo +|| SHL $T2lo,32-1,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2hi,8,$t0hi +|| SHL $T2hi,32-8,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2lo,8,$t0lo +|| SHL $T2lo,32-8,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1() +|| [B1] BNOP loop16_79? +|| SHRU $T2hi,7,$t0hi +|| SHL $T2hi,32-7,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i] +|| SHRU $T2lo,7,$t0lo + XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1] + + ADD $S0hi,$T1hi,$T1hi +|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0() +|| [B1] SUB B1,1,B1 + NOP ; avoid cross-path stall + ADD $T1carry,$T1hi,$T1hi +;;===== branch to loop16_79? 
is taken here + +break?: + ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx +|| ADDU $Alo,$Actxlo,$Actxlo:$Alo +|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input +|| [A0] ADDK -640,$K512 ; rewind pointer to K512 + ADD $Bhi,$Bctxhi,$Bhi +|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo +|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] + ADD $Chi,$Cctxhi,$Chi +|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo +|| ADD $Actxlo,$Ahi,$Ahi +||[!A0] MV $CTXA,$CTXB + ADD $Dhi,$Dctxhi,$Dhi +|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo +|| ADD $Bctxlo,$Bhi,$Bhi +||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx +||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN] + ADD $Ehi,$Ectxhi,$Ehi +|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo +|| ADD $Cctxlo,$Chi,$Chi +|| [A0] BNOP outerloop? +||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN] +||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN] + ADD $Fhi,$Fctxhi,$Fhi +|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo +|| ADD $Dctxlo,$Dhi,$Dhi +||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN] +||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN] + ADD $Ghi,$Gctxhi,$Ghi +|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo +|| ADD $Ectxlo,$Ehi,$Ehi +||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN] +||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN] + ADD $Hhi,$Hctxhi,$Hhi +|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo +|| ADD $Fctxlo,$Fhi,$Fhi +||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN] +||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN] + ADD $Gctxlo,$Ghi,$Ghi +||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN] +||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN] + ADD $Hctxlo,$Hhi,$Hhi +||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN] +||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN] +;;===== branch to outerloop? 
is taken here + + STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] +|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN] +|| MVK -40,B0 + ADD FP,B0,SP ; destroy circular buffer +|| LDDW *FP[-4],A11:A10 + LDDW *SP[2],A13:A12 +|| LDDW *FP[-2],B11:B10 + LDDW *SP[4],B13:B12 +|| BNOP RA + LDW *++SP(40),FP ; restore frame pointer + MVK 0,B0 + MVC B0,AMR ; clear AMR + NOP 2 ; wait till FP is committed + .endasmfunc + + .sect ".const:sha_asm" + .align 128 +K512: + .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd + .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc + .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 + .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 + .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe + .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 + .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 + .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 + .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 + .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 + .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 + .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 + .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 + .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 + .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 + .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 + .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 + .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df + .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 + .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b + .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 + .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 + .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 + .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 + .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 + .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 + .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb + .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 + .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 
+ .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec + .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 + .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b + .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 + .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 + .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 + .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b + .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 + .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c + .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a + .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 + .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl index ba5b250890..00e795b0ad 100644 --- a/crypto/sha/asm/sha512-mips.pl +++ b/crypto/sha/asm/sha512-mips.pl @@ -45,7 +45,7 @@ # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # -$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 +$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { $PTR_ADD="dadd"; # incidentally works even on n32 @@ -68,7 +68,7 @@ $pf = ($flavour =~ /nubi/i) ? 
$t0 : $t2; # ###################################################################### -$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; +$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0; for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; @@ -244,7 +244,7 @@ $code.=<<___; .text .set noat -#if !defined(__vxworks) || defined(__pic__) +#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__)) .option pic2 #endif @@ -351,7 +351,7 @@ $code.=<<___; $ST $G,6*$SZ($ctx) $ST $H,7*$SZ($ctx) - bnel $inp,@X[15],.Loop + bne $inp,@X[15],.Loop $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl index 6b44a68e59..4051119e5d 100755 --- a/crypto/sha/asm/sha512-ppc.pl +++ b/crypto/sha/asm/sha512-ppc.pl @@ -64,7 +64,7 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; if ($output =~ /512/) { - $func="sha512_block_data_order"; + $func="sha512_block_ppc"; $SZ=8; @Sigma0=(28,34,39); @Sigma1=(14,18,41); @@ -76,7 +76,7 @@ if ($output =~ /512/) { $ROR="rotrdi"; $SHR="srdi"; } else { - $func="sha256_block_data_order"; + $func="sha256_block_ppc"; $SZ=4; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @@ -243,7 +243,7 @@ Lunaligned: andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary beq Lcross_page $UCMP $num,$t1 - ble- Laligned ; didn't cross the page boundary + ble Laligned ; didn't cross the page boundary subfc $num,$t1,$num add $t1,$inp,$t1 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num @@ -279,7 +279,7 @@ Lmemcpy: $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num addic. 
$num,$num,`-16*$SZ` ; num-- - bne- Lunaligned + bne Lunaligned Ldone: $POP r0,`$FRAME+$LRSAVE`($sp) @@ -339,7 +339,7 @@ for(;$i<32;$i++) { unshift(@V,pop(@V)); } $code.=<<___; - bdnz- Lrounds + bdnz Lrounds $POP $ctx,`$FRAME-$SIZE_T*22`($sp) $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl new file mode 100755 index 0000000000..038292055c --- /dev/null +++ b/crypto/sha/asm/sha512p8-ppc.pl @@ -0,0 +1,431 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 for PowerISA v2.07. +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This module is +# ~60% faster than integer-only sha512-ppc.pl. To anchor to something +# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than +# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than +# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting +# result is degree of computational resources' utilization. POWER8 is +# "massively multi-threaded chip" and difference between single- and +# maximum multi-process benchmark results tells that utilization is +# whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and +# for sha1-ppc.pl - 73%. 100% means that multi-process result equals +# to single-process one, given that all threads end up on the same +# physical core. 
+# +####################################################################### +# +# SHA256/pre-2.07(*) SHA512/pre-2.07(*) SHA1(*) +# POWER8 9.3 /14.8 5.8 /9.5 7.1 +# +# (*) presented for reference/comparison purposes; + +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; +} else { die "nonsense $flavour"; } + +$LENDIAN=($flavour=~/le/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!"; + +if ($output =~ /512/) { + $bits=512; + $SZ=8; + $sz="d"; + $rounds=80; +} else { + $bits=256; + $SZ=4; + $sz="w"; + $rounds=64; +} + +$func="sha${bits}_block_p8"; +$FRAME=8*$SIZE_T; + +$sp ="r1"; +$toc="r2"; +$ctx="r3"; +$inp="r4"; +$num="r5"; +$Tbl="r6"; +$idx="r7"; +$lrsave="r8"; +$offload="r11"; +$vrsave="r12"; +($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31)); + $x00=0 if ($flavour =~ /osx/); + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7)); +@X=map("v$_",(8..23)); +($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31)); + +sub ROUND { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)%16; + +$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1)); + lvx_u @X[$i+1],0,$inp ; load X[i] in advance + addi $inp,$inp,16 +___ +$code.=<<___ if ($i<16 && ($i%(16/$SZ))); + vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ +___ +$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0); + vperm @X[$i],@X[$i],@X[$i],$lemask +___ +$code.=<<___; + `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)` + vsel $Func,$g,$f,$e ; Ch(e,f,g) + vshasigma${sz} $S1,$e,1,15 ; Sigma1(e) + vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i] + vshasigma${sz} $S0,$a,1,0 ; Sigma0(a) + `"vshasigma${sz} 
$s1,@X[($j+14)%16],0,15" if ($i>=15)` + vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g) + vxor $Func,$a,$b + `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)` + vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e) + vsel $Func,$b,$c,$Func ; Maj(a,b,c) + vaddu${sz}m $g,$g,$Ki ; future h+=K[i] + vaddu${sz}m $d,$d,$h ; d+=h + vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c) + `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)` + lvx $Ki,$idx,$Tbl ; load next K[i] + addi $idx,$idx,16 + vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c) + `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)` +___ +} + +$code=<<___; +.machine "any" +.text + +.globl $func +.align 6 +$func: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr $lrsave + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + mfspr $vrsave,256 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r11,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + mtspr 256,r11 + + bl LPICmeup + addi $offload,$sp,$FRAME+15 +___ +$code.=<<___ if ($LENDIAN); + li $idx,8 + lvsl $lemask,0,$idx + vspltisb $Ki,0x0f + vxor $lemask,$lemask,$Ki +___ +$code.=<<___ if ($SZ==4); + lvx_4w $A,$x00,$ctx + lvx_4w $E,$x10,$ctx + vsldoi $B,$A,$A,4 # unpack + vsldoi $C,$A,$A,8 + vsldoi $D,$A,$A,12 + vsldoi 
$F,$E,$E,4 + vsldoi $G,$E,$E,8 + vsldoi $H,$E,$E,12 +___ +$code.=<<___ if ($SZ==8); + lvx_u $A,$x00,$ctx + lvx_u $C,$x10,$ctx + lvx_u $E,$x20,$ctx + vsldoi $B,$A,$A,8 # unpack + lvx_u $G,$x30,$ctx + vsldoi $D,$C,$C,8 + vsldoi $F,$E,$E,8 + vsldoi $H,$G,$G,8 +___ +$code.=<<___; + li r0,`($rounds-16)/16` # inner loop counter + b Loop +.align 5 +Loop: + lvx $Ki,$x00,$Tbl + li $idx,16 + lvx_u @X[0],0,$inp + addi $inp,$inp,16 + stvx $A,$x00,$offload # offload $A-$H + stvx $B,$x10,$offload + stvx $C,$x20,$offload + stvx $D,$x30,$offload + stvx $E,$x40,$offload + stvx $F,$x50,$offload + stvx $G,$x60,$offload + stvx $H,$x70,$offload + vaddu${sz}m $H,$H,$Ki # h+K[i] + lvx $Ki,$idx,$Tbl + addi $idx,$idx,16 +___ +for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mtctr r0 + b L16_xx +.align 5 +L16_xx: +___ +for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bdnz L16_xx + + lvx @X[2],$x00,$offload + subic. $num,$num,1 + lvx @X[3],$x10,$offload + vaddu${sz}m $A,$A,@X[2] + lvx @X[4],$x20,$offload + vaddu${sz}m $B,$B,@X[3] + lvx @X[5],$x30,$offload + vaddu${sz}m $C,$C,@X[4] + lvx @X[6],$x40,$offload + vaddu${sz}m $D,$D,@X[5] + lvx @X[7],$x50,$offload + vaddu${sz}m $E,$E,@X[6] + lvx @X[8],$x60,$offload + vaddu${sz}m $F,$F,@X[7] + lvx @X[9],$x70,$offload + vaddu${sz}m $G,$G,@X[8] + vaddu${sz}m $H,$H,@X[9] + bne Loop +___ +$code.=<<___ if ($SZ==4); + lvx @X[0],$idx,$Tbl + addi $idx,$idx,16 + vperm $A,$A,$B,$Ki # pack the answer + lvx @X[1],$idx,$Tbl + vperm $E,$E,$F,$Ki + vperm $A,$A,$C,@X[0] + vperm $E,$E,$G,@X[0] + vperm $A,$A,$D,@X[1] + vperm $E,$E,$H,@X[1] + stvx_4w $A,$x00,$ctx + stvx_4w $E,$x10,$ctx +___ +$code.=<<___ if ($SZ==8); + vperm $A,$A,$B,$Ki # pack the answer + vperm $C,$C,$D,$Ki + vperm $E,$E,$F,$Ki + vperm $G,$G,$H,$Ki + stvx_u $A,$x00,$ctx + stvx_u $C,$x10,$ctx + stvx_u $E,$x20,$ctx + stvx_u $G,$x30,$ctx +___ +$code.=<<___; + li r10,`$FRAME+8*16+15` + mtlr $lrsave + li r11,`$FRAME+8*16+31` + mtspr 256,$vrsave + 
lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,4,1,0x80,6,3,0 + .long 0 +.size $func,.-$func +___ + +# Ugly hack here, because PPC assembler syntax seems to vary too +# much from platform to platform... +$code.=<<___; +.align 6 +LPICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl ; vvvvvv "distance" between . and 1st data entry + addi $Tbl,$Tbl,`64-8` + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` +___ + +if ($SZ==8) { + local *table = sub { + foreach(@_) { $code.=".quad $_,$_\n"; } + }; + table( + "0x428a2f98d728ae22","0x7137449123ef65cd", + "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc", + "0x3956c25bf348b538","0x59f111f1b605d019", + "0x923f82a4af194f9b","0xab1c5ed5da6d8118", + "0xd807aa98a3030242","0x12835b0145706fbe", + "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2", + "0x72be5d74f27b896f","0x80deb1fe3b1696b1", + "0x9bdc06a725c71235","0xc19bf174cf692694", + "0xe49b69c19ef14ad2","0xefbe4786384f25e3", + "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65", + "0x2de92c6f592b0275","0x4a7484aa6ea6e483", + "0x5cb0a9dcbd41fbd4","0x76f988da831153b5", + "0x983e5152ee66dfab","0xa831c66d2db43210", + "0xb00327c898fb213f","0xbf597fc7beef0ee4", + "0xc6e00bf33da88fc2","0xd5a79147930aa725", + "0x06ca6351e003826f","0x142929670a0e6e70", + "0x27b70a8546d22ffc","0x2e1b21385c26c926", + 
"0x4d2c6dfc5ac42aed","0x53380d139d95b3df", + "0x650a73548baf63de","0x766a0abb3c77b2a8", + "0x81c2c92e47edaee6","0x92722c851482353b", + "0xa2bfe8a14cf10364","0xa81a664bbc423001", + "0xc24b8b70d0f89791","0xc76c51a30654be30", + "0xd192e819d6ef5218","0xd69906245565a910", + "0xf40e35855771202a","0x106aa07032bbd1b8", + "0x19a4c116b8d2d0c8","0x1e376c085141ab53", + "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8", + "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb", + "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3", + "0x748f82ee5defb2fc","0x78a5636f43172f60", + "0x84c87814a1f0ab72","0x8cc702081a6439ec", + "0x90befffa23631e28","0xa4506cebde82bde9", + "0xbef9a3f7b2c67915","0xc67178f2e372532b", + "0xca273eceea26619c","0xd186b8c721c0c207", + "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178", + "0x06f067aa72176fba","0x0a637dc5a2c898a6", + "0x113f9804bef90dae","0x1b710b35131c471b", + "0x28db77f523047d84","0x32caab7b40c72493", + "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c", + "0x4cc5d4becb3e42b6","0x597f299cfc657e2a", + "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0"); +$code.=<<___ if (!$LENDIAN); +.quad 0x0001020304050607,0x1011121314151617 +___ +$code.=<<___ if ($LENDIAN); # quad-swapped +.quad 0x1011121314151617,0x0001020304050607 +___ +} else { + local *table = sub { + foreach(@_) { $code.=".long $_,$_,$_,$_\n"; } + }; + table( + "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5", + "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5", + "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3", + "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174", + "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc", + "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da", + "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7", + "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967", + "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13", + "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85", + "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3", + "0xd192e819","0xd6990624","0xf40e3585","0x106aa070", + 
"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5", + "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3", + "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208", + "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0"); +$code.=<<___ if (!$LENDIAN); +.long 0x00010203,0x10111213,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x08090a0b,0x10111213 +___ +$code.=<<___ if ($LENDIAN); # word-swapped +.long 0x10111213,0x10111213,0x10111213,0x00010203 +.long 0x10111213,0x10111213,0x04050607,0x00010203 +.long 0x10111213,0x08090a0b,0x04050607,0x00010203 +___ +} +$code.=<<___; +.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by " +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/uid.c b/crypto/uid.c index b1fd52bada..a1261731c9 100644 --- a/crypto/uid.c +++ b/crypto/uid.c @@ -65,7 +65,7 @@ int OPENSSL_issetugid(void) return issetugid(); } -#elif defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYS_NETWARE) +#elif defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYS_NETWARE) || defined(_TMS320C6X) int OPENSSL_issetugid(void) { diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl index 6595ff35fc..e8eaef7582 100644 --- a/crypto/x86cpuid.pl +++ b/crypto/x86cpuid.pl @@ -119,10 +119,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &mov ("esi","edx"); &or ("ebp","ecx"); # merge AMD XOP flag - &bt ("ecx",26); # check XSAVE bit - &jnc (&label("done")); &bt ("ecx",27); # check OSXSAVE bit - &jnc (&label("clear_xmm")); + &jnc (&label("clear_avx")); &xor ("ecx","ecx"); &data_byte(0x0f,0x01,0xd0); # xgetbv &and ("eax",6); diff --git a/doc/crypto/SSLeay_version.pod b/doc/crypto/SSLeay_version.pod deleted file mode 100644 index 1500c2af91..0000000000 --- a/doc/crypto/SSLeay_version.pod +++ /dev/null @@ -1,74 +0,0 @@ -=pod - -=head1 NAME - -SSLeay_version - retrieve version/build information about OpenSSL library - 
-=head1 SYNOPSIS - - #include <openssl/crypto.h> - - const char *SSLeay_version(int type); - -=head1 DESCRIPTION - -SSLeay_version() returns a pointer to a constant string describing the -version of the OpenSSL library or giving information about the library -build. - -The following B<type> values are supported: - -=over 4 - -=item SSLEAY_VERSION - -The version of the OpenSSL library including the release date. - -=item SSLEAY_CFLAGS - -The compiler flags set for the compilation process in the form -"compiler: ..." if available or "compiler: information not available" -otherwise. - -=item SSLEAY_BUILT_ON - -The date of the build process in the form "built on: ..." if available -or "built on: date not available" otherwise. - -=item SSLEAY_PLATFORM - -The "Configure" target of the library build in the form "platform: ..." -if available or "platform: information not available" otherwise. - -=item SSLEAY_DIR - -The "OPENSSLDIR" setting of the library build in the form "OPENSSLDIR: "..."" -if available or "OPENSSLDIR: N/A" otherwise. - -=back - -=head1 RETURN VALUES - -The following return values can occur: - -=over 4 - -=item "not available" - -An invalid value for B<type> was given. - -=item Pointer to constant string - -Textual description. - -=back - -=head1 SEE ALSO - -L<crypto(3)|crypto(3)> - -=head1 HISTORY - -B<SSLEAY_DIR> was added in OpenSSL 0.9.7. 
- -=cut diff --git a/e_os.h b/e_os.h index 79c1392573..efe58fb97e 100644 --- a/e_os.h +++ b/e_os.h @@ -306,7 +306,7 @@ static unsigned int _strlen31(const char *str) # undef isupper # undef isxdigit # endif -# if defined(_MSC_VER) && !defined(_DLL) && defined(stdin) +# if defined(_MSC_VER) && !defined(_WIN32_WCE) && !defined(_DLL) && defined(stdin) # if _MSC_VER>=1300 # undef stdin # undef stdout @@ -332,8 +332,10 @@ static unsigned int _strlen31(const char *str) # endif # endif # endif -# include -# include +# if !defined(OPENSSL_FIPSCANISTER) +# include +# include +# endif # ifdef OPENSSL_SYS_WINCE # define OPENSSL_NO_POSIX_IO @@ -668,7 +670,7 @@ extern char *sys_errlist[]; extern int sys_nerr; #if defined(OPENSSL_SYS_WINDOWS) # define strcasecmp _stricmp # define strncasecmp _strnicmp -#elif defined(OPENSSL_SYS_VMS) +#elif defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_DSPBIOS) /* VMS below version 7.0 doesn't have strcasecmp() */ # include "o_str.h" # define strcasecmp OPENSSL_strcasecmp diff --git a/fips/aes/fips_aesavs.c b/fips/aes/fips_aesavs.c index 84bcbac32a..cc3ed6afb1 100644 --- a/fips/aes/fips_aesavs.c +++ b/fips/aes/fips_aesavs.c @@ -99,7 +99,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx, { const EVP_CIPHER *cipher = NULL; - if (strcasecmp(amode, "CBC") == 0) + if (fips_strcasecmp(amode, "CBC") == 0) { switch (akeysz) { @@ -117,7 +117,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx, } } - else if (strcasecmp(amode, "ECB") == 0) + else if (fips_strcasecmp(amode, "ECB") == 0) { switch (akeysz) { @@ -134,7 +134,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx, break; } } - else if (strcasecmp(amode, "CFB128") == 0) + else if (fips_strcasecmp(amode, "CFB128") == 0) { switch (akeysz) { @@ -169,7 +169,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx, break; } } - else if(!strcasecmp(amode,"CFB1")) + else if(!fips_strcasecmp(amode,"CFB1")) { switch (akeysz) { @@ -186,7 +186,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx, break; } } - else if(!strcasecmp(amode,"CFB8")) + else 
if(!fips_strcasecmp(amode,"CFB8")) { switch (akeysz) { @@ -215,7 +215,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx, } if (FIPS_cipherinit(ctx, cipher, aKey, iVec, dir) <= 0) return 0; - if(!strcasecmp(amode,"CFB1")) + if(!fips_strcasecmp(amode,"CFB1")) M_EVP_CIPHER_CTX_set_flags(ctx, EVP_CIPH_FLAG_LENGTH_BITS); if (dir) FIPS_cipher(ctx, ciphertext, plaintext, len); @@ -535,7 +535,7 @@ static int do_mct(char *amode, } } } - + FIPS_cipher_ctx_cleanup(&ctx); return ret; } @@ -554,7 +554,7 @@ static int proc_file(char *rqfile, char *rspfile) FILE *afp = NULL, *rfp = NULL; char ibuf[2048]; char tbuf[2048]; - int ilen, len, ret = 0; + int len; char algo[8] = ""; char amode[8] = ""; char atest[8] = ""; @@ -605,7 +605,6 @@ static int proc_file(char *rqfile, char *rspfile) while (!err && (fgets(ibuf, sizeof(ibuf), afp)) != NULL) { tidy_line(tbuf, ibuf); - ilen = strlen(ibuf); /* printf("step=%d ibuf=%s",step,ibuf); */ switch (step) { @@ -636,10 +635,8 @@ static int proc_file(char *rqfile, char *rspfile) char *xp, *pp = ibuf+2; int n; if (akeysz) - { /* insert current time & date */ - time_t rtim = time(0); - fputs("# ", rfp); - copy_line(ctime(&rtim), rfp); + { + copy_line(ibuf, rfp); } else { @@ -780,11 +777,11 @@ static int proc_file(char *rqfile, char *rspfile) if(do_mct(amode, akeysz, aKey, iVec, dir, (unsigned char*)plaintext, len, rfp) < 0) - EXIT(1); + err = 1; } else { - ret = AESTest(&ctx, amode, akeysz, aKey, iVec, + AESTest(&ctx, amode, akeysz, aKey, iVec, dir, /* 0 = decrypt, 1 = encrypt */ plaintext, ciphertext, len); OutputValue("CIPHERTEXT",ciphertext,len,rfp, @@ -822,7 +819,7 @@ static int proc_file(char *rqfile, char *rspfile) } else { - ret = AESTest(&ctx, amode, akeysz, aKey, iVec, + AESTest(&ctx, amode, akeysz, aKey, iVec, dir, /* 0 = decrypt, 1 = encrypt */ plaintext, ciphertext, len); OutputValue("PLAINTEXT",(unsigned char *)plaintext,len,rfp, @@ -850,6 +847,7 @@ static int proc_file(char *rqfile, char *rspfile) fclose(rfp); if (afp) fclose(afp); + 
FIPS_cipher_ctx_cleanup(&ctx); return err; } @@ -862,23 +860,26 @@ static int proc_file(char *rqfile, char *rspfile) aes_test -d xxxxx.xxx The default is: -d req.txt --------------------------------------------------*/ +#ifdef FIPS_ALGVS +int fips_aesavs_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { char *rqlist = "req.txt", *rspfile = NULL; FILE *fp = NULL; char fn[250] = "", rfn[256] = ""; - int f_opt = 0, d_opt = 1; + int d_opt = 1; fips_algtest_init(); if (argc > 1) { - if (strcasecmp(argv[1], "-d") == 0) + if (fips_strcasecmp(argv[1], "-d") == 0) { d_opt = 1; } - else if (strcasecmp(argv[1], "-f") == 0) + else if (fips_strcasecmp(argv[1], "-f") == 0) { - f_opt = 1; d_opt = 0; } else @@ -915,7 +916,7 @@ int main(int argc, char **argv) if (proc_file(rfn, rspfile)) { printf(">>> Processing failed for: %s <<<\n", rfn); - EXIT(1); + return 1; } } fclose(fp); @@ -929,7 +930,6 @@ int main(int argc, char **argv) printf(">>> Processing failed for: %s <<<\n", fn); } } - EXIT(0); return 0; } diff --git a/fips/aes/fips_gcmtest.c b/fips/aes/fips_gcmtest.c index 3839de8f8a..30e4bcc0f4 100644 --- a/fips/aes/fips_gcmtest.c +++ b/fips/aes/fips_gcmtest.c @@ -75,10 +75,11 @@ int main(int argc, char **argv) #include "fips_utl.h" +static char buf[204800]; +static char lbuf[204800]; + static void gcmtest(FILE *in, FILE *out, int encrypt) { - char buf[2048]; - char lbuf[2048]; char *keyword, *value; int keylen = -1, ivlen = -1, aadlen = -1, taglen = -1, ptlen = -1; int rv; @@ -261,16 +262,14 @@ static void gcmtest(FILE *in, FILE *out, int encrypt) iv = aad = ct = pt = key = tag = NULL; } } + FIPS_cipher_ctx_cleanup(&ctx); } static void xtstest(FILE *in, FILE *out) { - char buf[204800]; - char lbuf[204800]; char *keyword, *value; int inlen = 0; int encrypt = 0; - int rv; long l; unsigned char *key = NULL, *iv = NULL; unsigned char *inbuf = NULL, *outbuf = NULL; @@ -326,7 +325,7 @@ static void xtstest(FILE *in, FILE *out) { FIPS_cipherinit(&ctx, xts, 
key, iv, encrypt); outbuf = OPENSSL_malloc(inlen); - rv = FIPS_cipher(&ctx, outbuf, inbuf, inlen); + FIPS_cipher(&ctx, outbuf, inbuf, inlen); OutputValue(encrypt ? "CT":"PT", outbuf, inlen, out, 0); OPENSSL_free(inbuf); OPENSSL_free(outbuf); @@ -335,12 +334,11 @@ static void xtstest(FILE *in, FILE *out) iv = key = inbuf = outbuf = NULL; } } + FIPS_cipher_ctx_cleanup(&ctx); } static void ccmtest(FILE *in, FILE *out) { - char buf[200048]; - char lbuf[200048]; char *keyword, *value; long l; unsigned char *Key = NULL, *Nonce = NULL; @@ -428,6 +426,8 @@ static void ccmtest(FILE *in, FILE *out) } else if (!strcmp(keyword,"Adata")) { + if (Adata) + OPENSSL_free(Adata); Adata = hex2bin_m(value, &l); if (Alen && l != Alen) { @@ -493,10 +493,16 @@ static void ccmtest(FILE *in, FILE *out) OPENSSL_free(Key); if (Nonce) OPENSSL_free(Nonce); + if (Adata) + OPENSSL_free(Adata); FIPS_cipher_ctx_cleanup(&ctx); } -int main(int argc,char **argv) +#ifdef FIPS_ALGVS +int fips_gcmtest_main(int argc, char **argv) +#else +int main(int argc, char **argv) +#endif { int encrypt; int xts = 0, ccm = 0; diff --git a/fips/cmac/fips_cmactest.c b/fips/cmac/fips_cmactest.c index 6d799f2d5f..2c8c7664e9 100644 --- a/fips/cmac/fips_cmactest.c +++ b/fips/cmac/fips_cmactest.c @@ -92,7 +92,11 @@ static int print_cmac_ver(const EVP_CIPHER *cipher, FILE *out, unsigned char *Mac, int Maclen, int Tlen); +#ifdef FIPS_ALGVS +int fips_cmactest_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; int mode = 0; /* 0 => Generate, 1 => Verify */ diff --git a/fips/des/fips_des_selftest.c b/fips/des/fips_des_selftest.c index a014f6f33f..fdf1eb6945 100644 --- a/fips/des/fips_des_selftest.c +++ b/fips/des/fips_des_selftest.c @@ -83,7 +83,7 @@ static const struct int FIPS_selftest_des() { - int n, ret = 0; + int n, ret = 1; EVP_CIPHER_CTX ctx; FIPS_cipher_ctx_init(&ctx); @@ -93,10 +93,8 @@ int FIPS_selftest_des() if (!fips_cipher_test(FIPS_TEST_CIPHER, &ctx, 
EVP_des_ede3_ecb(), tests3[n].key, NULL, tests3[n].plaintext, tests3[n].ciphertext, 8)) - goto err; + ret = 0; } - ret = 1; - err: FIPS_cipher_ctx_cleanup(&ctx); if (ret == 0) FIPSerr(FIPS_F_FIPS_SELFTEST_DES,FIPS_R_SELFTEST_FAILED); diff --git a/fips/des/fips_desmovs.c b/fips/des/fips_desmovs.c index e8766561ce..0ffab89e2f 100644 --- a/fips/des/fips_desmovs.c +++ b/fips/des/fips_desmovs.c @@ -102,7 +102,7 @@ static int DESTest(EVP_CIPHER_CTX *ctx, if (akeysz != 192) { printf("Invalid key size: %d\n", akeysz); - EXIT(1); + return 0; } if (fips_strcasecmp(amode, "CBC") == 0) @@ -120,7 +120,7 @@ static int DESTest(EVP_CIPHER_CTX *ctx, else { printf("Unknown mode: %s\n", amode); - EXIT(1); + return 0; } if (FIPS_cipherinit(ctx, cipher, aKey, iVec, dir) <= 0) @@ -155,12 +155,12 @@ static void shiftin(unsigned char *dst,unsigned char *src,int nbits) } /*-----------------------------------------------*/ -char *t_tag[2] = {"PLAINTEXT", "CIPHERTEXT"}; -char *t_mode[6] = {"CBC","ECB","OFB","CFB1","CFB8","CFB64"}; -enum Mode {CBC, ECB, OFB, CFB1, CFB8, CFB64}; +char *tdes_t_tag[2] = {"PLAINTEXT", "CIPHERTEXT"}; +char *tdes_t_mode[6] = {"CBC","ECB","OFB","CFB1","CFB8","CFB64"}; +enum tdes_Mode {TCBC, TECB, TOFB, TCFB1, TCFB8, TCFB64}; int Sizes[6]={64,64,64,1,8,64}; -static void do_mct(char *amode, +static int do_tmct(char *amode, int akeysz, int numkeys, unsigned char *akey,unsigned char *ivec, int dir, unsigned char *text, int len, FILE *rfp) @@ -170,12 +170,12 @@ static void do_mct(char *amode, unsigned char text0[8]; for (imode=0 ; imode < 6 ; ++imode) - if(!strcmp(amode,t_mode[imode])) + if(!strcmp(amode,tdes_t_mode[imode])) break; if (imode == 6) { printf("Unrecognized mode: %s\n", amode); - EXIT(1); + return 0; } for(i=0 ; i < 400 ; ++i) { @@ -196,12 +196,12 @@ static void do_mct(char *amode, OutputValue("",akey+n*8,8,rfp,0); } - if(imode != ECB) + if(imode != TECB) OutputValue("IV",ivec,8,rfp,0); - OutputValue(t_tag[dir^1],text,len,rfp,imode == CFB1); + 
OutputValue(tdes_t_tag[dir^1],text,len,rfp,imode == TCFB1); #if 0 /* compensate for endianness */ - if(imode == CFB1) + if(imode == TCFB1) text[0]<<=7; #endif memcpy(text0,text,8); @@ -223,18 +223,18 @@ static void do_mct(char *amode, } if(j == 9999) { - OutputValue(t_tag[dir],text,len,rfp,imode == CFB1); + OutputValue(tdes_t_tag[dir],text,len,rfp,imode == TCFB1); /* memcpy(ivec,text,8); */ } /* DebugValue("iv",ctx.iv,8); */ /* accumulate material for the next key */ shiftin(nk,text,Sizes[imode]); /* DebugValue("nk",nk,24);*/ - if((dir && (imode == CFB1 || imode == CFB8 || imode == CFB64 - || imode == CBC)) || imode == OFB) + if((dir && (imode == TCFB1 || imode == TCFB8 + || imode == TCFB64 || imode == TCBC)) || imode == TOFB) memcpy(text,old_iv,8); - if(!dir && (imode == CFB1 || imode == CFB8 || imode == CFB64)) + if(!dir && (imode == TCFB1 || imode == TCFB8 || imode == TCFB64)) { /* the test specifies using the output of the raw DES operation which we don't have, so reconstruct it... */ @@ -260,18 +260,20 @@ static void do_mct(char *amode, /* pointless exercise - the final text doesn't depend on the initial text in OFB mode, so who cares what it is? (Who designed these tests?) 
*/ - if(imode == OFB) + if(imode == TOFB) for(n=0 ; n < 8 ; ++n) text[n]=text0[n]^old_iv[n]; + FIPS_cipher_ctx_cleanup(&ctx); } + return 1; } -static int proc_file(char *rqfile, char *rspfile) +static int tproc_file(char *rqfile, char *rspfile) { char afn[256], rfn[256]; FILE *afp = NULL, *rfp = NULL; char ibuf[2048], tbuf[2048]; - int ilen, len, ret = 0; + int len; char amode[8] = ""; char atest[100] = ""; int akeysz=0; @@ -322,7 +324,6 @@ static int proc_file(char *rqfile, char *rspfile) while (!err && (fgets(ibuf, sizeof(ibuf), afp)) != NULL) { tidy_line(tbuf, ibuf); - ilen = strlen(ibuf); /* printf("step=%d ibuf=%s",step,ibuf);*/ if(step == 3 && !strcmp(amode,"ECB")) { @@ -355,10 +356,8 @@ static int proc_file(char *rqfile, char *rspfile) char *xp, *pp = ibuf+2; int n; if(*amode) - { /* insert current time & date */ - time_t rtim = time(0); - fputs("# ", rfp); - copy_line(ctime(&rtim), rfp); + { + copy_line(ibuf, rfp); } else { @@ -546,12 +545,14 @@ static int proc_file(char *rqfile, char *rspfile) PrintValue("PLAINTEXT", (unsigned char*)plaintext, len); if (strcmp(atest, "Monte") == 0) /* Monte Carlo Test */ { - do_mct(amode,akeysz,numkeys,aKey,iVec,dir,plaintext,len,rfp); + if (!do_tmct(amode,akeysz,numkeys,aKey,iVec, + dir,plaintext,len,rfp)) + return -1; } else { assert(dir == 1); - ret = DESTest(&ctx, amode, akeysz, aKey, iVec, + DESTest(&ctx, amode, akeysz, aKey, iVec, dir, /* 0 = decrypt, 1 = encrypt */ ciphertext, plaintext, len); OutputValue("CIPHERTEXT",ciphertext,len,rfp, @@ -585,13 +586,13 @@ static int proc_file(char *rqfile, char *rspfile) PrintValue("CIPHERTEXT", ciphertext, len); if (strcmp(atest, "Monte") == 0) /* Monte Carlo Test */ { - do_mct(amode, akeysz, numkeys, aKey, iVec, + do_tmct(amode, akeysz, numkeys, aKey, iVec, dir, ciphertext, len, rfp); } else { assert(dir == 0); - ret = DESTest(&ctx, amode, akeysz, aKey, iVec, + DESTest(&ctx, amode, akeysz, aKey, iVec, dir, /* 0 = decrypt, 1 = encrypt */ plaintext, ciphertext, len); 
OutputValue("PLAINTEXT",(unsigned char *)plaintext,len,rfp, @@ -619,6 +620,7 @@ static int proc_file(char *rqfile, char *rspfile) fclose(rfp); if (afp) fclose(afp); + FIPS_cipher_ctx_cleanup(&ctx); return err; } @@ -631,12 +633,16 @@ static int proc_file(char *rqfile, char *rspfile) aes_test -d xxxxx.xxx The default is: -d req.txt --------------------------------------------------*/ +#ifdef FIPS_ALGVS +int fips_desmovs_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { char *rqlist = "req.txt", *rspfile = NULL; FILE *fp = NULL; char fn[250] = "", rfn[256] = ""; - int f_opt = 0, d_opt = 1; + int d_opt = 1; fips_algtest_init(); if (argc > 1) @@ -647,7 +653,6 @@ int main(int argc, char **argv) } else if (fips_strcasecmp(argv[1], "-f") == 0) { - f_opt = 1; d_opt = 0; } else @@ -680,10 +685,10 @@ int main(int argc, char **argv) strtok(fn, "\r\n"); strcpy(rfn, fn); printf("Processing: %s\n", rfn); - if (proc_file(rfn, rspfile)) + if (tproc_file(rfn, rspfile)) { printf(">>> Processing failed for: %s <<<\n", rfn); - EXIT(1); + return -1; } } fclose(fp); @@ -692,12 +697,11 @@ int main(int argc, char **argv) { if (VERBOSE) printf("Processing: %s\n", fn); - if (proc_file(fn, rspfile)) + if (tproc_file(fn, rspfile)) { printf(">>> Processing failed for: %s <<<\n", fn); } } - EXIT(0); return 0; } diff --git a/fips/dh/fips_dhvs.c b/fips/dh/fips_dhvs.c index ad760c8aaa..a925e13c7d 100644 --- a/fips/dh/fips_dhvs.c +++ b/fips/dh/fips_dhvs.c @@ -145,8 +145,12 @@ static void output_Zhash(FILE *out, int exout, OPENSSL_cleanse(Z, Zlen); OPENSSL_free(Z); } - -int main(int argc,char **argv) + +#ifdef FIPS_ALGVS +int fips_dhvs_main(int argc, char **argv) +#else +int main(int argc, char **argv) +#endif { char **args = argv + 1; int argn = argc - 1; @@ -275,10 +279,14 @@ int main(int argc,char **argv) rhash, rhashlen); } } + if (in && in != stdin) + fclose(in); + if (out && out != stdout) + fclose(out); return 0; parse_error: fprintf(stderr, "Error Parsing request 
file\n"); - exit(1); + return 1; } #endif diff --git a/fips/dsa/fips_dsa_sign.c b/fips/dsa/fips_dsa_sign.c index ea1bd87303..274bcd9016 100644 --- a/fips/dsa/fips_dsa_sign.c +++ b/fips/dsa/fips_dsa_sign.c @@ -114,4 +114,28 @@ int FIPS_dsa_verify_digest(DSA *dsa, return dsa->meth->dsa_do_verify(dig,dlen,s,dsa); } +int FIPS_dsa_verify(DSA *dsa, const unsigned char *msg, size_t msglen, + const EVP_MD *mhash, DSA_SIG *s) + { + int ret=-1; + unsigned char dig[EVP_MAX_MD_SIZE]; + unsigned int dlen; + FIPS_digest(msg, msglen, dig, &dlen, mhash); + ret=FIPS_dsa_verify_digest(dsa, dig, dlen, s); + OPENSSL_cleanse(dig, dlen); + return ret; + } + +DSA_SIG * FIPS_dsa_sign(DSA *dsa, const unsigned char *msg, size_t msglen, + const EVP_MD *mhash) + { + DSA_SIG *s; + unsigned char dig[EVP_MAX_MD_SIZE]; + unsigned int dlen; + FIPS_digest(msg, msglen, dig, &dlen, mhash); + s = FIPS_dsa_sign_digest(dsa, dig, dlen); + OPENSSL_cleanse(dig, dlen); + return s; + } + #endif diff --git a/fips/dsa/fips_dsatest.c b/fips/dsa/fips_dsatest.c index 64d52258eb..3ea600e4ab 100644 --- a/fips/dsa/fips_dsatest.c +++ b/fips/dsa/fips_dsatest.c @@ -62,8 +62,10 @@ #include #include #include +#ifndef NO_SYS_TYPES_H #include #include +#endif #include "e_os.h" @@ -154,9 +156,7 @@ int main(int argc, char **argv) unsigned char buf[256]; unsigned long h; BN_GENCB cb; - EVP_MD_CTX mctx; BN_GENCB_set(&cb, dsa_cb, stderr); - FIPS_md_ctx_init(&mctx); fips_algtest_init(); @@ -210,19 +210,11 @@ int main(int argc, char **argv) } DSA_generate_key(dsa); - if (!FIPS_digestinit(&mctx, EVP_sha1())) - goto end; - if (!FIPS_digestupdate(&mctx, str1, 20)) - goto end; - sig = FIPS_dsa_sign_ctx(dsa, &mctx); + sig = FIPS_dsa_sign(dsa, str1, 20, EVP_sha1()); if (!sig) goto end; - if (!FIPS_digestinit(&mctx, EVP_sha1())) - goto end; - if (!FIPS_digestupdate(&mctx, str1, 20)) - goto end; - if (FIPS_dsa_verify_ctx(dsa, &mctx, sig) != 1) + if (FIPS_dsa_verify(dsa, str1, 20, EVP_sha1(), sig) != 1) goto end; ret = 1; @@ -231,7 +223,6 
@@ end: if (sig) FIPS_dsa_sig_free(sig); if (dsa != NULL) FIPS_dsa_free(dsa); - FIPS_md_ctx_cleanup(&mctx); #if 0 CRYPTO_mem_leaks(bio_err); #endif diff --git a/fips/dsa/fips_dssvs.c b/fips/dsa/fips_dssvs.c index 45bca7c155..bd7055d463 100644 --- a/fips/dsa/fips_dssvs.c +++ b/fips/dsa/fips_dssvs.c @@ -46,7 +46,8 @@ static int parse_mod(char *line, int *pdsa2, int *pL, int *pN, if (strcmp(keyword, "L")) return 0; *pL = atoi(value); - strcpy(line, p + 1); + strcpy(lbuf, p + 1); + strcpy(line, lbuf); if (pmd) p = strchr(line, ','); else @@ -199,6 +200,7 @@ static void pqg(FILE *in, FILE *out) { fprintf(out, "counter = %d" RESP_EOL RESP_EOL, counter); } + FIPS_dsa_free(dsa); } } else if(!strcmp(keyword,"P")) @@ -519,6 +521,8 @@ static void keyver(FILE *in, FILE *out) BN_free(g); if (Y2) BN_free(Y2); + if (ctx) + BN_CTX_free(ctx); } static void keypair(FILE *in, FILE *out) @@ -549,6 +553,11 @@ static void keypair(FILE *in, FILE *out) int n=atoi(value); dsa = FIPS_dsa_new(); + if (!dsa) + { + fprintf(stderr, "DSA allocation error\n"); + exit(1); + } if (!dsa2 && !dsa_builtin_paramgen(dsa, L, N, NULL, NULL, 0, NULL, NULL, NULL, NULL)) { @@ -575,6 +584,7 @@ static void keypair(FILE *in, FILE *out) do_bn_print_name(out, "Y",dsa->pub_key); fputs(RESP_EOL, out); } + FIPS_dsa_free(dsa); } } } @@ -627,9 +637,7 @@ static void siggen(FILE *in, FILE *out) { unsigned char msg[1024]; int n; - EVP_MD_CTX mctx; DSA_SIG *sig; - FIPS_md_ctx_init(&mctx); n=hex2bin(value,msg); @@ -637,19 +645,16 @@ static void siggen(FILE *in, FILE *out) exit(1); do_bn_print_name(out, "Y",dsa->pub_key); - FIPS_digestinit(&mctx, md); - FIPS_digestupdate(&mctx, msg, n); - sig = FIPS_dsa_sign_ctx(dsa, &mctx); + sig = FIPS_dsa_sign(dsa, msg, n, md); do_bn_print_name(out, "R",sig->r); do_bn_print_name(out, "S",sig->s); fputs(RESP_EOL, out); FIPS_dsa_sig_free(sig); - FIPS_md_ctx_cleanup(&mctx); } } - if (dsa) - FIPS_dsa_free(dsa); + if (dsa) + FIPS_dsa_free(dsa); } static void sigver(FILE *in, FILE *out) @@ 
-687,37 +692,48 @@ static void sigver(FILE *in, FILE *out) dsa = FIPS_dsa_new(); } else if(!strcmp(keyword,"P")) - dsa->p=hex2bn(value); + do_hex2bn(&dsa->p, value); else if(!strcmp(keyword,"Q")) - dsa->q=hex2bn(value); + do_hex2bn(&dsa->q, value); else if(!strcmp(keyword,"G")) - dsa->g=hex2bn(value); + do_hex2bn(&dsa->g, value); else if(!strcmp(keyword,"Msg")) n=hex2bin(value,msg); else if(!strcmp(keyword,"Y")) - dsa->pub_key=hex2bn(value); + do_hex2bn(&dsa->pub_key, value); else if(!strcmp(keyword,"R")) sig->r=hex2bn(value); else if(!strcmp(keyword,"S")) { - EVP_MD_CTX mctx; int r; - FIPS_md_ctx_init(&mctx); sig->s=hex2bn(value); - FIPS_digestinit(&mctx, md); - FIPS_digestupdate(&mctx, msg, n); no_err = 1; - r = FIPS_dsa_verify_ctx(dsa, &mctx, sig); + r = FIPS_dsa_verify(dsa, msg, n, md, sig); no_err = 0; - FIPS_md_ctx_cleanup(&mctx); + if (sig->s) + { + BN_free(sig->s); + sig->s = NULL; + } + if (sig->r) + { + BN_free(sig->r); + sig->r = NULL; + } fprintf(out, "Result = %c" RESP_EOL RESP_EOL, r == 1 ? 
'P' : 'F'); } } + if (dsa) + FIPS_dsa_free(dsa); } -int main(int argc,char **argv) +#ifdef FIPS_ALGVS +int fips_dssvs_main(int argc, char **argv) +#else +int main(int argc, char **argv) +#endif { FILE *in, *out; if (argc == 4) diff --git a/fips/ecdh/fips_ecdh_selftest.c b/fips/ecdh/fips_ecdh_selftest.c index 2b21ceaf48..0b16c57aae 100644 --- a/fips/ecdh/fips_ecdh_selftest.c +++ b/fips/ecdh/fips_ecdh_selftest.c @@ -166,6 +166,7 @@ int FIPS_selftest_ecdh(void) rv = -1; goto err; } + EC_KEY_set_flags(ec1, EC_FLAG_COFACTOR_ECDH); if (!EC_KEY_set_public_key_affine_coordinates(ec1, x, y)) { @@ -194,6 +195,7 @@ int FIPS_selftest_ecdh(void) rv = -1; goto err; } + EC_KEY_set_flags(ec2, EC_FLAG_COFACTOR_ECDH); if (!EC_KEY_set_public_key_affine_coordinates(ec2, x, y)) { diff --git a/fips/ecdh/fips_ecdhvs.c b/fips/ecdh/fips_ecdhvs.c index 72ebe815dd..a1422868b3 100644 --- a/fips/ecdh/fips_ecdhvs.c +++ b/fips/ecdh/fips_ecdhvs.c @@ -76,7 +76,7 @@ int main(int argc, char **argv) #include "fips_utl.h" -static const EVP_MD *parse_md(char *line) +static const EVP_MD *eparse_md(char *line) { char *p; if (line[0] != '[' || line[1] != 'E') @@ -261,6 +261,7 @@ static void ec_output_Zhash(FILE *out, int exout, EC_GROUP *group, unsigned char chash[EVP_MAX_MD_SIZE]; int Zlen; ec = EC_KEY_new(); + EC_KEY_set_flags(ec, EC_FLAG_COFACTOR_ECDH); EC_KEY_set_group(ec, group); peerkey = make_peer(group, cx, cy); if (rhash == NULL) @@ -301,7 +302,11 @@ static void ec_output_Zhash(FILE *out, int exout, EC_GROUP *group, EC_POINT_free(peerkey); } -int main(int argc,char **argv) +#ifdef FIPS_ALGVS +int fips_ecdhvs_main(int argc, char **argv) +#else +int main(int argc, char **argv) +#endif { char **args = argv + 1; int argn = argc - 1; @@ -315,6 +320,7 @@ int main(int argc,char **argv) EC_GROUP *group = NULL; char *keyword = NULL, *value = NULL; int do_verify = -1, exout = 0; + int rv = 1; int curve_nids[5] = {0,0,0,0,0}; int param_set = -1; @@ -408,11 +414,16 @@ int main(int argc,char **argv) if
(group) EC_GROUP_free(group); group = EC_GROUP_new_by_curve_name(nid); + if (!group) + { + fprintf(stderr, "ERROR: unsupported curve %s\n", buf + 1); + return 1; + } } if (strlen(buf) > 6 && !strncmp(buf, "[E", 2)) { - md = parse_md(buf); + md = eparse_md(buf); if (md == NULL) goto parse_error; continue; @@ -459,10 +470,27 @@ int main(int argc,char **argv) md, rhash, rhashlen); } } - return 0; + rv = 0; parse_error: - fprintf(stderr, "Error Parsing request file\n"); - exit(1); + if (id) + BN_free(id); + if (ix) + BN_free(ix); + if (iy) + BN_free(iy); + if (cx) + BN_free(cx); + if (cy) + BN_free(cy); + if (group) + EC_GROUP_free(group); + if (in && in != stdin) + fclose(in); + if (out && out != stdout) + fclose(out); + if (rv) + fprintf(stderr, "Error Parsing request file\n"); + return rv; } #endif diff --git a/fips/ecdsa/fips_ecdsa_selftest.c b/fips/ecdsa/fips_ecdsa_selftest.c index 7d1007e19d..6ceb1c37b8 100644 --- a/fips/ecdsa/fips_ecdsa_selftest.c +++ b/fips/ecdsa/fips_ecdsa_selftest.c @@ -143,7 +143,7 @@ int FIPS_selftest_ecdsa() EC_KEY *ec = NULL; BIGNUM *x = NULL, *y = NULL, *d = NULL; EVP_PKEY pk; - int rv = 0; + int rv = 0, test_err = 0; size_t i; for (i = 0; i < sizeof(test_ec_data)/sizeof(EC_SELFTEST_DATA); i++) @@ -173,12 +173,12 @@ int FIPS_selftest_ecdsa() if (!fips_pkey_signature_test(FIPS_TEST_SIGNATURE, &pk, NULL, 0, NULL, 0, EVP_sha512(), 0, ecd->name)) - goto err; + test_err = 1; EC_KEY_free(ec); ec = NULL; } - - rv = 1; + if (test_err == 0) + rv = 1; err: diff --git a/fips/ecdsa/fips_ecdsa_sign.c b/fips/ecdsa/fips_ecdsa_sign.c index 0e86a50ef4..a7839ee592 100644 --- a/fips/ecdsa/fips_ecdsa_sign.c +++ b/fips/ecdsa/fips_ecdsa_sign.c @@ -87,3 +87,28 @@ int FIPS_ecdsa_verify_ctx(EC_KEY *key, EVP_MD_CTX *ctx, ECDSA_SIG *s) return ret; } +int FIPS_ecdsa_verify(EC_KEY *key, const unsigned char *msg, size_t msglen, + const EVP_MD *mhash, ECDSA_SIG *s) + { + int ret=-1; + unsigned char dig[EVP_MAX_MD_SIZE]; + unsigned int dlen; + FIPS_digest(msg, msglen, 
dig, &dlen, mhash); + ret=FIPS_ecdsa_verify_digest(key, dig, dlen, s); + OPENSSL_cleanse(dig, dlen); + return ret; + } + +ECDSA_SIG * FIPS_ecdsa_sign(EC_KEY *key, + const unsigned char *msg, size_t msglen, + const EVP_MD *mhash) + { + ECDSA_SIG *s; + unsigned char dig[EVP_MAX_MD_SIZE]; + unsigned int dlen; + FIPS_digest(msg, msglen, dig, &dlen, mhash); + s = FIPS_ecdsa_sign_digest(key, dig, dlen); + OPENSSL_cleanse(dig, dlen); + return s; + } + diff --git a/fips/ecdsa/fips_ecdsavs.c b/fips/ecdsa/fips_ecdsavs.c index 898951a2c8..5745a6d37a 100644 --- a/fips/ecdsa/fips_ecdsavs.c +++ b/fips/ecdsa/fips_ecdsavs.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) #include -static int lookup_curve(char *in, char *curve_name, const EVP_MD **pmd) +static int elookup_curve(char *in, char *curve_name, const EVP_MD **pmd) { char *cname, *p; /* Copy buffer as we will change it */ @@ -200,7 +200,7 @@ static int KeyPair(FILE *in, FILE *out) if (*buf == '[' && buf[2] == '-') { if (buf[2] == '-') - curve_nid = lookup_curve(buf, lbuf, NULL); + curve_nid = elookup_curve(buf, lbuf, NULL); fputs(buf, out); continue; } @@ -260,7 +260,7 @@ static int PKV(FILE *in, FILE *out) fputs(buf, out); if (*buf == '[' && buf[2] == '-') { - curve_nid = lookup_curve(buf, lbuf, NULL); + curve_nid = elookup_curve(buf, lbuf, NULL); if (curve_nid == NID_undef) return 0; @@ -287,10 +287,13 @@ static int PKV(FILE *in, FILE *out) no_err = 1; rv = EC_KEY_set_public_key_affine_coordinates(key, Qx, Qy); no_err = 0; + EC_KEY_free(key); fprintf(out, "Result = %s" RESP_EOL, rv ? 
"P":"F"); } } + BN_free(Qx); + BN_free(Qy); return 1; } @@ -305,8 +308,6 @@ static int SigGen(FILE *in, FILE *out) EC_KEY *key = NULL; ECDSA_SIG *sig = NULL; const EVP_MD *digest = NULL; - EVP_MD_CTX mctx; - EVP_MD_CTX_init(&mctx); Qx = BN_new(); Qy = BN_new(); while(fgets(buf, sizeof buf, in) != NULL) @@ -314,7 +315,7 @@ static int SigGen(FILE *in, FILE *out) fputs(buf, out); if (*buf == '[') { - curve_nid = lookup_curve(buf, lbuf, &digest); + curve_nid = elookup_curve(buf, lbuf, &digest); if (curve_nid == NID_undef) return 0; } @@ -342,9 +343,7 @@ static int SigGen(FILE *in, FILE *out) return 0; } - FIPS_digestinit(&mctx, digest); - FIPS_digestupdate(&mctx, msg, mlen); - sig = FIPS_ecdsa_sign_ctx(key, &mctx); + sig = FIPS_ecdsa_sign(key, msg, mlen, digest); if (!sig) { @@ -358,7 +357,7 @@ static int SigGen(FILE *in, FILE *out) do_bn_print_name(out, "S", sig->s); EC_KEY_free(key); - + OPENSSL_free(msg); FIPS_ecdsa_sig_free(sig); } @@ -366,7 +365,6 @@ static int SigGen(FILE *in, FILE *out) } BN_free(Qx); BN_free(Qy); - FIPS_md_ctx_cleanup(&mctx); return 1; } @@ -381,8 +379,6 @@ static int SigVer(FILE *in, FILE *out) EC_KEY *key = NULL; ECDSA_SIG sg, *sig = &sg; const EVP_MD *digest = NULL; - EVP_MD_CTX mctx; - EVP_MD_CTX_init(&mctx); sig->r = NULL; sig->s = NULL; while(fgets(buf, sizeof buf, in) != NULL) @@ -390,7 +386,7 @@ static int SigVer(FILE *in, FILE *out) fputs(buf, out); if (*buf == '[') { - curve_nid = lookup_curve(buf, lbuf, &digest); + curve_nid = elookup_curve(buf, lbuf, &digest); if (curve_nid == NID_undef) return 0; } @@ -447,20 +443,32 @@ static int SigVer(FILE *in, FILE *out) return 0; } - FIPS_digestinit(&mctx, digest); - FIPS_digestupdate(&mctx, msg, mlen); no_err = 1; - rv = FIPS_ecdsa_verify_ctx(key, &mctx, sig); + rv = FIPS_ecdsa_verify(key, msg, mlen, digest, sig); + EC_KEY_free(key); + if (msg) + OPENSSL_free(msg); no_err = 0; fprintf(out, "Result = %s" RESP_EOL, rv ? 
"P":"F"); } } + if (sig->r) + BN_free(sig->r); + if (sig->s) + BN_free(sig->s); + if (Qx) + BN_free(Qx); + if (Qy) + BN_free(Qy); return 1; } - +#ifdef FIPS_ALGVS +int fips_ecdsavs_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; const char *cmd = argv[1]; diff --git a/fips/fips.c b/fips/fips.c index 36ac8d1b0c..0269609a7e 100644 --- a/fips/fips.c +++ b/fips/fips.c @@ -81,7 +81,7 @@ static int fips_started = 0; static int fips_is_owning_thread(void); static int fips_set_owning_thread(void); static int fips_clear_owning_thread(void); -static unsigned char *fips_signature_witness(void); +static const unsigned char *fips_signature_witness(void); #define fips_w_lock() CRYPTO_w_lock(CRYPTO_LOCK_FIPS) #define fips_w_unlock() CRYPTO_w_unlock(CRYPTO_LOCK_FIPS) @@ -148,7 +148,10 @@ void fips_set_selftest_fail(void) extern const void *FIPS_text_start(), *FIPS_text_end(); extern const unsigned char FIPS_rodata_start[], FIPS_rodata_end[]; -unsigned char FIPS_signature [20] = { 0 }; +#ifdef _TMS320C6X +const +#endif +unsigned char FIPS_signature [20] = { 0, 0xff }; __fips_constseg static const char FIPS_hmac_key[]="etaonrishdlcupfm"; @@ -413,9 +416,8 @@ int fips_clear_owning_thread(void) return ret; } -unsigned char *fips_signature_witness(void) +const unsigned char *fips_signature_witness(void) { - extern unsigned char FIPS_signature[]; return FIPS_signature; } diff --git a/fips/fips.h b/fips/fips.h index 4cadbd26fd..b6263575c3 100644 --- a/fips/fips.h +++ b/fips/fips.h @@ -97,9 +97,8 @@ int FIPS_selftest_rsa(void); int FIPS_selftest_dsa(void); int FIPS_selftest_ecdsa(void); int FIPS_selftest_ecdh(void); -void FIPS_corrupt_drbg(void); -void FIPS_x931_stick(void); -void FIPS_drbg_stick(void); +void FIPS_x931_stick(int onoff); +void FIPS_drbg_stick(int onoff); int FIPS_selftest_x931(void); int FIPS_selftest_hmac(void); int FIPS_selftest_drbg(void); @@ -224,6 +223,16 @@ int FIPS_rsa_verify_digest(struct rsa_st *rsa, const 
struct env_md_st *mgf1Hash, const unsigned char *sigbuf, unsigned int siglen); +int FIPS_rsa_sign(struct rsa_st *rsa, const unsigned char *msg, int msglen, + const struct env_md_st *mhash, int rsa_pad_mode, + int saltlen, const struct env_md_st *mgf1Hash, + unsigned char *sigret, unsigned int *siglen); + +int FIPS_rsa_verify(struct rsa_st *rsa, const unsigned char *msg, int msglen, + const struct env_md_st *mhash, int rsa_pad_mode, + int saltlen, const struct env_md_st *mgf1Hash, + const unsigned char *sigbuf, unsigned int siglen); + #ifdef OPENSSL_FIPSCAPABLE int FIPS_digestinit(EVP_MD_CTX *ctx, const EVP_MD *type); diff --git a/fips/fips_canister.c b/fips/fips_canister.c index 7d67d32d6c..daf53cb40d 100644 --- a/fips/fips_canister.c +++ b/fips/fips_canister.c @@ -29,11 +29,15 @@ const void *FIPS_text_end(void); #if !defined(FIPS_REF_POINT_IS_CROSS_COMPILER_AWARE) # if (defined(__ANDROID__) && (defined(__arm__) || defined(__arm) || \ + defined(__aarch64__) || \ defined(__i386__)|| defined(__i386))) || \ (defined(__vxworks) && (defined(__ppc__) || defined(__ppc) || \ defined(__mips__)|| defined(__mips))) || \ + (defined(__NetBSD__) && (defined(__powerpc__) || defined(__i386))) || \ (defined(__linux) && ((defined(__PPC__) && !defined(__PPC64__)) || \ defined(__arm__) || defined(__arm))) || \ + (defined(__APPLE__) /* verified on all MacOS X & iOS flavors */)|| \ + (defined(_TMS320C6X)) || \ (defined(_WIN32) && defined(_MSC_VER)) # define FIPS_REF_POINT_IS_CROSS_COMPILER_AWARE # endif @@ -69,6 +73,10 @@ const unsigned int FIPS_text_startX[]= # pragma const_seg("fipsro$a") # pragma const_seg() __declspec(allocate("fipsro$a")) +# elif defined(_TMS320C6X) +# pragma CODE_SECTION(instruction_pointer,".fips_text:start") +# pragma CODE_SECTION(FIPS_ref_point,".fips_text:start") +# pragma DATA_SECTION(FIPS_rodata_start,".fips_const:start") # endif const unsigned int FIPS_rodata_start[]= { 0x46495053, 0x5f726f64, 0x6174615f, 0x73746172 }; @@ -86,6 +94,10 @@ const unsigned int 
FIPS_text_endX[]= # pragma const_seg("fipsro$z") # pragma const_seg() __declspec(allocate("fipsro$z")) +# elif defined(_TMS320C6X) +# pragma CODE_SECTION(instruction_pointer,".fips_text:end") +# pragma CODE_SECTION(FIPS_ref_point,".fips_text:end") +# pragma DATA_SECTION(FIPS_rodata_end,".fips_const:end") # endif const unsigned int FIPS_rodata_end[]= { 0x46495053, 0x5f726f64, 0x6174615f, 0x656e645b }; diff --git a/fips/fips_locl.h b/fips/fips_locl.h index df3863f91e..6efa93194e 100644 --- a/fips/fips_locl.h +++ b/fips/fips_locl.h @@ -67,8 +67,8 @@ int fips_post_failed(int id, int subid, void *ex); int fips_post_corrupt(int id, int subid, void *ex); int fips_post_status(void); -#define FIPS_MODULE_VERSION_NUMBER 0x20000000L -#define FIPS_MODULE_VERSION_TEXT "FIPS 2.0-dev unvalidated test module xx XXX xxxx" +#define FIPS_MODULE_VERSION_NUMBER 0x20000009L +#define FIPS_MODULE_VERSION_TEXT "FIPS 2.0-rc9 unvalidated test module xx XXX xxxx" #ifdef __cplusplus } diff --git a/fips/fips_post.c b/fips/fips_post.c index e55ec08407..8cd2334362 100644 --- a/fips/fips_post.c +++ b/fips/fips_post.c @@ -207,7 +207,6 @@ int fips_pkey_signature_test(int id, EVP_PKEY *pkey, const char *fail_str) { int subid; - void *ex = NULL; int ret = 0; unsigned char *sig = NULL; unsigned int siglen; @@ -335,7 +334,7 @@ int fips_pkey_signature_test(int id, EVP_PKEY *pkey, FIPSerr(FIPS_F_FIPS_PKEY_SIGNATURE_TEST,FIPS_R_TEST_FAILURE); if (fail_str) FIPS_add_error_data(2, "Type=", fail_str); - fips_post_failed(id, subid, ex); + fips_post_failed(id, subid, pkey); return 0; } return fips_post_success(id, subid, pkey); diff --git a/fips/fips_premain.c b/fips/fips_premain.c index a7c8b78f8f..b6ec32db4e 100644 --- a/fips/fips_premain.c +++ b/fips/fips_premain.c @@ -7,7 +7,7 @@ #include #include #include -#if defined(__unix) || defined(__unix__) || defined(__vxworks) || defined(__ANDROID__) +#if defined(__unix) || defined(__unix__) || defined(__vxworks) || defined(__ANDROID__) || defined(__APPLE__) 
#include #endif @@ -53,6 +53,12 @@ int lib$initialize(); globaldef int (*lib_init_ref)() = lib$initialize; # pragma __standard +#elif defined(_TMS320C6X) +# if defined(__TI_EABI__) + asm("\t.sect \".init_array\"\n\t.align 4\n\t.field FINGERPRINT_premain,32"); +# else + asm("\t.sect \".pinit\"\n\t.align 4\n\t.field _FINGERPRINT_premain,32"); +# endif #elif 0 The rest has to be taken care of through command line: @@ -134,6 +140,9 @@ void FINGERPRINT_premain(void) } #endif } while(0); +#if defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC) + fips_openssl_cpuid_setup(); +#endif } #else diff --git a/fips/fips_premain.c.sha1 b/fips/fips_premain.c.sha1 index e0332e8afd..19c30807a7 100644 --- a/fips/fips_premain.c.sha1 +++ b/fips/fips_premain.c.sha1 @@ -1 +1 @@ -HMAC-SHA1(fips_premain.c)= a401afd9c2b57f0f11d2b34b6d0c9815b1fe6a66 +HMAC-SHA1(fips_premain.c)= 2bfb57ef540bdba29220a45d65e1b4080de9adc1 diff --git a/fips/fips_test_suite.c b/fips/fips_test_suite.c index b7aea4e9cd..cd4aafbd12 100644 --- a/fips/fips_test_suite.c +++ b/fips/fips_test_suite.c @@ -40,12 +40,46 @@ int main(int argc, char *argv[]) #include #include +#include #include #include #include #include "fips_utl.h" +static int verbose = 0; + +static int fips_module_mode_set_verbose(int mode, const char *pass) + { + int rv; + if (verbose) + printf("Attempting to %s FIPS mode\n", mode ? 
"Enter" : "Leave"); + rv = FIPS_module_mode_set(mode, pass); + if (verbose) + printf("FIPS_module_mode() returned %d\n", FIPS_module_mode()); + return rv; + } + +static void do_print_rsa_key(RSA *rsa) + { + if (!verbose) + return; + do_bn_print_name(stdout, "RSA key modulus value", rsa->e); + do_bn_print_name(stdout, "RSA key publicExponent value", rsa->n); + do_bn_print_name(stdout, "RSA key pricateExponent value", rsa->d); + do_bn_print_name(stdout, "RSA key prime1 value", rsa->p); + do_bn_print_name(stdout, "RSA key prime2 value", rsa->q); + do_bn_print_name(stdout, "RSA key exponent1 value", rsa->dmp1); + do_bn_print_name(stdout, "RSA key exponent2 value", rsa->dmq1); + do_bn_print_name(stdout, "RSA key coefficient value", rsa->iqmp); + } + +static void do_print_buf(char *name, unsigned char *buf, int buflen) + { + if (verbose) + OutputValue(name, buf, buflen, stdout, 0); + } + /* AES: encrypt and decrypt known plaintext, verify result matches original plaintext */ static int FIPS_aes_test(void) @@ -57,14 +91,30 @@ static int FIPS_aes_test(void) unsigned char plaintext[16] = "etaonrishdlcu"; EVP_CIPHER_CTX ctx; FIPS_cipher_ctx_init(&ctx); + if (verbose) + { + do_print_buf("Key", key, sizeof(key)); + do_print_buf("Plaintext", plaintext, sizeof(plaintext)); + } if (FIPS_cipherinit(&ctx, EVP_aes_128_ecb(), key, NULL, 1) <= 0) goto err; FIPS_cipher(&ctx, citmp, plaintext, 16); + if (verbose) + { + do_print_buf("Ciphertext", citmp, sizeof(plaintext)); + printf("AES 128 bit ECB mode decryption started\n"); + } if (FIPS_cipherinit(&ctx, EVP_aes_128_ecb(), key, NULL, 0) <= 0) goto err; FIPS_cipher(&ctx, pltmp, citmp, 16); + do_print_buf("Recovered Plaintext", pltmp, sizeof(plaintext)); if (memcmp(pltmp, plaintext, 16)) + { + printf("Comparison failure!!\n"); goto err; + } + if (verbose) + printf("Comparison success.\n"); ret = 1; err: FIPS_cipher_ctx_cleanup(&ctx); @@ -83,6 +133,13 @@ static int FIPS_aes_gcm_test(void) unsigned char plaintext[16] = "etaonrishdlcu"; 
EVP_CIPHER_CTX ctx; FIPS_cipher_ctx_init(&ctx); + if (verbose) + { + do_print_buf("Key", key, sizeof(key)); + do_print_buf("IV", key, sizeof(iv)); + do_print_buf("Plaintext", plaintext, sizeof(plaintext)); + do_print_buf("AAD", aad, sizeof(aad)); + } if (FIPS_cipherinit(&ctx, EVP_aes_128_gcm(), key, iv, 1) <= 0) goto err; FIPS_cipher(&ctx, NULL, aad, sizeof(aad)); @@ -91,6 +148,12 @@ static int FIPS_aes_gcm_test(void) if (!FIPS_cipher_ctx_ctrl(&ctx, EVP_CTRL_GCM_GET_TAG, 16, tagtmp)) goto err; + if (verbose) + { + do_print_buf("Ciphertext", citmp, sizeof(citmp)); + do_print_buf("Tag", tagtmp, sizeof(tagtmp)); + } + if (FIPS_cipherinit(&ctx, EVP_aes_128_gcm(), key, iv, 0) <= 0) goto err; if (!FIPS_cipher_ctx_ctrl(&ctx, EVP_CTRL_GCM_SET_TAG, 16, tagtmp)) @@ -103,8 +166,17 @@ static int FIPS_aes_gcm_test(void) if (FIPS_cipher(&ctx, NULL, NULL, 0) < 0) goto err; + if (verbose) + do_print_buf("Recovered Plaintext", pltmp, sizeof(plaintext)); + if (memcmp(pltmp, plaintext, 16)) + { + if (verbose) + printf("Comparison failure!!\n"); goto err; + } + + printf("Comparison sucess.\n"); ret = 1; err: @@ -122,20 +194,110 @@ static int FIPS_des3_test(void) unsigned char plaintext[] = { 'e', 't', 'a', 'o', 'n', 'r', 'i', 's' }; EVP_CIPHER_CTX ctx; FIPS_cipher_ctx_init(&ctx); + if (verbose) + { + do_print_buf("Key", key, sizeof(key)); + do_print_buf("Plaintext", plaintext, sizeof(plaintext)); + } if (FIPS_cipherinit(&ctx, EVP_des_ede3_ecb(), key, NULL, 1) <= 0) goto err; FIPS_cipher(&ctx, citmp, plaintext, 8); + if (verbose) + { + do_print_buf("Ciphertext", citmp, sizeof(plaintext)); + printf("DES3 ECB mode decryption\n"); + } if (FIPS_cipherinit(&ctx, EVP_des_ede3_ecb(), key, NULL, 0) <= 0) goto err; FIPS_cipher(&ctx, pltmp, citmp, 8); + if (verbose) + do_print_buf("Recovered Plaintext", pltmp, sizeof(plaintext)); if (memcmp(pltmp, plaintext, 8)) + { + if (verbose) + printf("Comparison failure!!\n"); + goto err; + } + if (verbose) + printf("Comparison success\n"); ret = 1; err: 
FIPS_cipher_ctx_cleanup(&ctx); return ret; } +/* + * ECDSA: generate keys and sign, verify input plaintext. + */ +static int FIPS_ecdsa_test(void) + { + EC_KEY *ec = NULL; + unsigned char dgst[] = "etaonrishdlc"; + int r = 0; + ECDSA_SIG *sig = NULL; + + ERR_clear_error(); + ec = FIPS_ec_key_new_by_curve_name(NID_X9_62_prime256v1); + if (!ec) + goto end; + if (!FIPS_ec_key_generate_key(ec)) + goto end; + + if (verbose) + { + BIGNUM *Qx, *Qy; + BN_CTX *ctx; + const EC_GROUP *grp; + const EC_POINT *pt; + const BIGNUM *priv; + Qx = BN_new(); + Qy = BN_new(); + ctx = BN_CTX_new(); + grp = EC_KEY_get0_group(ec); + pt = EC_KEY_get0_public_key(ec); + priv = EC_KEY_get0_private_key(ec); + printf("EC Key using P-256\n"); + if (!EC_POINT_get_affine_coordinates_GFp(grp, pt, Qx, Qy, ctx)) + goto end; + + do_bn_print_name(stdout, "ECDSA key x coordinate", Qx); + do_bn_print_name(stdout, "ECDSA key y coordinate", Qy); + do_bn_print_name(stdout, "ECDSA key private value", priv); + BN_free(Qx); + BN_free(Qy); + BN_CTX_free(ctx); + printf("Signing string \"%s\" using SHA256\n", dgst); + } + + sig = FIPS_ecdsa_sign(ec, dgst, sizeof(dgst) -1, EVP_sha256()); + if (!sig) + { + if (verbose) + printf("Signing Failed!!\n"); + goto end; + } + + if (verbose) + { + printf("Signing successful\n"); + do_bn_print_name(stdout, "ECDSA signature r value", sig->r); + do_bn_print_name(stdout, "ECDSA signature s value", sig->s); + } + + r = FIPS_ecdsa_verify(ec, dgst, sizeof(dgst) -1, EVP_sha256(), sig); + if (verbose) + printf("ECDSA verification %s\n", r ? "Successful." : "Failed!!"); + end: + if (sig) + FIPS_ecdsa_sig_free(sig); + if (ec) + FIPS_ec_key_free(ec); + if (r != 1) + return 0; + return 1; + } + /* * DSA: generate keys and sign, verify input plaintext. 
*/ @@ -144,11 +306,9 @@ static int FIPS_dsa_test(int bad) DSA *dsa = NULL; unsigned char dgst[] = "etaonrishdlc"; int r = 0; - EVP_MD_CTX mctx; DSA_SIG *sig = NULL; ERR_clear_error(); - FIPS_md_ctx_init(&mctx); dsa = FIPS_dsa_new(); if (!dsa) goto end; @@ -159,23 +319,37 @@ static int FIPS_dsa_test(int bad) if (bad) BN_add_word(dsa->pub_key, 1); - if (!FIPS_digestinit(&mctx, EVP_sha256())) - goto end; - if (!FIPS_digestupdate(&mctx, dgst, sizeof(dgst) - 1)) - goto end; - sig = FIPS_dsa_sign_ctx(dsa, &mctx); - if (!sig) - goto end; + if (verbose) + { + do_bn_print_name(stdout, "DSA key p value", dsa->p); + do_bn_print_name(stdout, "DSA key q value", dsa->q); + do_bn_print_name(stdout, "DSA key g value", dsa->g); + do_bn_print_name(stdout, "DSA key public_key value", dsa->pub_key); + do_bn_print_name(stdout, "DSA key private key value", dsa->priv_key); + printf("Signing string \"%s\" using SHA256\n", dgst); + } - if (!FIPS_digestinit(&mctx, EVP_sha256())) + sig = FIPS_dsa_sign(dsa, dgst, sizeof(dgst) -1, EVP_sha256()); + if (!sig) + { + if (verbose) + printf("Signing Failed!!\n"); goto end; - if (!FIPS_digestupdate(&mctx, dgst, sizeof(dgst) - 1)) - goto end; - r = FIPS_dsa_verify_ctx(dsa, &mctx, sig); + } + + if (verbose) + { + printf("Signing successful\n"); + do_bn_print_name(stdout, "DSA signature r value", sig->r); + do_bn_print_name(stdout, "DSA signature s value", sig->s); + } + + r = FIPS_dsa_verify(dsa, dgst, sizeof(dgst) -1, EVP_sha256(), sig); + if (verbose) + printf("DSA verification %s\n", r ? "Successful." 
: "Failed!!"); end: if (sig) FIPS_dsa_sig_free(sig); - FIPS_md_ctx_cleanup(&mctx); if (dsa) FIPS_dsa_free(dsa); if (r != 1) @@ -193,11 +367,9 @@ static int FIPS_rsa_test(int bad) unsigned char buf[256]; unsigned int slen; BIGNUM *bn; - EVP_MD_CTX mctx; int r = 0; ERR_clear_error(); - FIPS_md_ctx_init(&mctx); key = FIPS_rsa_new(); bn = BN_new(); if (!key || !bn) @@ -209,20 +381,31 @@ static int FIPS_rsa_test(int bad) if (bad) BN_add_word(key->n, 1); - if (!FIPS_digestinit(&mctx, EVP_sha256())) - goto end; - if (!FIPS_digestupdate(&mctx, input_ptext, sizeof(input_ptext) - 1)) - goto end; - if (!FIPS_rsa_sign_ctx(key, &mctx, RSA_PKCS1_PADDING, 0, NULL, buf, &slen)) - goto end; + if (verbose) + { + do_print_rsa_key(key); + printf("Signing string \"%s\" using SHA256\n", input_ptext); + } - if (!FIPS_digestinit(&mctx, EVP_sha256())) + if (!FIPS_rsa_sign(key, input_ptext, sizeof(input_ptext) - 1, EVP_sha256(), + RSA_PKCS1_PADDING, 0, NULL, buf, &slen)) + { + if (verbose) + printf("RSA Signing failed!!\n"); goto end; - if (!FIPS_digestupdate(&mctx, input_ptext, sizeof(input_ptext) - 1)) - goto end; - r = FIPS_rsa_verify_ctx(key, &mctx, RSA_PKCS1_PADDING, 0, NULL, buf, slen); + } + + if (verbose) + { + printf("RSA signing successul\n"); + do_print_buf("RSA signature", buf, slen); + } + + r = FIPS_rsa_verify(key, input_ptext, sizeof(input_ptext) - 1, EVP_sha256(), + RSA_PKCS1_PADDING, 0, NULL, buf, slen); + if (verbose) + printf("RSA Verification %s\n", r == 1 ? 
"Successful" : "Failed!!"); end: - FIPS_md_ctx_cleanup(&mctx); if (key) FIPS_rsa_free(key); if (r != 1) @@ -243,6 +426,11 @@ static int FIPS_sha1_test() ERR_clear_error(); if (!FIPS_digest(str,sizeof(str) - 1,md, NULL, EVP_sha1())) return 0; + if (verbose) + { + printf("Digesting string %s\n", str); + do_print_buf("Digest value", md, sizeof(md)); + } if (memcmp(md,digest,sizeof(md))) return 0; return 1; @@ -262,6 +450,11 @@ static int FIPS_sha256_test() ERR_clear_error(); if (!FIPS_digest(str,sizeof(str) - 1,md, NULL, EVP_sha256())) return 0; + if (verbose) + { + printf("Digesting string %s\n", str); + do_print_buf("Digest value", md, sizeof(md)); + } if (memcmp(md,digest,sizeof(md))) return 0; return 1; @@ -283,6 +476,11 @@ static int FIPS_sha512_test() ERR_clear_error(); if (!FIPS_digest(str,sizeof(str) - 1,md, NULL, EVP_sha512())) return 0; + if (verbose) + { + printf("Digesting string %s\n", str); + do_print_buf("Digest value", md, sizeof(md)); + } if (memcmp(md,digest,sizeof(md))) return 0; return 1; @@ -304,8 +502,19 @@ static int FIPS_hmac_sha1_test() ERR_clear_error(); if (!HMAC(EVP_sha1(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0; + if (verbose) + { + do_print_buf("HMAC key", key, sizeof(key) -1); + do_print_buf("HMAC input", iv, sizeof(iv) -1); + do_print_buf("HMAC output", out, outlen); + } if (memcmp(out,kaval,outlen)) + { + if (verbose) + printf("HMAC comparison failed!!\n"); return 0; + } + printf("HMAC comparison successful.\n"); return 1; } @@ -325,6 +534,19 @@ static int FIPS_hmac_sha224_test() ERR_clear_error(); if (!HMAC(EVP_sha224(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0; + if (verbose) + { + do_print_buf("HMAC key", key, sizeof(key) -1); + do_print_buf("HMAC input", iv, sizeof(iv) -1); + do_print_buf("HMAC output", out, outlen); + } + if (memcmp(out,kaval,outlen)) + { + if (verbose) + printf("HMAC comparison failed!!\n"); + return 0; + } + printf("HMAC comparison successful.\n"); if (memcmp(out,kaval,outlen)) 
return 0; return 1; @@ -346,8 +568,19 @@ static int FIPS_hmac_sha256_test() ERR_clear_error(); if (!HMAC(EVP_sha256(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0; + if (verbose) + { + do_print_buf("HMAC key", key, sizeof(key) -1); + do_print_buf("HMAC input", iv, sizeof(iv) -1); + do_print_buf("HMAC output", out, outlen); + } if (memcmp(out,kaval,outlen)) + { + if (verbose) + printf("HMAC comparison failed!!\n"); return 0; + } + printf("HMAC comparison successful.\n"); return 1; } @@ -368,8 +601,19 @@ static int FIPS_hmac_sha384_test() ERR_clear_error(); if (!HMAC(EVP_sha384(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0; + if (verbose) + { + do_print_buf("HMAC key", key, sizeof(key) -1); + do_print_buf("HMAC input", iv, sizeof(iv) -1); + do_print_buf("HMAC output", out, outlen); + } if (memcmp(out,kaval,outlen)) + { + if (verbose) + printf("HMAC comparison failed!!\n"); return 0; + } + printf("HMAC comparison successful.\n"); return 1; } @@ -391,8 +635,19 @@ static int FIPS_hmac_sha512_test() ERR_clear_error(); if (!HMAC(EVP_sha512(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0; + if (verbose) + { + do_print_buf("HMAC key", key, sizeof(key) -1); + do_print_buf("HMAC input", iv, sizeof(iv) -1); + do_print_buf("HMAC output", out, outlen); + } if (memcmp(out,kaval,outlen)) + { + if (verbose) + printf("HMAC comparison failed!!\n"); return 0; + } + printf("HMAC comparison successful.\n"); return 1; } @@ -427,18 +682,15 @@ static int FIPS_cmac_aes128_test() out = OPENSSL_malloc(outlen); if (!CMAC_Final(ctx, out, &outlen)) goto end; -#if 0 - { - char *hexout = OPENSSL_malloc(outlen * 2 + 1); - bin2hex(out, outlen, hexout); - printf("CMAC-AES128: res = %s\n", hexout); - OPENSSL_free(hexout); - } - r = 1; -#else if (!memcmp(out,kaval,outlen)) r = 1; -#endif + if (verbose) + { + do_print_buf("CMAC key", key, sizeof(key)); + do_print_buf("CMAC input", data, sizeof(data) -1); + do_print_buf("CMAC output", out, outlen); + printf("CMAC 
comparison %s\n", r == 1 ? "successful." : "Failed!!"); + } end: CMAC_CTX_free(ctx); if (out) @@ -478,18 +730,15 @@ static int FIPS_cmac_aes192_test() out = OPENSSL_malloc(outlen); if (!CMAC_Final(ctx, out, &outlen)) goto end; -#if 0 - { - char *hexout = OPENSSL_malloc(outlen * 2 + 1); - bin2hex(out, outlen, hexout); - printf("CMAC-AES192: res = %s\n", hexout); - OPENSSL_free(hexout); - } - r = 1; -#else if (!memcmp(out,kaval,outlen)) r = 1; -#endif + if (verbose) + { + do_print_buf("CMAC key", key, sizeof(key)); + do_print_buf("CMAC input", data, sizeof(data) -1); + do_print_buf("CMAC output", out, outlen); + printf("CMAC comparison %s\n", r == 1 ? "successful." : "Failed!!"); + } end: CMAC_CTX_free(ctx); if (out) @@ -530,18 +779,15 @@ static int FIPS_cmac_aes256_test() out = OPENSSL_malloc(outlen); if (!CMAC_Final(ctx, out, &outlen)) goto end; -#if 0 - { - char *hexout = OPENSSL_malloc(outlen * 2 + 1); - bin2hex(out, outlen, hexout); - printf("CMAC-AES256: res = %s\n", hexout); - OPENSSL_free(hexout); - } - r = 1; -#else if (!memcmp(out,kaval,outlen)) r = 1; -#endif + if (verbose) + { + do_print_buf("CMAC key", key, sizeof(key)); + do_print_buf("CMAC input", data, sizeof(data) -1); + do_print_buf("CMAC output", out, outlen); + printf("CMAC comparison %s\n", r == 1 ? "successful." : "Failed!!"); + } end: CMAC_CTX_free(ctx); if (out) @@ -580,18 +826,15 @@ static int FIPS_cmac_tdea3_test() out = OPENSSL_malloc(outlen); if (!CMAC_Final(ctx, out, &outlen)) goto end; -#if 0 - { - char *hexout = OPENSSL_malloc(outlen * 2 + 1); - bin2hex(out, outlen, hexout); - printf("CMAC-TDEA3: res = %s\n", hexout); - OPENSSL_free(hexout); - } - r = 1; -#else if (!memcmp(out,kaval,outlen)) r = 1; -#endif + if (verbose) + { + do_print_buf("CMAC key", key, sizeof(key)); + do_print_buf("CMAC input", data, sizeof(data) -1); + do_print_buf("CMAC output", out, outlen); + printf("CMAC comparison %s\n", r == 1 ? "successful." 
: "Failed!!"); + } end: CMAC_CTX_free(ctx); if (out) @@ -647,9 +890,15 @@ static int Zeroize() for(i = 0; i < sizeof(userkey); i++) printf("%02x", userkey[i]); printf("\n"); RAND_bytes(userkey, sizeof userkey); - printf("\tchar buffer key after overwriting: \n\t\t"); + printf("\tchar buffer key after overwriting with random key: \n\t\t"); for(i = 0; i < sizeof(userkey); i++) printf("%02x", userkey[i]); printf("\n"); + OPENSSL_cleanse(userkey, sizeof(userkey)); + printf("\tchar buffer key after zeroization: \n\t\t"); + for(i = 0; i < sizeof(userkey); i++) printf("%02x", userkey[i]); + printf("\n"); + + FIPS_rsa_free(key); return 1; } @@ -668,6 +917,13 @@ static size_t drbg_test_cb(DRBG_CTX *ctx, unsigned char **pout, return (min_len + 0xf) & ~0xf; } +/* Callback which returns 0 to indicate entropy source failure */ +static size_t drbg_fail_cb(DRBG_CTX *ctx, unsigned char **pout, + int entropy, size_t min_len, size_t max_len) + { + return 0; + } + /* DRBG test: just generate lots of data and trigger health checks */ static int do_drbg_test(int type, int flags) @@ -696,7 +952,7 @@ static int do_drbg_test(int type, int flags) } rv = 1; err: - FIPS_drbg_uninstantiate(dctx); + FIPS_drbg_free(dctx); return rv; } @@ -758,9 +1014,13 @@ static const char * Fail(const char *msg) return msg; } -static void test_msg(const char *msg, int result) - { - printf("%s...%s\n", msg, result ? "successful" : Fail("Failed!")); +#define test_msg(msg, rtest) \ + { \ + int rv; \ + if (verbose) \ + printf("%s...started\n", msg); \ + rv = rtest; \ + printf("%s...%s\n", msg, rv ? 
"successful" : Fail("Failed!")); \ } /* Table of IDs for POST translating between NIDs and names */ @@ -821,12 +1081,17 @@ static const char *lookup_id(int id) static int fail_id = -1; static int fail_sub = -1; static int fail_key = -1; +static int sub_num = -1, sub_count = -1; +static int sub_fail_num = -1; + +static int st_err, post_quiet = 0; static int post_cb(int op, int id, int subid, void *ex) { const char *idstr, *exstr = ""; - char asctmp[20]; + char asctmp[20], teststr[80]; int keytype = -1; + int exp_fail = 0; #ifdef FIPS_POST_TIME static struct timespec start, end, tstart, tend; #endif @@ -938,6 +1203,21 @@ static int post_cb(int op, int id, int subid, void *ex) } + if (fail_id == id + && (fail_key == -1 || fail_key == keytype) + && (fail_sub == -1 || fail_sub == subid)) + exp_fail = 1; + + if (sub_num > 0) + { + if (sub_fail_num == sub_num) + exp_fail = 1; + sprintf(teststr, "\t\t%s %s (POST subtest #%d) test", + idstr, exstr, sub_num); + } + else + sprintf(teststr, "\t\t%s %s test", idstr, exstr); + switch(op) { case FIPS_POST_BEGIN: @@ -948,9 +1228,16 @@ static int post_cb(int op, int id, int subid, void *ex) clock_gettime(CLOCK_REALTIME, &tstart); #endif printf("\tPOST started\n"); + sub_num = 1; break; case FIPS_POST_END: + if (sub_count == -1) + sub_count = sub_num; + else if (sub_num != sub_count) + printf("Inconsistent POST count %d != %d\n", + sub_num, sub_count); + sub_num = -1; printf("\tPOST %s\n", id ? 
"Success" : "Failed"); #ifdef FIPS_POST_TIME clock_gettime(CLOCK_REALTIME, &tend); @@ -961,14 +1248,23 @@ static int post_cb(int op, int id, int subid, void *ex) break; case FIPS_POST_STARTED: - printf("\t\t%s %s test started\n", idstr, exstr); + if (!post_quiet && !exp_fail) + printf("%s started\n", teststr); #ifdef FIPS_POST_TIME clock_gettime(CLOCK_REALTIME, &start); #endif break; case FIPS_POST_SUCCESS: - printf("\t\t%s %s test OK\n", idstr, exstr); + if (sub_num > 0) + sub_num++; + if (exp_fail) + { + printf("%s OK but should've failed\n", teststr); + st_err++; + } + else if (!post_quiet) + printf("%s OK\n", teststr); #ifdef FIPS_POST_TIME clock_gettime(CLOCK_REALTIME, &end); printf("\t\t\tTook %f seconds\n", @@ -978,15 +1274,21 @@ static int post_cb(int op, int id, int subid, void *ex) break; case FIPS_POST_FAIL: - printf("\t\t%s %s test FAILED!!\n", idstr, exstr); + if (sub_num > 0) + sub_num++; + if (exp_fail) + printf("%s failed as expected\n", teststr); + else + { + printf("%s Failed Incorrectly!!\n", teststr); + st_err++; + } break; case FIPS_POST_CORRUPT: - if (fail_id == id - && (fail_key == -1 || fail_key == keytype) - && (fail_sub == -1 || fail_sub == subid)) + if (exp_fail) { - printf("\t\t%s %s test failure induced\n", idstr, exstr); + printf("%s failure induced\n", teststr); return 0; } break; @@ -995,110 +1297,433 @@ static int post_cb(int op, int id, int subid, void *ex) return 1; } -int main(int argc,char **argv) +static int do_fail_all(int fullpost, int fullerr) + { + int rv; + size_t i; + int sub_fail; + RSA *rsa = NULL; + DSA *dsa = NULL; + DRBG_CTX *dctx = NULL, *defctx = NULL; + EC_KEY *ec = NULL; + BIGNUM *bn = NULL; + unsigned char key[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + EVP_CIPHER_CTX ctx; + unsigned char out[10]; + if (!fullpost) + post_quiet = 1; + if (!fullerr) + no_err = 1; + fips_module_mode_set_verbose(0, NULL); + for (sub_fail = 1; sub_fail < sub_count; sub_fail++) + { + sub_fail_num = sub_fail; + printf(" Testing 
induced failure of POST subtest %d\n", + sub_fail); + rv = fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS); + if (rv) + { + printf("\tFIPS mode incorrectly successful!!\n"); + st_err++; + } + printf("\tAttempting crypto operation after failed POST... "); + FIPS_cipher_ctx_init(&ctx); + rv = FIPS_cipherinit(&ctx, EVP_aes_128_ecb(), key, NULL, 1); + if (rv > 0) + { + printf("succeeded incorrectly!!\n"); + st_err++; + } + else + printf("failed as expected.\n"); + FIPS_cipher_ctx_cleanup(&ctx); + } + sub_fail_num = -1; + printf(" Testing induced failure of RSA keygen test\n"); + /* NB POST will succeed with a pairwise test failures as + * it is not used during POST. + */ + fail_id = FIPS_TEST_PAIRWISE; + fail_key = EVP_PKEY_RSA; + /* Now enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + + rsa = FIPS_rsa_new(); + bn = BN_new(); + if (!rsa || !bn) + return 0; + BN_set_word(bn, 65537); + if (RSA_generate_key_ex(rsa, 2048,bn,NULL)) + { + printf("\tRSA key generated OK incorrectly!!\n"); + st_err++; + } + else + printf("\tRSA key generation failed as expected.\n"); + + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + + printf(" Testing induced failure of DSA keygen test\n"); + fail_key = EVP_PKEY_DSA; + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + dsa = FIPS_dsa_new(); + if (!dsa) + return 0; + if (!DSA_generate_parameters_ex(dsa, 1024,NULL,0,NULL,NULL,NULL)) + return 0; + if (DSA_generate_key(dsa)) + { + printf("\tDSA key generated OK incorrectly!!\n"); + st_err++; + } + else + printf("\tDSA key generation failed as expected.\n"); + + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + 
printf("\tError entering FIPS mode\n"); + st_err++; + } + + printf(" Testing induced failure of ECDSA keygen test\n"); + fail_key = EVP_PKEY_EC; + + ec = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1); + + if (!ec) + return 0; + + if (EC_KEY_generate_key(ec)) + { + printf("\tECDSA key generated OK incorrectly!!\n"); + st_err++; + } + else + printf("\tECDSA key generation failed as expected.\n"); + + FIPS_ec_key_free(ec); + ec = NULL; + + fail_id = -1; + fail_sub = -1; + fail_key = -1; + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + /* Induce continuous PRNG failure for DRBG */ + printf(" Testing induced failure of DRBG CPRNG test\n"); + FIPS_drbg_stick(1); + + /* Initialise a DRBG context */ + dctx = FIPS_drbg_new(NID_sha1, 0); + if (!dctx) + return 0; + for (i = 0; i < sizeof(dummy_drbg_entropy); i++) + { + dummy_drbg_entropy[i] = i & 0xff; + } + FIPS_drbg_set_callbacks(dctx, drbg_test_cb, 0, 0x10, drbg_test_cb, 0); + if (!FIPS_drbg_instantiate(dctx, dummy_drbg_entropy, 10)) + { + printf("\tDRBG instantiate error!!\n"); + st_err++; + } + if (FIPS_drbg_generate(dctx, out, sizeof(out), 0, NULL, 0)) + { + printf("\tDRBG continuous PRNG OK incorrectly!!\n"); + st_err++; + } + else + printf("\tDRBG continuous PRNG failed as expected\n"); + FIPS_drbg_stick(0); + + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + + FIPS_drbg_free(dctx); + + /* Induce continuous PRNG failure for DRBG entropy source*/ + printf(" Testing induced failure of DRBG entropy CPRNG test\n"); + + /* Initialise a DRBG context */ + dctx = FIPS_drbg_new(NID_sha1, 0); + if (!dctx) + return 0; + for (i = 0; i < 
sizeof(dummy_drbg_entropy); i++) + { + dummy_drbg_entropy[i] = i & 0xf; + } + FIPS_drbg_set_callbacks(dctx, drbg_test_cb, 0, 0x10, drbg_test_cb, 0); + if (FIPS_drbg_instantiate(dctx, dummy_drbg_entropy, 10)) + { + printf("\tDRBG continuous PRNG entropy OK incorrectly!!\n"); + st_err++; + } + else + printf("\tDRBG continuous PRNG entropy failed as expected\n"); + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + FIPS_drbg_free(dctx); + + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + + printf(" Testing induced failure of X9.31 CPRNG test\n"); + FIPS_x931_stick(1); + if (!FIPS_x931_set_key(dummy_drbg_entropy, 32)) + { + printf("\tError initialiasing X9.31 PRNG\n"); + st_err++; + } + if (!FIPS_x931_seed(dummy_drbg_entropy + 32, 16)) + { + printf("\tError seeding X9.31 PRNG\n"); + st_err++; + } + if (FIPS_x931_bytes(out, 10) > 0) + { + printf("\tX9.31 continuous PRNG failure OK incorrectly!!\n"); + st_err++; + } + else + printf("\tX9.31 continuous PRNG failed as expected\n"); + FIPS_x931_stick(0); + + /* Leave FIPS mode to clear error */ + fips_module_mode_set_verbose(0, NULL); + /* Enter FIPS mode successfully */ + if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS)) + { + printf("\tError entering FIPS mode\n"); + st_err++; + } + + printf(" Testing operation failure with DRBG entropy failure\n"); + + /* Generate DSA key for later use */ + if (DSA_generate_key(dsa)) + printf("\tDSA key generated OK as expected.\n"); + else + { + printf("\tDSA key generation FAILED!!\n"); + st_err++; + } + + /* Initialise default DRBG context */ + defctx = FIPS_get_default_drbg(); + if (!defctx) + return 
0; + if (!FIPS_drbg_init(defctx, NID_sha512, 0)) + return 0; + /* Set entropy failure callback */ + FIPS_drbg_set_callbacks(defctx, drbg_fail_cb, 0, 0x10, drbg_test_cb, 0); + if (FIPS_drbg_instantiate(defctx, dummy_drbg_entropy, 10)) + { + printf("\tDRBG entropy fail OK incorrectly!!\n"); + st_err++; + } + else + printf("\tDRBG entropy fail failed as expected\n"); + + if (FIPS_dsa_sign(dsa, dummy_drbg_entropy, 5, EVP_sha256())) + { + printf("\tDSA signing OK incorrectly!!\n"); + st_err++; + } + else + printf("\tDSA signing failed as expected\n"); + + ec = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1); + + if (!ec) + return 0; + + if (EC_KEY_generate_key(ec)) + { + printf("\tECDSA key generated OK incorrectly!!\n"); + st_err++; + } + else + printf("\tECDSA key generation failed as expected.\n"); + + printf(" Induced failure test completed with %d errors\n", st_err); + post_quiet = 0; + no_err = 0; + BN_free(bn); + FIPS_rsa_free(rsa); + FIPS_dsa_free(dsa); + FIPS_ec_key_free(ec); + if (st_err) + return 0; + return 1; + } + +#ifdef FIPS_ALGVS +int fips_test_suite_main(int argc, char **argv) +#else +int main(int argc, char **argv) +#endif { + char **args = argv + 1; int bad_rsa = 0, bad_dsa = 0; int do_rng_stick = 0; int do_drbg_stick = 0; int no_exit = 0; - int no_dh = 0; + int no_dh = 0, no_drbg = 0; char *pass = FIPS_AUTH_USER_PASS; + int fullpost = 0, fullerr = 0; FIPS_post_set_callback(post_cb); +#if (defined(__arm__) || defined(__aarch64__)) + extern unsigned int OPENSSL_armcap_P; + if (0 == OPENSSL_armcap_P) + fprintf(stderr, "Optimizations disabled\n"); +#endif + printf("\tFIPS-mode test application\n"); printf("\t%s\n\n", FIPS_module_version_text()); - if (argv[1]) { + while(*args) { /* Corrupted KAT tests */ - if (!strcmp(argv[1], "integrity")) { + if (!strcmp(*args, "integrity")) { fail_id = FIPS_TEST_INTEGRITY; - } else if (!strcmp(argv[1], "aes")) { + } else if (!strcmp(*args, "aes")) { fail_id = FIPS_TEST_CIPHER; fail_sub = NID_aes_128_ecb; - } else if 
(!strcmp(argv[1], "aes-ccm")) { + } else if (!strcmp(*args, "aes-ccm")) { fail_id = FIPS_TEST_CCM; - } else if (!strcmp(argv[1], "aes-gcm")) { + } else if (!strcmp(*args, "aes-gcm")) { fail_id = FIPS_TEST_GCM; - } else if (!strcmp(argv[1], "aes-xts")) { + } else if (!strcmp(*args, "aes-xts")) { fail_id = FIPS_TEST_XTS; - } else if (!strcmp(argv[1], "des")) { + } else if (!strcmp(*args, "des")) { fail_id = FIPS_TEST_CIPHER; fail_sub = NID_des_ede3_ecb; - } else if (!strcmp(argv[1], "dsa")) { + } else if (!strcmp(*args, "dsa")) { fail_id = FIPS_TEST_SIGNATURE; fail_key = EVP_PKEY_DSA; } else if (!strcmp(argv[1], "ecdh")) { fail_id = FIPS_TEST_ECDH; - } else if (!strcmp(argv[1], "ecdsa")) { + } else if (!strcmp(*args, "ecdsa")) { fail_id = FIPS_TEST_SIGNATURE; fail_key = EVP_PKEY_EC; - } else if (!strcmp(argv[1], "rsa")) { + } else if (!strcmp(*args, "rsa")) { fail_id = FIPS_TEST_SIGNATURE; fail_key = EVP_PKEY_RSA; - } else if (!strcmp(argv[1], "rsakey")) { + } else if (!strcmp(*args, "rsakey")) { printf("RSA key generation and signature validation with corrupted key...\n"); bad_rsa = 1; no_exit = 1; - } else if (!strcmp(argv[1], "rsakeygen")) { + } else if (!strcmp(*args, "rsakeygen")) { fail_id = FIPS_TEST_PAIRWISE; fail_key = EVP_PKEY_RSA; no_exit = 1; - } else if (!strcmp(argv[1], "dsakey")) { + } else if (!strcmp(*args, "dsakey")) { printf("DSA key generation and signature validation with corrupted key...\n"); bad_dsa = 1; no_exit = 1; - } else if (!strcmp(argv[1], "dsakeygen")) { + } else if (!strcmp(*args, "dsakeygen")) { fail_id = FIPS_TEST_PAIRWISE; fail_key = EVP_PKEY_DSA; no_exit = 1; - } else if (!strcmp(argv[1], "sha1")) { + } else if (!strcmp(*args, "sha1")) { fail_id = FIPS_TEST_DIGEST; - } else if (!strcmp(argv[1], "hmac")) { + } else if (!strcmp(*args, "hmac")) { fail_id = FIPS_TEST_HMAC; - } else if (!strcmp(argv[1], "cmac")) { + } else if (!strcmp(*args, "cmac")) { fail_id = FIPS_TEST_CMAC; - } else if (!strcmp(argv[1], "drbg")) { + } else if 
(!strcmp(*args, "drbg")) { fail_id = FIPS_TEST_DRBG; } else if (!strcmp(argv[1], "rng")) { fail_id = FIPS_TEST_X931; - } else if (!strcmp(argv[1], "nodh")) { + } else if (!strcmp(*args, "nodrbg")) { + no_drbg = 1; + no_exit = 1; + } else if (!strcmp(*args, "nodh")) { no_dh = 1; no_exit = 1; - } else if (!strcmp(argv[1], "post")) { + } else if (!strcmp(*args, "post")) { fail_id = -1; - } else if (!strcmp(argv[1], "rngstick")) { + } else if (!strcmp(*args, "rngstick")) { do_rng_stick = 1; no_exit = 1; printf("RNG test with stuck continuous test...\n"); - } else if (!strcmp(argv[1], "drbgentstick")) { + } else if (!strcmp(*args, "drbgentstick")) { do_entropy_stick(); - } else if (!strcmp(argv[1], "drbgstick")) { + } else if (!strcmp(*args, "drbgstick")) { do_drbg_stick = 1; no_exit = 1; printf("DRBG test with stuck continuous test...\n"); - } else if (!strcmp(argv[1], "user")) { + } else if (!strcmp(*args, "user")) { pass = FIPS_AUTH_USER_PASS; - } else if (!strcmp(argv[1], "officer")) { + } else if (!strcmp(*args, "officer")) { pass = FIPS_AUTH_OFFICER_PASS; - } else if (!strcmp(argv[1], "badpass")) { + } else if (!strcmp(*args, "badpass")) { pass = "bad invalid password"; - } else if (!strcmp(argv[1], "nopass")) { + } else if (!strcmp(*args, "nopass")) { pass = ""; + } else if (!strcmp(*args, "fullpost")) { + fullpost = 1; + no_exit = 1; + } else if (!strcmp(*args, "fullerr")) { + fullerr = 1; + no_exit = 1; + } else if (!strcmp(*args, "verbose")) { + verbose = 1; + no_exit = 1; } else { - printf("Bad argument \"%s\"\n", argv[1]); - exit(1); + printf("Bad argument \"%s\"\n", *args); + return 1; } - if (!no_exit) { + args++; + } + + if ((argc != 1) && !no_exit) { fips_algtest_init_nofips(); - if (!FIPS_module_mode_set(1, pass)) { + if (!fips_module_mode_set_verbose(1, pass)) { printf("Power-up self test failed\n"); - exit(1); + return 1; } printf("Power-up self test successful\n"); - exit(0); - } + return 0; } fips_algtest_init_nofips(); @@ -1114,13 +1739,15 @@ int 
main(int argc,char **argv) /* Power-up self test */ ERR_clear_error(); - test_msg("2. Automatic power-up self test", FIPS_module_mode_set(1, pass)); + test_msg("2a. Automatic power-up self test", fips_module_mode_set_verbose(1, pass)); if (!FIPS_module_mode()) - exit(1); + return 1; if (do_drbg_stick) - FIPS_drbg_stick(); + FIPS_drbg_stick(1); if (do_rng_stick) - FIPS_x931_stick(); + FIPS_x931_stick(1); + + test_msg("2b. On demand self test", FIPS_selftest()); /* AES encryption/decryption */ @@ -1216,9 +1843,18 @@ int main(int argc,char **argv) : Fail("failed INCORRECTLY!") ); printf("12. DRBG generation check...\n"); - printf("\t%s\n", do_drbg_all() ? "successful as expected" + if (no_drbg) + printf("\tskipped\n"); + else + printf("\t%s\n", do_drbg_all() ? "successful as expected" : Fail("failed INCORRECTLY!") ); + test_msg("13. ECDSA key generation and signature validation", + FIPS_ecdsa_test()); + + printf("14. Induced test failure check...\n"); + printf("\t%s\n", do_fail_all(fullpost, fullerr) ? "successful as expected" + : Fail("failed INCORRECTLY!") ); printf("\nAll tests completed with %d errors\n", Error); return Error ? 
1 : 0; } diff --git a/fips/fips_utl.h b/fips/fips_utl.h index 1ed133c5c9..491bc2ace9 100644 --- a/fips/fips_utl.h +++ b/fips/fips_utl.h @@ -47,6 +47,9 @@ * */ +#ifndef FIPS_UTL_H +#define FIPS_UTL_H + #define OPENSSL_FIPSAPI #include @@ -487,3 +490,5 @@ int fips_strcasecmp(const char *str1, const char *str2) return fips_strncasecmp(str1, str2, (size_t)-1); } + +#endif diff --git a/fips/fipsalgtest.pl b/fips/fipsalgtest.pl index cd6ba8c116..2e31335ae9 100644 --- a/fips/fipsalgtest.pl +++ b/fips/fipsalgtest.pl @@ -495,6 +495,7 @@ my $onedir = 0; my $filter = ""; my $tvdir; my $tprefix; +my $sfprefix = ""; my $debug = 0; my $quiet = 0; my $notest = 0; @@ -513,29 +514,29 @@ my $mkcmd = "mkdir"; my $cmpall = 0; my %fips_enabled = ( - dsa => 1, - dsa2 => 2, + "dsa" => 1, + "dsa2" => 2, "dsa-pqgver" => 2, - ecdsa => 2, - rsa => 1, + "ecdsa" => 2, + "rsa" => 1, "rsa-pss0" => 2, "rsa-pss62" => 1, - sha => 1, - hmac => 1, - cmac => 2, + "sha" => 1, + "hmac" => 1, + "cmac" => 2, "rand-aes" => 1, "rand-des2" => 0, - aes => 1, + "aes" => 1, "aes-cfb1" => 2, - des3 => 1, + "des3" => 1, "des3-cfb1" => 2, - drbg => 2, + "drbg" => 2, "aes-ccm" => 2, "aes-xts" => 2, "aes-gcm" => 2, - dh => 0, - ecdh => 2, - v2 => 1, + "dh" => 0, + "ecdh" => 2, + "v2" => 1, ); foreach (@ARGV) { @@ -615,6 +616,9 @@ foreach (@ARGV) { elsif (/--script-tprefix=(.*)$/) { $stprefix = $1; } + elsif (/--script-fprefix=(.*)$/) { + $sfprefix = $1; + } elsif (/--mkdir=(.*)$/) { $mkcmd = $1; } @@ -1017,6 +1021,10 @@ END $out =~ s|/req/(\S+)\.req|/$rspdir/$1.rsp|; my $outdir = $out; $outdir =~ s|/[^/]*$||; + if ( !-d $outdir && ($outfile eq "" || $minimal_script)) { + print STDERR "DEBUG: Creating directory $outdir\n" if $debug; + mkdir($outdir) || die "Can't create directory $outdir"; + } if ($outfile ne "") { if ($win32) { $outdir =~ tr|/|\\|; @@ -1039,12 +1047,9 @@ END } $lastdir = $outdir; } - } elsif ( !-d $outdir ) { - print STDERR "DEBUG: Creating directory $outdir\n" if $debug; - mkdir($outdir) || die 
"Can't create directory $outdir"; } } - my $cmd = "$tcmd \"$req\" \"$out\""; + my $cmd = "$tcmd \"$sfprefix$req\" \"$sfprefix$out\""; print STDERR "DEBUG: running test $tname\n" if ( $debug && !$verify ); if ($outfile ne "") { if ($minimal_script) { diff --git a/fips/fipsld b/fips/fipsld index 6184e2064e..62565fd032 100755 --- a/fips/fipsld +++ b/fips/fipsld @@ -1,6 +1,6 @@ #!/bin/sh -e # -# Copyright (c) 2005-2007 The OpenSSL Project. +# Copyright (c) 2005-2011 The OpenSSL Project. # # Depending on output file name, the script either embeds fingerprint # into libcrypto.so or static application. "Static" refers to static @@ -127,12 +127,15 @@ lib*|*.dll) # must be linking a shared lib... "${PREMAIN_C}" \ ${_WL_PREMAIN} "$@" - # generate signature... - if [ -z "${FIPS_SIG}" ]; then - SIG=`"${PREMAIN_DSO}" "${TARGET}"` - else - SIG=`"${FIPS_SIG}" -dso "${TARGET}"` + if [ "x${FIPS_SIG}" != "x" ]; then + # embed signature + "${FIPS_SIG}" "${TARGET}" + [ $? -ne 42 ] && exit $? fi + + # generate signature... + SIG=`"${PREMAIN_DSO}" "${TARGET}"` + /bin/rm -f "${TARGET}" if [ -z "${SIG}" ]; then echo "unable to collect signature"; exit 1 @@ -172,12 +175,15 @@ lib*|*.dll) # must be linking a shared lib... "${PREMAIN_C}" \ ${_WL_PREMAIN} "$@" - # generate signature... - if [ -z "${FIPS_SIG}" ]; then - SIG=`"${TARGET}"` - else - SIG=`"${FIPS_SIG}" -exe "${TARGET}"` + if [ "x${FIPS_SIG}" != "x" ]; then + # embed signature + "${FIPS_SIG}" "${TARGET}" + [ $? -ne 42 ] && exit $? fi + + # generate signature... 
+ SIG=`"${TARGET}"` + /bin/rm -f "${TARGET}" if [ -z "${SIG}" ]; then echo "unable to collect signature"; exit 1 diff --git a/fips/fipssyms.h b/fips/fipssyms.h index 5b1e188785..8f04eb9dcf 100644 --- a/fips/fipssyms.h +++ b/fips/fipssyms.h @@ -589,6 +589,7 @@ #define AES_encrypt fips_aes_encrypt #define AES_set_decrypt_key fips_aes_set_decrypt_key #define AES_set_encrypt_key fips_aes_set_encrypt_key +#define AES_ctr32_encrypt fips_aes_ctr32_encrypt #define BN_from_montgomery fips_bn_from_montgomery #define BN_num_bits_word FIPS_bn_num_bits_word #define DES_SPtrans fips_des_sptrans @@ -667,6 +668,67 @@ #define bn_mul_mont_gather5 fips_bn_mul_mont_gather5 #define bn_scatter5 fips_bn_scatter5 #define bn_gather5 fips_bn_gather5 +#define _armv8_aes_probe _fips_armv8_aes_probe +#define _armv8_pmull_probe _fips_armv8_pmull_probe +#define _armv8_sha1_probe _fips_armv8_sha1_probe +#define _armv8_sha256_probe _fips_armv8_sha256_probe +#define aes_v8_encrypt fips_aes_v8_encrypt +#define aes_v8_decrypt fips_aes_v8_decrypt +#define aes_v8_set_encrypt_key fips_aes_v8_set_encrypt_key +#define aes_v8_set_decrypt_key fips_aes_v8_set_decrypt_key +#define aes_v8_cbc_encrypt fips_aes_v8_cbc_encrypt +#define aes_v8_ctr32_encrypt_blocks fips_aes_v8_ctr32_encrypt_blocks +#define gcm_init_v8 fips_gcm_init_v8 +#define gcm_gmult_v8 fips_gcm_gmult_v8 +#define gcm_ghash_v8 fips_gcm_ghash_v8 +#if defined(__APPLE__) && __ASSEMBLER__ +#define _OPENSSL_armcap_P _fips_openssl_armcap_P +#define __armv7_neon_probe __fips_armv7_neon_probe +#define __armv7_tick __fips_armv7_tick +#define __armv8_aes_probe __fips_armv8_aes_probe +#define __armv8_pmull_probe __fips_armv8_pmull_probe +#define __armv8_sha1_probe __fips_armv8_sha1_probe +#define __armv8_sha256_probe __fips_armv8_sha256_probe +#define _aes_v8_encrypt _fips_aes_v8_encrypt +#define _aes_v8_decrypt _fips_aes_v8_decrypt +#define _aes_v8_set_encrypt_key _fips_aes_v8_set_encrypt_key +#define _aes_v8_set_decrypt_key _fips_aes_v8_set_decrypt_key 
+#define _aes_v8_cbc_encrypt _fips_aes_v8_cbc_encrypt +#define _aes_v8_ctr32_encrypt_blocks _fips_aes_v8_ctr32_encrypt_blocks +#define _gcm_init_v8 _fips_gcm_init_v8 +#define _gcm_gmult_v8 _fips_gcm_gmult_v8 +#define _gcm_ghash_v8 _fips_gcm_ghash_v8 +#define _sha1_block_data_order _fips_sha1_block_data_order +#define _sha256_block_data_order _fips_sha256_block_data_order +#define _sha512_block_data_order _fips_sha512_block_data_order +#define _AES_decrypt _fips_aes_decrypt +#define _AES_encrypt _fips_aes_encrypt +#define _AES_set_decrypt_key _fips_aes_set_decrypt_key +#define _AES_set_encrypt_key _fips_aes_set_encrypt_key +#define _gcm_gmult_4bit _fips_gcm_gmult_4bit +#define _gcm_ghash_4bit _fips_gcm_ghash_4bit +#define _gcm_gmult_neon _fips_gcm_gmult_neon +#define _gcm_ghash_neon _fips_gcm_ghash_neon +#define _bn_GF2m_mul_2x2 _fips_bn_GF2m_mul_2x2 +#define _OPENSSL_cleanse _FIPS_openssl_cleanse +#endif +#define aes_p8_encrypt fips_aes_p8_encrypt +#define aes_p8_decrypt fips_aes_p8_decrypt +#define aes_p8_set_encrypt_key fips_aes_p8_set_encrypt_key +#define aes_p8_set_decrypt_key fips_aes_p8_set_decrypt_key +#define aes_p8_cbc_encrypt fips_aes_p8_cbc_encrypt +#define aes_p8_ctr32_encrypt_blocks fips_aes_p8_ctr32_encrypt_blocks +#define aes_p8_xts_encrypt fips_aes_p8_xts_encrypt +#define aes_p8_xts_decrypt fips_aes_p8_xts_decrypt +#define gcm_init_p8 fips_gcm_init_p8 +#define gcm_gmult_p8 fips_gcm_gmult_p8 +#define gcm_ghash_p8 fips_gcm_ghash_p8 +#define sha256_block_p8 fips_sha256_block_p8 +#define sha512_block_p8 fips_sha512_block_p8 +#define sha256_block_ppc fips_sha256_block_ppc +#define sha512_block_ppc fips_sha512_block_ppc +#define OPENSSL_ppccap_P fips_openssl_ppccap_p +#define OPENSSL_crypto207_probe fips_openssl_crypto207_probe #if defined(_MSC_VER) # pragma const_seg("fipsro$b") diff --git a/fips/hmac/fips_hmactest.c b/fips/hmac/fips_hmactest.c index 07c18bfdfa..da9c8d7926 100644 --- a/fips/hmac/fips_hmactest.c +++ b/fips/hmac/fips_hmactest.c @@ -85,7 
+85,11 @@ static int print_hmac(const EVP_MD *md, FILE *out, unsigned char *Key, int Klen, unsigned char *Msg, int Msglen, int Tlen); +#ifdef FIPS_ALGVS +int fips_hmactest_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; diff --git a/fips/rand/fips_drbg_lib.c b/fips/rand/fips_drbg_lib.c index 1596977fd5..ee162d05eb 100644 --- a/fips/rand/fips_drbg_lib.c +++ b/fips/rand/fips_drbg_lib.c @@ -154,6 +154,8 @@ static size_t fips_get_entropy(DRBG_CTX *dctx, unsigned char **pout, { unsigned char *tout, *p; size_t bl = dctx->entropy_blocklen, rv; + if (!dctx->get_entropy) + return 0; if (dctx->xflags & DRBG_FLAG_TEST || !bl) return dctx->get_entropy(dctx, pout, entropy, min_len, max_len); rv = dctx->get_entropy(dctx, &tout, entropy + bl, @@ -241,7 +243,7 @@ int FIPS_drbg_instantiate(DRBG_CTX *dctx, goto end; } - if (dctx->max_nonce > 0) + if (dctx->max_nonce > 0 && dctx->get_nonce) { noncelen = dctx->get_nonce(dctx, &nonce, dctx->strength / 2, @@ -544,9 +546,9 @@ void FIPS_drbg_set_reseed_interval(DRBG_CTX *dctx, int interval) static int drbg_stick = 0; -void FIPS_drbg_stick(void) +void FIPS_drbg_stick(int onoff) { - drbg_stick = 1; + drbg_stick = onoff; } /* Continuous DRBG utility function */ diff --git a/fips/rand/fips_drbg_selftest.c b/fips/rand/fips_drbg_selftest.c index ee0561bcbe..a787323d6d 100644 --- a/fips/rand/fips_drbg_selftest.c +++ b/fips/rand/fips_drbg_selftest.c @@ -582,7 +582,6 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td) } dctx->iflags &= ~DRBG_FLAG_NOERR; - if (!FIPS_drbg_uninstantiate(dctx)) { FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_UNINSTANTIATE_ERROR); @@ -617,28 +616,20 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td) goto err; } - /* Explicit reseed tests */ - - /* Test explicit reseed with too large additional input */ - if (!do_drbg_init(dctx, td, &t)) - goto err; - - dctx->iflags |= DRBG_FLAG_NOERR; - - if 
(FIPS_drbg_reseed(dctx, td->adin, dctx->max_adin + 1) > 0) + dctx->iflags &= ~DRBG_FLAG_NOERR; + if (!FIPS_drbg_uninstantiate(dctx)) { - FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_ADDITIONAL_INPUT_ERROR_UNDETECTED); + FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_UNINSTANTIATE_ERROR); goto err; } - /* Test explicit reseed with entropy source failure */ - /* Check prediction resistance request fails if entropy source * failure. */ t.entlen = 0; + dctx->iflags |= DRBG_FLAG_NOERR; if (FIPS_drbg_generate(dctx, randout, td->katlen, 1, td->adin, td->adinlen)) { @@ -680,6 +671,13 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td) goto err; } + dctx->iflags &= ~DRBG_FLAG_NOERR; + if (!FIPS_drbg_uninstantiate(dctx)) + { + FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_UNINSTANTIATE_ERROR); + goto err; + } + /* Explicit reseed tests */ /* Test explicit reseed with too large additional input */ @@ -696,11 +694,6 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td) /* Test explicit reseed with entropy source failure */ - if (!do_drbg_init(dctx, td, &t)) - goto err; - - dctx->iflags |= DRBG_FLAG_NOERR; - t.entlen = 0; if (FIPS_drbg_reseed(dctx, td->adin, td->adinlen) > 0) diff --git a/fips/rand/fips_drbgvs.c b/fips/rand/fips_drbgvs.c index 4d3f0cfee0..214e3c340a 100644 --- a/fips/rand/fips_drbgvs.c +++ b/fips/rand/fips_drbgvs.c @@ -76,7 +76,7 @@ int main(int argc, char **argv) #include "fips_utl.h" -static int parse_md(char *str) +static int dparse_md(char *str) { switch(atoi(str + 5)) { @@ -115,7 +115,7 @@ static int parse_ec(char *str) curve_nid = NID_secp521r1; else return NID_undef; - md_nid = parse_md(md); + md_nid = dparse_md(md); if (md_nid == NID_undef) return NID_undef; return (curve_nid << 16) | md_nid; @@ -170,17 +170,19 @@ static size_t test_nonce(DRBG_CTX *dctx, unsigned char **pout, return t->noncelen; } - - +#ifdef FIPS_ALGVS +int fips_drbgvs_main(int argc,char **argv) +#else int main(int argc,char **argv) +#endif { 
- FILE *in, *out; + FILE *in = NULL, *out = NULL; DRBG_CTX *dctx = NULL; TEST_ENT t; int r, nid = 0; int pr = 0; char buf[2048], lbuf[2048]; - unsigned char randout[2048]; + unsigned char *randout = NULL; char *keyword = NULL, *value = NULL; unsigned char *ent = NULL, *nonce = NULL, *pers = NULL, *adin = NULL; @@ -240,7 +242,7 @@ int main(int argc,char **argv) } if (strlen(buf) > 4 && !strncmp(buf, "[SHA-", 5)) { - nid = parse_md(buf); + nid = dparse_md(buf); if (nid == NID_undef) exit(1); if (drbg_type == DRBG_HMAC) @@ -296,6 +298,8 @@ int main(int argc,char **argv) else exit(1); } + if (!strcmp(keyword, "[ReturnedBitsLen")) + randoutlen = atoi(value) / 8; if (!strcmp(keyword, "EntropyInput")) { @@ -325,7 +329,11 @@ int main(int argc,char **argv) FIPS_drbg_set_callbacks(dctx, test_entropy, 0, 0, test_nonce, 0); FIPS_drbg_set_app_data(dctx, &t); - randoutlen = (int)FIPS_drbg_get_blocklength(dctx); + if (randoutlen == 0) + randoutlen = (int)FIPS_drbg_get_blocklength(dctx); + if (randout) + OPENSSL_free(randout); + randout = OPENSSL_malloc(randoutlen); r = FIPS_drbg_instantiate(dctx, pers, perslen); if (!r) { @@ -404,6 +412,12 @@ int main(int argc,char **argv) } } + if (randout) + OPENSSL_free(randout); + if (in && in != stdin) + fclose(in); + if (out && out != stdout) + fclose(out); return 0; } diff --git a/fips/rand/fips_rand.c b/fips/rand/fips_rand.c index cb9184e1f7..9904d8aa6f 100644 --- a/fips/rand/fips_rand.c +++ b/fips/rand/fips_rand.c @@ -66,7 +66,7 @@ #include #include #include -#if !(defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS)) +#if !(defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYSNAME_DSPBIOS)) # include #endif #if defined(OPENSSL_SYS_VXWORKS) @@ -114,9 +114,9 @@ static FIPS_PRNG_CTX sctx; static int fips_prng_fail = 0; -void FIPS_x931_stick(void) +void FIPS_x931_stick(int onoff) { - fips_prng_fail = 1; + fips_prng_fail = onoff; } static void fips_rand_prng_reset(FIPS_PRNG_CTX *ctx) @@ -232,8 +232,13 @@ 
void FIPS_get_timevec(unsigned char *buf, unsigned long *pctr) { #ifdef OPENSSL_SYS_WIN32 FILETIME ft; +#ifdef _WIN32_WCE + SYSTEMTIME t; +#endif #elif defined(OPENSSL_SYS_VXWORKS) struct timespec ts; +#elif defined(OPENSSL_SYSNAME_DSPBIOS) + unsigned long long TSC, OPENSSL_rdtsc(); #else struct timeval tv; #endif @@ -243,7 +248,12 @@ void FIPS_get_timevec(unsigned char *buf, unsigned long *pctr) #endif #ifdef OPENSSL_SYS_WIN32 +#ifdef _WIN32_WCE + GetSystemTime(&t); + SystemTimeToFileTime(&t, &ft); +#else GetSystemTimeAsFileTime(&ft); +#endif buf[0] = (unsigned char) (ft.dwHighDateTime & 0xff); buf[1] = (unsigned char) ((ft.dwHighDateTime >> 8) & 0xff); buf[2] = (unsigned char) ((ft.dwHighDateTime >> 16) & 0xff); @@ -262,6 +272,16 @@ void FIPS_get_timevec(unsigned char *buf, unsigned long *pctr) buf[5] = (unsigned char) ((ts.tv_nsec >> 8) & 0xff); buf[6] = (unsigned char) ((ts.tv_nsec >> 16) & 0xff); buf[7] = (unsigned char) ((ts.tv_nsec >> 24) & 0xff); +#elif defined(OPENSSL_SYSNAME_DSPBIOS) + TSC = OPENSSL_rdtsc(); + buf[0] = (unsigned char) (TSC & 0xff); + buf[1] = (unsigned char) ((TSC >> 8) & 0xff); + buf[2] = (unsigned char) ((TSC >> 16) & 0xff); + buf[3] = (unsigned char) ((TSC >> 24) & 0xff); + buf[4] = (unsigned char) ((TSC >> 32) & 0xff); + buf[5] = (unsigned char) ((TSC >> 40) & 0xff); + buf[6] = (unsigned char) ((TSC >> 48) & 0xff); + buf[7] = (unsigned char) ((TSC >> 56) & 0xff); #else gettimeofday(&tv,NULL); buf[0] = (unsigned char) (tv.tv_sec & 0xff); diff --git a/fips/rand/fips_rand_selftest.c b/fips/rand/fips_rand_selftest.c index bafce719ca..ec949cbdbb 100644 --- a/fips/rand/fips_rand_selftest.c +++ b/fips/rand/fips_rand_selftest.c @@ -129,15 +129,16 @@ static AES_PRNG_TV aes_256_tv = static int do_x931_test(unsigned char *key, int keylen, AES_PRNG_TV *tv) { - unsigned char R[16]; + unsigned char R[16], V[16]; int rv = 1; + memcpy(V, tv->V, sizeof(V)); if (!FIPS_x931_set_key(key, keylen)) return 0; if (!fips_post_started(FIPS_TEST_X931, keylen, 
NULL)) return 1; if (!fips_post_corrupt(FIPS_TEST_X931, keylen, NULL)) - tv->V[0]++; - FIPS_x931_seed(tv->V, 16); + V[0]++; + FIPS_x931_seed(V, 16); FIPS_x931_set_dt(tv->DT); FIPS_x931_bytes(R, 16); if (memcmp(R, tv->R, 16)) diff --git a/fips/rand/fips_rngvs.c b/fips/rand/fips_rngvs.c index ac0a526573..9e1f070602 100644 --- a/fips/rand/fips_rngvs.c +++ b/fips/rand/fips_rngvs.c @@ -198,7 +198,11 @@ static void mct(FILE *in, FILE *out) } } -int main(int argc,char **argv) +#ifdef FIPS_ALGVS +int fips_rngvs_main(int argc, char **argv) +#else +int main(int argc, char **argv) +#endif { FILE *in, *out; if (argc == 4) diff --git a/fips/rsa/fips_rsa_sign.c b/fips/rsa/fips_rsa_sign.c index 013333e0b4..4956971f04 100644 --- a/fips/rsa/fips_rsa_sign.c +++ b/fips/rsa/fips_rsa_sign.c @@ -288,8 +288,11 @@ int FIPS_rsa_sign_digest(RSA *rsa, const unsigned char *md, int md_len, *siglen=j; } psserr: - OPENSSL_cleanse(sbuf, i); - OPENSSL_free(sbuf); + if (sbuf) + { + OPENSSL_cleanse(sbuf, i); + OPENSSL_free(sbuf); + } return ret; } @@ -442,4 +445,33 @@ err: return(ret); } +int FIPS_rsa_sign(RSA *rsa, const unsigned char *msg, int msglen, + const EVP_MD *mhash, int rsa_pad_mode, int saltlen, + const EVP_MD *mgf1Hash, + unsigned char *sigret, unsigned int *siglen) + { + unsigned int md_len, rv; + unsigned char md[EVP_MAX_MD_SIZE]; + FIPS_digest(msg, msglen, md, &md_len, mhash); + rv = FIPS_rsa_sign_digest(rsa, md, md_len, mhash, rsa_pad_mode, + saltlen, mgf1Hash, sigret, siglen); + OPENSSL_cleanse(md, md_len); + return rv; + } + + +int FIPS_rsa_verify(RSA *rsa, const unsigned char *msg, int msglen, + const EVP_MD *mhash, int rsa_pad_mode, int saltlen, + const EVP_MD *mgf1Hash, + const unsigned char *sigbuf, unsigned int siglen) + { + unsigned int md_len, rv; + unsigned char md[EVP_MAX_MD_SIZE]; + FIPS_digest(msg, msglen, md, &md_len, mhash); + rv = FIPS_rsa_verify_digest(rsa, md, md_len, mhash, rsa_pad_mode, + saltlen, mgf1Hash, sigbuf, siglen); + OPENSSL_cleanse(md, md_len); + return 
rv; + } + #endif diff --git a/fips/rsa/fips_rsagtest.c b/fips/rsa/fips_rsagtest.c index 78b4531398..8342f615fb 100644 --- a/fips/rsa/fips_rsagtest.c +++ b/fips/rsa/fips_rsagtest.c @@ -88,7 +88,11 @@ static int rsa_printkey1(FILE *out, RSA *rsa, static int rsa_printkey2(FILE *out, RSA *rsa, BIGNUM *Xq1, BIGNUM *Xq2, BIGNUM *Xq); +#ifdef FIPS_ALGVS +int fips_rsagtest_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; diff --git a/fips/rsa/fips_rsastest.c b/fips/rsa/fips_rsastest.c index e0dbe2a0d7..a96f277e6a 100644 --- a/fips/rsa/fips_rsastest.c +++ b/fips/rsa/fips_rsastest.c @@ -85,7 +85,11 @@ static int rsa_stest(FILE *out, FILE *in, int Saltlen); static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst, unsigned char *Msg, long Msglen, int Saltlen); +#ifdef FIPS_ALGVS +int fips_rsastest_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; @@ -321,15 +325,12 @@ static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst, unsigned char *sigbuf = NULL; int i, siglen, pad_mode; /* EVP_PKEY structure */ - EVP_MD_CTX ctx; siglen = RSA_size(rsa); sigbuf = OPENSSL_malloc(siglen); if (!sigbuf) goto error; - FIPS_md_ctx_init(&ctx); - if (Saltlen >= 0) pad_mode = RSA_PKCS1_PSS_PADDING; else if (Saltlen == -2) @@ -337,16 +338,10 @@ static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst, else pad_mode = RSA_PKCS1_PADDING; - if (!FIPS_digestinit(&ctx, dgst)) - goto error; - if (!FIPS_digestupdate(&ctx, Msg, Msglen)) - goto error; - if (!FIPS_rsa_sign_ctx(rsa, &ctx, pad_mode, Saltlen, NULL, + if (!FIPS_rsa_sign(rsa, Msg, Msglen, dgst, pad_mode, Saltlen, NULL, sigbuf, (unsigned int *)&siglen)) goto error; - FIPS_md_ctx_cleanup(&ctx); - fputs("S = ", out); for (i = 0; i < siglen; i++) @@ -358,6 +353,9 @@ static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst, error: + if (sigbuf) + OPENSSL_free(sigbuf); + return ret; } #endif diff --git 
a/fips/rsa/fips_rsavtest.c b/fips/rsa/fips_rsavtest.c index df33842691..9bfc5e688b 100644 --- a/fips/rsa/fips_rsavtest.c +++ b/fips/rsa/fips_rsavtest.c @@ -82,14 +82,18 @@ int main(int argc, char *argv[]) #include "fips_utl.h" -int rsa_test(FILE *out, FILE *in, int saltlen); +int rsa_vtest(FILE *out, FILE *in, int saltlen); static int rsa_printver(FILE *out, BIGNUM *n, BIGNUM *e, const EVP_MD *dgst, unsigned char *Msg, long Msglen, unsigned char *S, long Slen, int Saltlen); +#ifdef FIPS_ALGVS +int fips_rsavtest_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; @@ -138,7 +142,7 @@ int main(int argc, char **argv) goto end; } - if (!rsa_test(out, in, Saltlen)) + if (!rsa_vtest(out, in, Saltlen)) { fprintf(stderr, "FATAL RSAVTEST file processing error\n"); goto end; @@ -159,7 +163,7 @@ int main(int argc, char **argv) #define RSA_TEST_MAXLINELEN 10240 -int rsa_test(FILE *out, FILE *in, int Saltlen) +int rsa_vtest(FILE *out, FILE *in, int Saltlen) { char *linebuf, *olinebuf, *p, *q; char *keyword, *value; @@ -319,7 +323,6 @@ static int rsa_printver(FILE *out, int ret = 0, r, pad_mode; /* Setup RSA and EVP_PKEY structures */ RSA *rsa_pubkey = NULL; - EVP_MD_CTX ctx; unsigned char *buf = NULL; rsa_pubkey = FIPS_rsa_new(); if (!rsa_pubkey) @@ -329,8 +332,6 @@ static int rsa_printver(FILE *out, if (!rsa_pubkey->n || !rsa_pubkey->e) goto error; - FIPS_md_ctx_init(&ctx); - if (Saltlen >= 0) pad_mode = RSA_PKCS1_PSS_PADDING; else if (Saltlen == -2) @@ -338,19 +339,11 @@ static int rsa_printver(FILE *out, else pad_mode = RSA_PKCS1_PADDING; - if (!FIPS_digestinit(&ctx, dgst)) - goto error; - if (!FIPS_digestupdate(&ctx, Msg, Msglen)) - goto error; - no_err = 1; - r = FIPS_rsa_verify_ctx(rsa_pubkey, &ctx, + r = FIPS_rsa_verify(rsa_pubkey, Msg, Msglen, dgst, pad_mode, Saltlen, NULL, S, Slen); no_err = 0; - - FIPS_md_ctx_cleanup(&ctx); - if (r < 0) goto error; diff --git a/fips/sha/Makefile b/fips/sha/Makefile index 
9bc598301f..0878e7bf64 100644 --- a/fips/sha/Makefile +++ b/fips/sha/Makefile @@ -30,7 +30,8 @@ LIB=$(TOP)/libcrypto.a LIBSRC=fips_sha1_selftest.c LIBOBJ=fips_sha1_selftest.o -SRC= $(LIBSRC) fips_standalone_sha1.c +SRC= $(LIBSRC) +PROGS= fips_standalone_sha1.c EXHEADER= HEADER= diff --git a/fips/sha/fips_shatest.c b/fips/sha/fips_shatest.c index c14df16601..3954777a64 100644 --- a/fips/sha/fips_shatest.c +++ b/fips/sha/fips_shatest.c @@ -86,7 +86,11 @@ static int print_dgst(const EVP_MD *md, FILE *out, static int print_monte(const EVP_MD *md, FILE *out, unsigned char *Seed, int SeedLen); +#ifdef FIPS_ALGVS +int fips_shatest_main(int argc, char **argv) +#else int main(int argc, char **argv) +#endif { FILE *in = NULL, *out = NULL; diff --git a/fips/utl/fips_enc.c b/fips/utl/fips_enc.c index 1358b1f4a4..13ac4ac9f5 100644 --- a/fips/utl/fips_enc.c +++ b/fips/utl/fips_enc.c @@ -208,6 +208,7 @@ int FIPS_cipherinit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, break; case EVP_CIPH_CTR_MODE: + ctx->num = 0; /* Don't reuse IV for CTR mode */ if(iv) memcpy(ctx->iv, iv, M_EVP_CIPHER_CTX_iv_length(ctx)); diff --git a/iOS/Makefile b/iOS/Makefile new file mode 100644 index 0000000000..db26da6406 --- /dev/null +++ b/iOS/Makefile @@ -0,0 +1,76 @@ +# +# OpenSSL/iOS/Makefile +# + +DIR= iOS +TOP= .. 
+CC= cc +INCLUDES= -I$(TOP) -I$(TOP)/include +CFLAG= -g -static +MAKEFILE= Makefile +PERL= perl +RM= rm -f + +EXE=incore_macho + +CFLAGS= $(INCLUDES) $(CFLAG) + +top: + @$(MAKE) -f $(TOP)/Makefile reflect THIS=exe + +exe: fips_algvs.app/fips_algvs + +incore_macho: incore_macho.c $(TOP)/crypto/sha/sha1dgst.c + $(HOSTCC) $(HOSTCFLAGS) -I$(TOP)/include -I$(TOP)/crypto -o $@ incore_macho.c $(TOP)/crypto/sha/sha1dgst.c + +fips_algvs.app/fips_algvs: $(TOP)/test/fips_algvs.c $(TOP)/fips/fipscanister.o fopen.m incore_macho + FIPS_SIG=./incore_macho \ + $(TOP)/fips/fipsld $(CFLAGS) -I$(TOP)/fips -o $@ \ + $(TOP)/test/fips_algvs.c $(TOP)/fips/fipscanister.o \ + fopen.m -framework Foundation || rm $@ + codesign -f -s "iPhone Developer" --entitlements fips_algvs.app/Entitlements.plist fips_algvs.app || rm $@ + +install: + @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... + @set -e; for i in $(EXE); \ + do \ + (echo installing $$i; \ + cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i.new; \ + chmod 755 $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i.new; \ + mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i ); \ + done; + @set -e; for i in $(SCRIPTS); \ + do \ + (echo installing $$i; \ + cp $$i $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i.new; \ + chmod 755 $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i.new; \ + mv -f $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i.new $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i ); \ + done + +tags: + ctags $(SRC) + +tests: + +links: + +lint: + lint -DLINT $(INCLUDES) $(SRC)>fluff + +depend: + @if [ -z "$(THIS)" ]; then \ + $(MAKE) -f $(TOP)/Makefile reflect THIS=$@; \ + else \ + $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(SRC); \ + fi + +dclean: + $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new + mv -f Makefile.new $(MAKEFILE) + +clean: + rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff $(EXE) + rm -f fips_algvs.app/fips_algvs + +# DO NOT DELETE THIS LINE -- 
make depend depends on it. + diff --git a/iOS/fips_algvs.app/Entitlements.plist b/iOS/fips_algvs.app/Entitlements.plist new file mode 100644 index 0000000000..929c4e96d2 --- /dev/null +++ b/iOS/fips_algvs.app/Entitlements.plist @@ -0,0 +1,8 @@ + + + + + get-task-allow + + + \ No newline at end of file diff --git a/iOS/fips_algvs.app/Info.plist b/iOS/fips_algvs.app/Info.plist new file mode 100644 index 0000000000..3fd8fb4290 --- /dev/null +++ b/iOS/fips_algvs.app/Info.plist @@ -0,0 +1,24 @@ + + + + + CFBundleName + fips_algvs + CFBundleSupportedPlatforms + + iPhoneOS + + CFBundleExecutable + fips_algvs + CFBundleIdentifier + fips_algvs + CFBundleResourceSpecification + ResourceRules.plist + LSRequiresIPhoneOS + + CFBundleDisplayName + fips_algvs + CFBundleVersion + 1.0 + + diff --git a/iOS/fips_algvs.app/ResourceRules.plist b/iOS/fips_algvs.app/ResourceRules.plist new file mode 100644 index 0000000000..e7ec329dcc --- /dev/null +++ b/iOS/fips_algvs.app/ResourceRules.plist @@ -0,0 +1,25 @@ + + + + + rules + + .* + + Info.plist + + omit + + weight + 10 + + ResourceRules.plist + + omit + + weight + 100 + + + + diff --git a/iOS/fopen.m b/iOS/fopen.m new file mode 100644 index 0000000000..8d2e790845 --- /dev/null +++ b/iOS/fopen.m @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include + +static FILE *(*libc_fopen)(const char *, const char *) = NULL; + +__attribute__((constructor)) +static void pre_main(void) +{ + /* + * Pull reference to fopen(3) from libc. + */ + void *handle = dlopen("libSystem.B.dylib",RTLD_LAZY); + + if (handle) { + libc_fopen = dlsym(handle,"fopen"); + dlclose(handle); + } + + /* + * Change to Documents directory. 
+ */ + NSString *docs = [NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES) lastObject]; + + NSFileManager *filemgr = [NSFileManager defaultManager]; + [filemgr changeCurrentDirectoryPath: docs]; + [filemgr release]; +} + +char *mkdirhier(char *path) +{ + char *slash; + struct stat buf; + + if (path[0]=='.' && path[1]=='/') path+=2; + + if ((slash = strrchr(path,'/'))) { + *slash = '\0'; + if (stat(path,&buf)==0) { + *slash = '/'; + return NULL; + } + (void)mkdirhier(path); + mkdir (path,0777); + *slash = '/'; + } + + return slash; +} +/* + * Replacement fopen(3) + */ +FILE *fopen(const char *filename, const char *mode) +{ + FILE *ret; + + if ((ret = (*libc_fopen)(filename,mode)) == NULL) { + /* + * If file is not present in Documents directory, try from Bundle. + */ + NSString *nsspath = [NSString stringWithFormat:@"%@/%s", + [[NSBundle mainBundle] bundlePath], + filename]; + + if ((ret = (*libc_fopen)([nsspath cStringUsingEncoding:NSUTF8StringEncoding],mode)) == NULL && + mode[0]=='w' && + ((filename[0]!='.' && filename[0]!='/') || + (filename[0]=='.' && filename[1]=='/')) ) { + /* + * If not present in Bundle, create directory in Documents + */ + char *path = strdup(filename), *slash; + static int once = 1; + + if ((slash = mkdirhier(path)) && once) { + /* + * For some reason iOS truncates first created file + * upon program exit, so we create one preemptively... + */ + once = 0; + strcpy(slash,"/.0"); + creat(path,0444); + } + free(path); + ret = (*libc_fopen)(filename,mode); + } + } + + return ret; +} diff --git a/iOS/incore_macho.c b/iOS/incore_macho.c new file mode 100644 index 0000000000..8842764cb0 --- /dev/null +++ b/iOS/incore_macho.c @@ -0,0 +1,1016 @@ +/* incore_macho.c */ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* ==================================================================== + * Copyright 2011 Thursby Software Systems, Inc. All rights reserved. + * + * The portions of the attached software ("Contribution") is developed by + * Thursby Software Systems, Inc and is licensed pursuant to the OpenSSL + * open source license. + * + * The Contribution, originally written by Paul W. Nelson of + * Thursby Software Systems, Inc, consists of the fingerprint calculation + * required for the FIPS140 integrity check. + * + * No patent licenses or other rights except those expressly stated in + * the OpenSSL open source license shall be deemed granted or received + * expressly, by implication, estoppel, or otherwise. + * + * No assurances are provided by Thursby that the Contribution does not + * infringe the patent or other intellectual property rights of any third + * party or that the license provides you with all the necessary rights + * to make use of the Contribution. + * + * THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. IN + * ADDITION TO THE DISCLAIMERS INCLUDED IN THE LICENSE, THURSBY + * SPECIFICALLY DISCLAIMS ANY LIABILITY FOR CLAIMS BROUGHT BY YOU OR ANY + * OTHER ENTITY BASED ON INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS OR + * OTHERWISE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CPU_SUBRTPE_V7F +# define CPU_SUBRTPE_V7F ((cpu_subtype_t) 10) +#endif +/* iPhone 5 and iPad 4 (A6 Processors) */ +#ifndef CPU_SUBTYPE_ARM_V7S +# define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t) 11) +#endif +#ifndef CPU_SUBTYPE_ARM_V7K +# define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t) 12) +#endif +#ifndef CPU_SUBTYPE_ARM_V8 +# define CPU_SUBTYPE_ARM_V8 ((cpu_subtype_t) 13) +#endif + +#ifndef CPU_TYPE_ARM64 +# define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +#endif + +static int gVerbosity = 0; + +static void hexdump(const unsigned char *buf,size_t len, + unsigned long address,FILE* fp) +{ + unsigned long addr; + int i; + + addr = 0; + while(addrflags; + memcpy( sec->sectname, pSec->sectname, 16 ); + memcpy( sec->segname, pSec->segname, 16 ); + sec->addr = pSec->addr; + sec->size = pSec->size; + sec->offset = pSec->offset; + sec->align = pSec->align; + sec->reloff = pSec->reloff; + sec->nreloc = pSec->nreloc; + sec->flags = pSec->flags; + rval = pCommand + sizeof(struct section_64); + } + else + { + struct section* pSec = (struct section*)pCommand; + flags = pSec->flags; + memcpy( sec->sectname, pSec->sectname, 16 ); + memcpy( sec->segname, pSec->segname, 16 ); + sec->addr = pSec->addr; + sec->size = pSec->size; + sec->offset = pSec->offset; + sec->align = pSec->align; + sec->reloff = pSec->reloff; + sec->nreloc = pSec->nreloc; + sec->flags = pSec->flags; + rval = pCommand + sizeof(struct section); + } + if( gVerbosity > 2 ) + fprintf(stderr, " flags=%x\n", flags); + sec->segment = segment; + sec->_next = NULL; + if( macho->sec_head ) + macho->sec_tail->_next = sec; + else + macho->sec_head = sec; + macho->sec_tail = sec; + return rval; +} + +static section_t *lookup_section(macho_file_t* macho, uint32_t nsect) +{ + section_t *rval = macho->sec_head; + + if(nsect == 0) 
return NULL; + + while( rval != NULL && --nsect > 0 ) + rval = rval->_next; + return rval; +} + +static void *add_segment( macho_file_t *macho, void *pCommand, uint8_t is64bit ) +{ + void *rval = 0; + segment_t *seg = (segment_t *)calloc(1, sizeof(segment_t)); + + if(!seg) + return 0; + if(is64bit) + { + struct segment_command_64 *pSeg = (struct segment_command_64*)pCommand; + + memcpy( seg->segname, pSeg->segname, 16 ); + seg->vmaddr = pSeg->vmaddr; + seg->vmsize = pSeg->vmsize; + seg->fileoff = pSeg->fileoff; + seg->filesize = pSeg->filesize; + seg->maxprot = pSeg->maxprot; + seg->initprot = pSeg->initprot; + seg->nsects = pSeg->nsects; + seg->flags = pSeg->flags; + rval = pCommand + sizeof(struct segment_command_64); + } else { + struct segment_command *pSeg = (struct segment_command*)pCommand; + + memcpy( seg->segname, pSeg->segname, 16 ); + seg->vmaddr = pSeg->vmaddr; + seg->vmsize = pSeg->vmsize; + seg->fileoff = pSeg->fileoff; + seg->filesize = pSeg->filesize; + seg->maxprot = pSeg->maxprot; + seg->initprot = pSeg->initprot; + seg->nsects = pSeg->nsects; + seg->flags = pSeg->flags; + rval = pCommand + sizeof(struct segment_command); + } + seg->_next = NULL; + seg->mapped = macho->mapped + seg->fileoff; + + if( macho->seg_head ) + macho->seg_tail->_next = seg; + else + macho->seg_head = seg; + macho->seg_tail = seg; + + if( gVerbosity > 2 ) + fprintf(stderr, "Segment %s: flags=%x\n", seg->segname, seg->flags ); + + unsigned int ii; + for( ii=0; iinsects; ii++ ) + { + rval = add_section(macho, rval, is64bit, seg); + } + return rval; +} + +static const char *type_str(uint8_t n_type) +{ + static char result[16] = {}; + int idx = 0; + uint8_t stab; + + memset(result, 0, sizeof(result)); + if( n_type & N_PEXT ) + result[idx++] = 'P'; + if( n_type & N_EXT ) + result[idx++] = 'E'; + if( idx > 0 ) + result[idx++] = ':'; + switch( n_type & N_TYPE ) + { + case N_UNDF: result[idx++] = 'U'; break; + case N_ABS: result[idx++] = 'A'; break; + case N_PBUD: result[idx++] = 
'P'; break; + case N_SECT: result[idx++] = 'S'; break; + case N_INDR: result[idx++] = 'I'; break; + default: result[idx++] = '*'; break; + } + stab = n_type & N_STAB; + if( stab ) + { + result[idx++] = ':'; + result[idx++] = '0'+(stab >> 5); + } + result[idx++] = 0; + return result; +} + +static symtab_entry_t *lookup_entry_by_name( macho_file_t *macho, + const char *name) +{ + symtab_entry_t *entry; + + for( entry = macho->sym_head; entry; entry = entry->_next ) + { + if(strcmp(entry->n_symbol,name)==0 && (entry->n_type & N_STAB)==0 ) + { + if( entry->section == NULL ) + { + entry->section = lookup_section( macho, entry->n_sect ); + if( entry->section ) + { + section_t* sec = entry->section; + segment_t* seg = sec->segment; + uint64_t offset = entry->n_value - seg->vmaddr; + + entry->mapped = seg->mapped+offset; + } + else + entry = 0; + } + break; + } + } + return entry; +} + +static void check_symtab(macho_file_t *macho,void *pCommand,uint8_t is64bit ) +{ + + struct symtab_command *pSym = (struct symtab_command *)pCommand; + void *pS = macho->mapped + pSym->symoff; + unsigned int ii = 0; + + /* collect symbols */ + for( ii=0; iinsyms; ii++ ) + { + struct nlist *pnlist=(struct nlist*)pS; + symtab_entry_t *entry=(symtab_entry_t*)calloc(1,sizeof(symtab_entry_t)); + + if(!entry) + { + fprintf(stderr, "out of memory!\n"); + _exit(1); + } + entry->n_strx = pnlist->n_un.n_strx; + entry->n_type = pnlist->n_type; + entry->n_sect = pnlist->n_sect; + entry->n_desc = pnlist->n_desc; + entry->section = NULL; + if(is64bit) + { + struct nlist_64 *pnlist64 = (struct nlist_64*)pS; + + entry->n_value = pnlist64->n_value; + pS += sizeof(struct nlist_64); + } + else + { + entry->n_value = pnlist->n_value; + pS += sizeof(struct nlist); + } + entry->n_symbol=(const char *)macho->mapped+pSym->stroff+entry->n_strx; + entry->_next = NULL; + if( macho->sym_head ) + macho->sym_tail->_next = entry; + else + macho->sym_head = entry; + macho->sym_tail = entry; + } + if( gVerbosity > 2 ) + { 
+ /* dump info */ + symtab_entry_t* entry; + + for( entry = macho->sym_head; entry; entry=entry->_next ) + { + /* only do non-debug symbols */ + if( (entry->n_type & N_STAB) == 0 ) + fprintf(stderr, "%32.32s %18llx type=%s, sect=%d\n", + entry->n_symbol, entry->n_value, + type_str(entry->n_type), entry->n_sect); + } + } +} + +static int load_architecture( macho_file_t* inFile ) +{ + /* check the header */ + unsigned int ii; + void * pCurrent = inFile->mapped; + struct mach_header* header = (struct mach_header*)pCurrent; + + if( header->magic != MH_MAGIC && header->magic != MH_MAGIC_64 ) + { + fprintf(stderr, "%s is not a mach-o file\n", inFile->filename); + return -1; + } + else if( header->filetype == MH_BUNDLE ) + { + fprintf(stderr, "%s is not a mach-o executable file (filetype MH_BUNDLE, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename); + return -1; + } + else if( header->filetype == MH_DYLINKER ) + { + fprintf(stderr, "%s is not a mach-o executable file (filetype MH_DYLINKER, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename); + return -1; + } + else if( !(header->filetype == MH_EXECUTE || header->filetype == MH_DYLIB) ) + { + fprintf(stderr, "%s is not a mach-o executable file (filetype %d, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename, header->filetype); + return -1; + } + + if( gVerbosity > 1 ) + fprintf(stderr, "loading %s(%s)\n", inFile->filename, cputype(header->cputype, header->cpusubtype)); + + inFile->cpu_type = header->cputype; + inFile->cpu_subtype = header->cpusubtype; + + if( header->magic == MH_MAGIC ) + pCurrent += sizeof( struct mach_header ); + else if( header->magic == MH_MAGIC_64 ) + pCurrent += sizeof( struct mach_header_64 ); + for( ii=0; iincmds; ii++ ) + { + struct load_command* command = (struct load_command*)pCurrent; + const char * lc_name; + + switch( command->cmd ) + { + case LC_SEGMENT: + { + lc_name = "LC_SEGMENT"; + add_segment(inFile, pCurrent, header->magic == MH_MAGIC_64); + break; + } + case LC_SYMTAB: + 
{ + lc_name = "LC_SYMTAB"; + check_symtab(inFile, pCurrent, header->magic == MH_MAGIC_64 ); + break; + } + case LC_SYMSEG: lc_name = "LC_SYMSEG"; break; + case LC_THREAD: lc_name = "LC_THREAD"; break; + case LC_UNIXTHREAD: lc_name = "LC_UNIXTHREAD"; break; + case LC_LOADFVMLIB: lc_name = "LC_LOADFVMLIB"; break; + case LC_IDFVMLIB: lc_name = "LC_IDFVMLIB"; break; + case LC_IDENT: lc_name = "LC_IDENT"; break; + case LC_FVMFILE: lc_name = "LC_FVMFILE"; break; + case LC_PREPAGE: lc_name = "LC_PREPAGE"; break; + case LC_DYSYMTAB: lc_name = "LC_DYSYMTAB"; break; + case LC_LOAD_DYLIB: lc_name = "LC_LOAD_DYLIB"; break; + case LC_ID_DYLIB: lc_name = "LC_ID_DYLIB"; break; + case LC_LOAD_DYLINKER: lc_name = "LC_LOAD_DYLINKER"; break; + case LC_ID_DYLINKER: lc_name = "LC_ID_DYLINKER"; break; + case LC_PREBOUND_DYLIB: lc_name = "LC_PREBOUND_DYLIB"; break; + case LC_ROUTINES: lc_name = "LC_ROUTINES"; break; + case LC_SUB_FRAMEWORK: lc_name = "LC_SUB_FRAMEWORK"; break; + case LC_SUB_UMBRELLA: lc_name = "LC_SUB_UMBRELLA"; break; + case LC_SUB_CLIENT: lc_name = "LC_SUB_CLIENT"; break; + case LC_SUB_LIBRARY: lc_name = "LC_SUB_LIBRARY"; break; + case LC_TWOLEVEL_HINTS: lc_name = "LC_TWOLEVEL_HINTS"; break; + case LC_PREBIND_CKSUM: lc_name = "LC_PREBIND_CKSUM"; break; + case LC_LOAD_WEAK_DYLIB: lc_name = "LC_LOAD_WEAK_DYLIB"; break; + case LC_SEGMENT_64: + { + lc_name = "LC_SEGMENT_64"; + add_segment(inFile, pCurrent, TRUE); + break; + } + case LC_ROUTINES_64: lc_name = "LC_ROUTINES_64"; break; + case LC_UUID: lc_name = "LC_UUID"; break; + case LC_RPATH: lc_name = "LC_RPATH"; break; + case LC_CODE_SIGNATURE: lc_name = "LC_CODE_SIGNATURE"; break; + case LC_SEGMENT_SPLIT_INFO: + lc_name = "LC_SEGMENT_SPLIT_INFO"; break; + case LC_REEXPORT_DYLIB: lc_name = "LC_REEXPORT_DYLIB"; break; + case LC_LAZY_LOAD_DYLIB: lc_name = "LC_LAZY_LOAD_DYLIB"; break; + case LC_ENCRYPTION_INFO: lc_name = "LC_ENCRYPTION_INFO"; break; + case LC_DYLD_INFO: lc_name = "LC_DYLD_INFO"; break; + case 
LC_DYLD_INFO_ONLY: lc_name = "LC_DYLD_INFO_ONLY"; break; + case LC_LOAD_UPWARD_DYLIB: lc_name = "LC_LOAD_UPWARD_DYLIB"; break; + case LC_VERSION_MIN_MACOSX: + lc_name = "LC_VERSION_MIN_MACOSX"; break; + case LC_VERSION_MIN_IPHONEOS: + lc_name = "LC_VERSION_MIN_IPHONEOS"; break; + case LC_FUNCTION_STARTS: lc_name = "LC_FUNCTION_STARTS"; break; + case LC_DYLD_ENVIRONMENT: lc_name = "LC_DYLD_ENVIRONMENT"; break; + default: lc_name=NULL; break; + } + if( gVerbosity > 1 ) + { + if(lc_name) + fprintf(stderr,"command %s: size=%d\n",lc_name, + command->cmdsize ); + else + fprintf(stderr,"command %x, size=%d\n",command->cmd, + command->cmdsize); + } + pCurrent += command->cmdsize; + } + return 0; +} + +#define HOSTORDER_VALUE(val) (isBigEndian ? OSSwapBigToHostInt32(val) : (val)) + +static macho_file_t *load_file(macho_file_t *inFile) +{ + macho_file_t *rval = NULL; + void *pCurrent = inFile->mapped; + struct fat_header *fat = (struct fat_header *)pCurrent; + + if( fat->magic==FAT_MAGIC || fat->magic==FAT_CIGAM ) + { + int isBigEndian = fat->magic == FAT_CIGAM; + unsigned int ii = 0; + struct fat_arch *pArch = NULL; + uint32_t nfat_arch = 0; + + pCurrent += sizeof(struct fat_header); + pArch = pCurrent; + nfat_arch = HOSTORDER_VALUE(fat->nfat_arch); + for( ii=0; iifilename = strdup(inFile->filename); + archfile->mapped = inFile->mapped + + HOSTORDER_VALUE(pArch->offset); + archfile->size = HOSTORDER_VALUE(pArch->size); + archfile->align = HOSTORDER_VALUE(pArch->align); + archfile->isBigEndian = isBigEndian; + archfile->cpu_type = HOSTORDER_VALUE(pArch->cputype); + archfile->cpu_subtype = HOSTORDER_VALUE(pArch->cpusubtype); + if( load_architecture(archfile) == 0 ) + { + archfile->next = rval; + rval = archfile; + } + } + else + return NULL; /* no memory */ + pArch++; + } + } + else + { + struct mach_header* header = (struct mach_header*)pCurrent; + + if( header->magic != MH_MAGIC && header->magic != MH_MAGIC_64 ) + { + fprintf(stderr, "%s is not a mach-o file\n", 
inFile->filename); + } + else if( header->filetype == MH_BUNDLE ) + { + fprintf(stderr, "%s is not a mach-o executable file " + "(filetype MH_BUNDLE, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename); + } + else if( header->filetype == MH_DYLINKER ) + { + fprintf(stderr, "%s is not a mach-o executable file " + "(filetype MH_DYLINKER, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename); + } + else if( !(header->filetype == MH_EXECUTE || header->filetype == MH_DYLIB) ) + { + fprintf(stderr, "%s is not a mach-o executable file " + "(filetype %d should be MH_EXECUTE or MH_DYLIB)\n", + inFile->filename, header->filetype ); + } + if( load_architecture(inFile) == 0 ) + { + inFile->next = 0; + rval = inFile; + } + } + return rval; +} + +#define FIPS_SIGNATURE_SIZE 20 +#define FIPS_FINGERPRINT_SIZE 40 + +static void debug_symbol( symtab_entry_t* sym ) +{ + if( gVerbosity > 1 ) + { + section_t* sec = sym->section; + segment_t* seg = sec->segment; + fprintf(stderr, "%-40.40s: %llx sect=%s, segment=%s prot=(%x->%x)\n", + sym->n_symbol, sym->n_value, sec->sectname, + seg->segname, seg->initprot, seg->maxprot ); + } +} + +/* + * Minimalistic HMAC from fips_standalone_sha1.c + */ +static void hmac_init(SHA_CTX *md_ctx,SHA_CTX *o_ctx, + const char *key) + { + size_t len=strlen(key); + int i; + unsigned char keymd[HMAC_MAX_MD_CBLOCK]; + unsigned char pad[HMAC_MAX_MD_CBLOCK]; + + if (len > SHA_CBLOCK) + { + SHA1_Init(md_ctx); + SHA1_Update(md_ctx,key,len); + SHA1_Final(keymd,md_ctx); + len=20; + } + else + memcpy(keymd,key,len); + memset(&keymd[len],'\0',HMAC_MAX_MD_CBLOCK-len); + + for(i=0 ; i < HMAC_MAX_MD_CBLOCK ; i++) + pad[i]=0x36^keymd[i]; + SHA1_Init(md_ctx); + SHA1_Update(md_ctx,pad,SHA_CBLOCK); + + for(i=0 ; i < HMAC_MAX_MD_CBLOCK ; i++) + pad[i]=0x5c^keymd[i]; + SHA1_Init(o_ctx); + SHA1_Update(o_ctx,pad,SHA_CBLOCK); + } + +static void hmac_final(unsigned char *md,SHA_CTX *md_ctx,SHA_CTX *o_ctx) + { + unsigned char buf[20]; + + SHA1_Final(buf,md_ctx); + 
SHA1_Update(o_ctx,buf,sizeof buf); + SHA1_Final(md,o_ctx); + } + +static int fingerprint(macho_file_t* inFile, int addFingerprint) +{ + int rval = 0; + unsigned char signature[FIPS_SIGNATURE_SIZE]; + char signature_string[FIPS_FINGERPRINT_SIZE+1]; + unsigned int len = sizeof(signature); + const char *fingerprint = NULL; + int ii = 0; + +#define LOOKUP_SYMBOL( symname, prot ) \ + symtab_entry_t *symname = \ + lookup_entry_by_name( inFile, "_" #symname ); \ + if( ! symname ) { \ + fprintf(stderr, "%s: Not a FIPS executable (" \ + #symname " not found)\n", inFile->filename ); \ + return -1;\ + } \ + if( (symname->section->segment->initprot & \ + (PROT_READ|PROT_WRITE|PROT_EXEC)) != (prot) ) { \ + fprintf(stderr, #symname \ + " segment has the wrong protection.\n"); \ + debug_symbol(symname);return -1;\ + } + + LOOKUP_SYMBOL( FIPS_rodata_start, PROT_READ | PROT_EXEC ); + LOOKUP_SYMBOL( FIPS_rodata_end, PROT_READ | PROT_EXEC ); + LOOKUP_SYMBOL( FIPS_text_startX, PROT_READ | PROT_EXEC ); + LOOKUP_SYMBOL( FIPS_text_endX, PROT_READ | PROT_EXEC ); + LOOKUP_SYMBOL( FIPS_signature, PROT_WRITE | PROT_READ ); + LOOKUP_SYMBOL( FINGERPRINT_ascii_value, PROT_READ | PROT_EXEC ); + + if( gVerbosity > 1 ) + { + debug_symbol( FIPS_rodata_start ); + debug_symbol( FIPS_rodata_end ); + debug_symbol( FIPS_text_startX ); + debug_symbol( FIPS_text_endX ); + debug_symbol( FIPS_signature ); + debug_symbol( FINGERPRINT_ascii_value ); + + fingerprint = (const char *)FINGERPRINT_ascii_value->mapped; + fprintf(stderr, "fingerprint: "); + for(ii=0; ii<40; ii++ ) + { + if( fingerprint[ii] == 0 ) + break; + putc(fingerprint[ii], stderr); + } + putc('\n', stderr); + } + + /* check for the prefix ? 
character */ + { + const unsigned char * p1 = FIPS_text_startX->mapped; + const unsigned char * p2 = FIPS_text_endX->mapped; + const unsigned char * p3 = FIPS_rodata_start->mapped; + const unsigned char * p4 = FIPS_rodata_end->mapped; + static const char FIPS_hmac_key[]="etaonrishdlcupfm"; + SHA_CTX md_ctx,o_ctx; + + hmac_init(&md_ctx,&o_ctx,FIPS_hmac_key); + + if (p1<=p3 && p2>=p3) + p3=p1, p4=p2>p4?p2:p4, p1=NULL, p2=NULL; + else if (p3<=p1 && p4>=p1) + p3=p3, p4=p2>p4?p2:p4, p1=NULL, p2=NULL; + + if (p1) { + + SHA1_Update(&md_ctx,p1,(size_t)p2-(size_t)p1); + } + if (FIPS_signature->mapped>=p3 && FIPS_signature->mappedmapped+FIPS_SIGNATURE_SIZE; + if (p3mapped; + inFile->fingerprint_original = strndup(fingerprint,FIPS_FINGERPRINT_SIZE); + inFile->fingerprint_computed = strdup(signature_string); + + if( addFingerprint ) + { + void *fp_page = NULL; + void *fp_end = NULL; + + if(strcmp(fingerprint,"?have to make sure this string is unique")!=0) + { + if (memcmp((char*)fingerprint, signature_string, FIPS_FINGERPRINT_SIZE)!=0) + { + fprintf(stderr, + "%s(%s) original fingerprint incorrect: %s\n", + inFile->filename, + cputype(inFile->cpu_type, inFile->cpu_subtype), + fingerprint); + } + } + + fp_page = (void*)((uintptr_t)fingerprint & ~PAGE_MASK); + fp_end = (void*)((uintptr_t)(fingerprint+(PAGE_SIZE*2)) & ~PAGE_MASK); + if( mprotect( fp_page, fp_end-fp_page, PROT_READ|PROT_WRITE ) ) + { + perror("Can't write the fingerprint - mprotect failed"); + fprintf(stderr, "fp_page=%p, fp_end=%p, len=%ld\n", + fp_page, fp_end, (size_t)(fp_end-fp_page)); + rval = 1; + } + else + { + memcpy((char*)fingerprint, signature_string, FIPS_FINGERPRINT_SIZE); + if( msync(fp_page, (fp_end-fp_page), 0) ) + perror("msync failed"); + } + if( gVerbosity > 0 ) + fprintf(stderr, "%s(%s) fingerprint: %s\n", inFile->filename, + cputype(inFile->cpu_type,inFile->cpu_subtype), + signature_string); + } + if( *fingerprint == '?' 
) + { + printf("%s(%s) has no fingerprint.\n", inFile->filename, + cputype(inFile->cpu_type, inFile->cpu_subtype)); + rval = 2; + } + else if( strncmp( fingerprint, signature_string, FIPS_FINGERPRINT_SIZE) == 0 ) + { + if( ! addFingerprint ) + printf("%s(%s) fingerprint is correct: %s\n", inFile->filename, + cputype(inFile->cpu_type, inFile->cpu_subtype), + signature_string); + } + else + { + printf("%s(%s) fingerprint %.40s is not correct\n", inFile->filename, + cputype(inFile->cpu_type,inFile->cpu_subtype), fingerprint); + printf("calculated: %s\n", signature_string); + rval = -1; + } + return rval; +} + +static int make_fingerprint( const char * inApp, int addFingerprint ) +{ + int rval = 1; + int appfd = -1; + if( addFingerprint ) + appfd = open( inApp, O_RDWR ); + if( appfd < 0 ) + { + if( addFingerprint ) + fprintf(stderr, "Can't modify %s. Verifying only.\n", inApp); + addFingerprint = 0; + appfd = open( inApp, O_RDONLY ); + } + if( appfd >= 0 ) + { + struct stat stbuf; + fstat(appfd, &stbuf); + void * pApp = mmap(0, (size_t)stbuf.st_size, PROT_READ, + MAP_SHARED, appfd, (off_t)0); + if( pApp == MAP_FAILED ) + { + perror(inApp); + } + else + { + macho_file_t theFile; + macho_file_t* architectures; + macho_file_t* pArchitecture; + + memset( &theFile, 0, sizeof(theFile) ); + theFile.filename = inApp; + theFile.mapped = pApp; + architectures = load_file(&theFile); + for( pArchitecture = architectures; pArchitecture; + pArchitecture = pArchitecture->next ) + { + rval = fingerprint(pArchitecture, addFingerprint); + if( rval && addFingerprint ) + { + printf("Failure\n"); + break; + } + } + if((rval==0) && addFingerprint) + { + printf("Fingerprint Stored\n"); + } + munmap(pApp, (size_t)stbuf.st_size); + } + close(appfd); + } + else + { + fprintf(stderr, "Can't open %s\n", inApp ); + } + return rval; +} + +static void print_usage(const char * prog) +{ + fprintf(stderr, "usage:\n\t%s [--debug] [--quiet] [-exe|-dso|-dylib] executable\n", prog); + _exit(1); +} + +int 
main (int argc, const char * argv[]) +{ + const char * pname = argv[0]; + const char * filename = NULL; + int addFingerprint = 1; + const char * verbose_env = getenv("FIPS_SIG_VERBOSE"); + + if( verbose_env ) + gVerbosity = atoi(verbose_env); + + if( gVerbosity < 0 ) + gVerbosity = 1; + + while( --argc ) + { + ++argv; + if( strcmp(*argv,"-exe")==0 || strcmp(*argv,"--exe")==0 || + strcmp(*argv,"-dso")==0 || strcmp(*argv,"--dso")==0 || + strcmp(*argv,"-dylib")==0 || strcmp(*argv,"--dylib")==0 || + strcmp(*argv,"--verify")==0 ) + { + if(strcmp(*argv,"--verify")==0) + addFingerprint=0; + + if( argc > 0 ) + { + filename = *++argv; + argc--; + } + } + else if(strcmp(*argv,"-d")==0 || strcmp(*argv,"-debug")==0 || strcmp(*argv,"--debug")==0) + { + if( gVerbosity < 2 ) + gVerbosity = 2; + else + gVerbosity++; + } + else if(strcmp(*argv,"-q")==0 || strcmp(*argv,"-quiet")==0 || strcmp(*argv,"--quiet")==0) + gVerbosity = 0; + else if(strncmp(*argv,"-",1)!=0) { + filename = *argv; + } + } + + if( !filename ) + { + print_usage(pname); + return 1; + } + + if( access(filename, R_OK) ) + { + fprintf(stderr, "Can't access %s\n", filename); + return 1; + } + + return make_fingerprint( filename, addFingerprint ); +} + diff --git a/ms/do_fips.bat b/ms/do_fips.bat index 73b0a3e8e4..357f8fc76e 100644 --- a/ms/do_fips.bat +++ b/ms/do_fips.bat @@ -1,7 +1,10 @@ -@echo off +rem @echo off SET ASM=%1 SET EXARG= +SET MFILE=ntdll.mak + +if NOT X%OSVERSION% == X goto wince if NOT X%PROCESSOR_ARCHITECTURE% == X goto defined @@ -42,6 +45,14 @@ SET TARGET=VC-WIN64A if x%ASM% == xno-asm goto compile SET ASM=nasm +goto compile + +:wince + +echo Auto Configuring for WinCE +SET TARGET=VC-CE +SET MFILE=cedll.mak + :compile if x%ASM% == xno-asm SET EXARG=no-asm @@ -52,13 +63,13 @@ echo on perl util\mkfiles.pl >MINFO @if ERRORLEVEL 1 goto error -perl util\mk1mf.pl dll %ASM% %TARGET% >ms\ntdll.mak +perl util\mk1mf.pl dll %ASM% %TARGET% >ms\%MFILE% @if ERRORLEVEL 1 goto error -nmake -f ms\ntdll.mak clean 
-nmake -f ms\ntdll.mak +nmake -f ms\%MFILE% clean +nmake -f ms\%MFILE% @if ERRORLEVEL 1 goto error -nmake -f ms\ntdll.mak install +nmake -f ms\%MFILE% install @if ERRORLEVEL 1 goto error @echo. diff --git a/test/Makefile b/test/Makefile index 2577d245b9..3f9770663b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,6 +12,7 @@ PERL= perl # KRB5 stuff KRB5_INCLUDES= LIBKRB5= +TEST= fips_algvs.c PEX_LIBS= EX_LIBS= #-lnsl -lsocket @@ -81,6 +82,7 @@ FIPS_ECDHVS= fips_ecdhvs FIPS_ECDSAVS= fips_ecdsavs FIPS_TEST_SUITE=fips_test_suite FIPS_CMACTEST= fips_cmactest +FIPS_ALGVS= fips_algvs TESTS= alltests @@ -119,7 +121,7 @@ OBJ= $(BNTEST).o $(ECTEST).o $(ECDSATEST).o $(ECDHTEST).o $(IDEATEST).o \ $(FIPS_RSASTEST).o $(FIPS_RSAGTEST).o $(FIPS_GCMTEST).o \ $(FIPS_DSSVS).o $(FIPS_DSATEST).o $(FIPS_RNGVS).o $(FIPS_DRBGVS).o \ $(FIPS_TEST_SUITE).o $(FIPS_DHVS).o $(FIPS_ECDSAVS).o \ - $(FIPS_ECDHVS).o $(FIPS_CMACTEST).o \ + $(FIPS_ECDHVS).o $(FIPS_CMACTEST).o $(FIPS_ALGVS).o \ $(EVPTEST).o $(IGETEST).o $(JPAKETEST).o SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ $(MD2TEST).c $(MD4TEST).c $(MD5TEST).c \ @@ -133,7 +135,7 @@ SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ $(FIPS_RSASTEST).c $(FIPS_RSAGTEST).c $(FIPS_GCMTEST).c \ $(FIPS_DSSVS).c $(FIPS_DSATEST).c $(FIPS_RNGVS).c $(FIPS_DRBGVS).c \ $(FIPS_TEST_SUITE).c $(FIPS_DHVS).c $(FIPS_ECDSAVS).c \ - $(FIPS_ECDHVS).c $(FIPS_CMACTEST).c \ + $(FIPS_ECDHVS).c $(FIPS_CMACTEST).c $(FIPS_ALGVS).c \ $(EVPTEST).c $(IGETEST).c $(JPAKETEST).c EXHEADER= @@ -150,6 +152,8 @@ exe: $(EXE) $(FIPSEXE) dummytest$(EXE_EXT) fipsexe: $(FIPSEXE) +fipsalgvs: $(FIPS_ALGVS) + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO @@ -504,6 +508,9 @@ $(FIPS_TEST_SUITE)$(EXE_EXT): $(FIPS_TEST_SUITE).o $(DLIBCRYPTO) $(FIPS_CMACTEST)$(EXE_EXT): $(FIPS_CMACTEST).o $(DLIBCRYPTO) @target=$(FIPS_CMACTEST); $(FIPS_BUILD_CMD) +$(FIPS_ALGVS)$(EXE_EXT): $(FIPS_ALGVS).o $(DLIBCRYPTO) + 
@target=$(FIPS_ALGVS); $(FIPS_BUILD_CMD) + $(RMDTEST)$(EXE_EXT): $(RMDTEST).o $(DLIBCRYPTO) @target=$(RMDTEST); $(BUILD_CMD) diff --git a/test/fips_algvs.c b/test/fips_algvs.c new file mode 100644 index 0000000000..2bfd213a0e --- /dev/null +++ b/test/fips_algvs.c @@ -0,0 +1,428 @@ +/* test/fips_algvs.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2011 + */ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
+ * + */ + +#include +#include +#include + +#ifndef OPENSSL_FIPS +#include + +int main(int argc, char **argv) +{ + printf("No FIPS ALGVS support\n"); + return 0; +} +#else + +#if defined(__vxworks) + +#include +#include + +int fips_algvs_main(int argc, char **argv); +#define main fips_algvs_main + +static int fips_algvs_argv(char *a0) +{ + char *argv[32] = { "fips_algvs" }; + int argc = 1; + int main_ret; + + if (a0) { + char *scan = a0, *arg = a0; + + while (*scan) { + if (*scan++ == ' ') { + scan[-1] = '\0'; + argv[argc++] = arg; + if (argc == (sizeof(argv)/sizeof(argv[0])-1)) + break; + + while (*scan == ' ') scan++; + arg = scan; + } + } + if (*scan == '\0') argv[argc++] = arg; + } + + argv[argc] = NULL; + + main_ret = fips_algvs_main(argc, argv); + + if (a0) free(a0); + + return main_ret; +} + +int fips_algvs(int a0) +{ + return taskSpawn("fips_algvs", 100, (VX_FP_TASK | VX_SPE_TASK), 100000, + (FUNCPTR)fips_algvs_argv, + a0 ? strdup(a0) : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static FILE *fips_fopen(const char *path, const char *mode) +{ + char fips_path [256]; + + if (path[0] != '/' && strlen(path) < (sizeof(fips_path)-8)) { + strcpy(fips_path,"/fips0/"); + strcat(fips_path,path); + return fopen(fips_path,mode); + } + return fopen(path,mode); +} +#define fopen fips_fopen +#endif + +#define FIPS_ALGVS + +extern int fips_aesavs_main(int argc, char **argv); +extern int fips_cmactest_main(int argc, char **argv); +extern int fips_desmovs_main(int argc, char **argv); +extern int fips_dhvs_main(int argc, char **argv); +extern int fips_drbgvs_main(int argc,char **argv); +extern int fips_dssvs_main(int argc, char **argv); +extern int fips_ecdhvs_main(int argc, char **argv); +extern int fips_ecdsavs_main(int argc, char **argv); +extern int fips_gcmtest_main(int argc, char **argv); +extern int fips_hmactest_main(int argc, char **argv); +extern int fips_rngvs_main(int argc, char **argv); +extern int fips_rsagtest_main(int argc, char **argv); +extern int 
fips_rsastest_main(int argc, char **argv); +extern int fips_rsavtest_main(int argc, char **argv); +extern int fips_shatest_main(int argc, char **argv); +extern int fips_test_suite_main(int argc, char **argv); + +#if !defined(_TMS320C6400_PLUS) && !defined(_TMS320C6400) +#include "fips_aesavs.c" +#include "fips_cmactest.c" +#include "fips_desmovs.c" +#include "fips_dhvs.c" +#include "fips_drbgvs.c" +#include "fips_dssvs.c" +#include "fips_ecdhvs.c" +#include "fips_ecdsavs.c" +#include "fips_gcmtest.c" +#include "fips_hmactest.c" +#include "fips_rngvs.c" +#include "fips_rsagtest.c" +#include "fips_rsastest.c" +#include "fips_rsavtest.c" +#include "fips_shatest.c" +#include "fips_test_suite.c" + +#else +#include "aes/fips_aesavs.c" +#include "cmac/fips_cmactest.c" +#include "des/fips_desmovs.c" +#include "dh/fips_dhvs.c" +#include "rand/fips_drbgvs.c" +#include "dsa/fips_dssvs.c" +#include "ecdh/fips_ecdhvs.c" +#include "ecdsa/fips_ecdsavs.c" +#include "aes/fips_gcmtest.c" +#include "hmac/fips_hmactest.c" +#include "rand/fips_rngvs.c" +#include "rsa/fips_rsagtest.c" +#include "rsa/fips_rsastest.c" +#include "rsa/fips_rsavtest.c" +#include "sha/fips_shatest.c" +#include "fips_test_suite.c" + +#pragma DATA_SECTION(aucCmBootDspLoad, "BootDspSection"); +volatile unsigned char aucCmBootDspLoad[8*1024]; +#endif + +typedef struct + { + const char *name; + int (*func)(int argc, char **argv); + } ALGVS_FUNCTION; + +static ALGVS_FUNCTION algvs[] = { + {"fips_aesavs", fips_aesavs_main}, + {"fips_cmactest", fips_cmactest_main}, + {"fips_desmovs", fips_desmovs_main}, + {"fips_dhvs", fips_dhvs_main}, + {"fips_drbgvs", fips_drbgvs_main}, + {"fips_dssvs", fips_dssvs_main}, + {"fips_ecdhvs", fips_ecdhvs_main}, + {"fips_ecdsavs", fips_ecdsavs_main}, + {"fips_gcmtest", fips_gcmtest_main}, + {"fips_hmactest", fips_hmactest_main}, + {"fips_rngvs", fips_rngvs_main}, + {"fips_rsagtest", fips_rsagtest_main}, + {"fips_rsastest", fips_rsastest_main}, + {"fips_rsavtest", fips_rsavtest_main}, + 
{"fips_shatest", fips_shatest_main}, + {"fips_test_suite", fips_test_suite_main}, + {NULL, 0} + }; + +/* Argument parsing taken from apps/apps.c */ + +typedef struct args_st + { + char **data; + int count; + } ARGS; + +static int chopup_args(ARGS *arg, char *buf, int *argc, char **argv[]) + { + int num,i; + char *p; + + *argc=0; + *argv=NULL; + + i=0; + if (arg->count == 0) + { + arg->count=20; + arg->data=(char **)OPENSSL_malloc(sizeof(char *)*arg->count); + } + for (i=0; icount; i++) + arg->data[i]=NULL; + + num=0; + p=buf; + for (;;) + { + /* first scan over white space */ + if (!*p) break; + while (*p && ((*p == ' ') || (*p == '\t') || (*p == '\n'))) + p++; + if (!*p) break; + + /* The start of something good :-) */ + if (num >= arg->count) + { + fprintf(stderr, "Too many arguments!!\n"); + return 0; + } + arg->data[num++]=p; + + /* now look for the end of this */ + if ((*p == '\'') || (*p == '\"')) /* scan for closing quote */ + { + i= *(p++); + arg->data[num-1]++; /* jump over quote */ + while (*p && (*p != i)) + p++; + *p='\0'; + } + else + { + while (*p && ((*p != ' ') && + (*p != '\t') && (*p != '\n'))) + p++; + + if (*p == '\0') + p--; + else + *p='\0'; + } + p++; + } + *argc=num; + *argv=arg->data; + return(1); + } + +static int run_prg(int argc, char **argv) + { + ALGVS_FUNCTION *t; + const char *prg_name; + prg_name = strrchr(argv[0], '/'); + if (prg_name) + prg_name++; + else + prg_name = argv[0]; + for (t = algvs; t->name; t++) + { + if (!strcmp(prg_name, t->name)) + return t->func(argc, argv); + } + return -100; + } + +int main(int argc, char **argv) + { + static char buf[1024]; + char **args = argv + 1; + const char *sname = "fipstests.sh"; + ARGS arg; + int xargc; + char **xargv; + int lineno = 0, badarg = 0; + int nerr = 0, quiet = 0, verbose = 0; + int rv; + FILE *in = NULL; +#ifdef FIPS_ALGVS_MEMCHECK + CRYPTO_malloc_debug_init(); + OPENSSL_init(); + CRYPTO_set_mem_debug_options(V_CRYPTO_MDEBUG_ALL); + CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON); 
+#endif + +#if defined(_TMS320C6400_PLUS) + SysInit(); +#endif + +#if (defined(__arm__) || defined(__aarch64__)) + if (*args && !strcmp(*args, "-noaccel")) + { + extern unsigned int OPENSSL_armcap_P; + + OPENSSL_armcap_P=0; + args++; + argc--; + } +#endif + if (*args && *args[0] != '-') + { + rv = run_prg(argc - 1, args); +#ifdef FIPS_ALGVS_MEMCHECK + CRYPTO_mem_leaks_fp(stderr); +#endif + return rv; + } + while (!badarg && *args && *args[0] == '-') + { + if (!strcmp(*args, "-script")) + { + if (args[1]) + { + args++; + sname = *args; + } + else + badarg = 1; + } + else if (!strcmp(*args, "-quiet")) + quiet = 1; + else if (!strcmp(*args, "-verbose")) + verbose = 1; + else + badarg = 1; + args++; + } + + if (badarg) + { + fprintf(stderr, "Error processing arguments\n"); + return 1; + } + + in = fopen(sname, "r"); + if (!in) + { + fprintf(stderr, "Error opening script file \"%s\"\n", sname); + return 1; + } + + arg.data = NULL; + arg.count = 0; + + while (fgets(buf, sizeof(buf), in)) + { + lineno++; + if (!chopup_args(&arg, buf, &xargc, &xargv)) + fprintf(stderr, "Error processing line %d\n", lineno); + else + { + if (!quiet) + { + int i; + int narg = verbose ? 
xargc : xargc - 2; + printf("Running command line:"); + for (i = 0; i < narg; i++) + printf(" %s", xargv[i]); + printf("\n"); + } + rv = run_prg(xargc, xargv); + if (FIPS_module_mode()) + FIPS_module_mode_set(0, NULL); + if (rv != 0) + nerr++; + if (rv == -100) + fprintf(stderr, "ERROR: Command not found\n"); + else if (rv != 0) + fprintf(stderr, "ERROR: returned %d\n", rv); + else if (verbose) + printf("\tCommand run successfully\n"); + } + } + + if (!quiet) + printf("Completed with %d errors\n", nerr); + + if (arg.data) + OPENSSL_free(arg.data); + + fclose(in); +#ifdef FIPS_ALGVS_MEMCHECK + CRYPTO_mem_leaks_fp(stderr); +#endif + if (nerr == 0) + return 0; + return 1; + } +#endif diff --git a/util/fips_standalone_sha1 b/util/fips_standalone_sha1 new file mode 100644 index 0000000000..ea2268cb4e --- /dev/null +++ b/util/fips_standalone_sha1 @@ -0,0 +1,32 @@ +#!/usr/bin/env perl +# +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +unshift(@INC,$dir); +require "hmac_sha1.pl"; + +(!@ARV[0] && -f @ARGV[$#ARGV]) || die "usage: $0 [-verify] file"; + +$verify=shift if (@ARGV[0] eq "-verify"); + +sysopen(FD,@ARGV[0],0) || die "$!"; +binmode(FD); + +my $ctx = HMAC->Init("etaonrishdlcupfm"); + +while (read(FD,$blob,4*1024)) { $ctx->Update($blob); } + +close(FD); + +my $signature = unpack("H*",$ctx->Final()); + +print "HMAC-SHA1(@ARGV[0])= $signature\n"; + +if ($verify) { + open(FD,"<@ARGV[0].sha1") || die "$!"; + $line = ; + close(FD); + exit(0) if ($line =~ /HMAC\-SHA1\([^\)]*\)=\s*([0-9a-f]+)/i && + $1 eq $signature); + die "signature mismatch"; +} diff --git a/util/fipsas.pl b/util/fipsas.pl index fc2a759308..9dfe0d895c 100644 --- a/util/fipsas.pl +++ b/util/fipsas.pl @@ -37,32 +37,31 @@ while () last if (/assembler/) } -# Store all renames. +# Store all renames [noting minimal length]. 
+my $minlen=0x10000; while () { - if (/^#define\s+(\w+)\s+(\w+)\b/) + if (/^#define\s+_?(\w+)\s+_?(\w+)\b/) { $edits{$1} = $2; + my $len = length($1); + $minlen = $len if ($len<$minlen); } } -my ($from, $to); +open(IN,"$target") || die "Can't open $target for reading"; -#rename target temporarily -rename($target, "tmptarg.s") || die "Can't rename $target"; +@code = ; # suck in whole file -#edit target -open(IN,"tmptarg.s") || die "Can't open temporary file"; -open(OUT, ">$target") || die "Can't open output file $target"; +close IN; -while () -{ - while (($from, $to) = each %edits) - { - s/(\b_*)$from(\b)/$1$to$2/g; - } - print OUT $_; -} +open(OUT,">$target") || die "Can't open $target for writing"; + +foreach $line (@code) + { + $line =~ s/\b(_?)(\w{$minlen,})\b/$1.($edits{$2} or $2)/geo; + print OUT $line; + } close OUT; @@ -73,18 +72,5 @@ if ($runasm) my $rv = $?; - # restore target - unlink $target; - rename "tmptarg.s", $target; - die "Error executing assembler!" if $rv != 0; } -else - { - # Don't care about target - unlink "tmptarg.s"; - } - - - - diff --git a/util/fipsdist.pl b/util/fipsdist.pl index b191fbe41e..53f9d3e18a 100644 --- a/util/fipsdist.pl +++ b/util/fipsdist.pl @@ -58,7 +58,7 @@ while () } else { - next unless (/^(fips\/|crypto|util|test|include|ms)/); + next unless (/^(fips\/|crypto|util|test|include|ms|c6x)/); } if (/^crypto\/([^\/]+)/) { @@ -76,7 +76,7 @@ while () } if (/^test\//) { - next unless /Makefile/ || /dummytest.c/; + next unless /Makefile/ || /dummytest.c/ || /fips_algvs.c/ ; } print "$_\n"; } diff --git a/util/fipslink.pl b/util/fipslink.pl index 8b6fbad7d8..0f87f7dbc9 100644 --- a/util/fipslink.pl +++ b/util/fipslink.pl @@ -33,14 +33,24 @@ check_hash($sha1_exe, "fipscanister.lib"); print "Integrity check OK\n"; -print "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c\n"; -system "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c"; -die "First stage Compile failure" if $? 
!= 0; +if (is_premain_linked(@ARGV)) { + print "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c\n"; + system "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c"; + die "First stage Compile failure" if $? != 0; +} elsif (!defined($ENV{FIPS_SIG})) { + die "no fips_premain.obj linked"; +} print "$fips_link @ARGV\n"; system "$fips_link @ARGV"; die "First stage Link failure" if $? != 0; +if (defined($ENV{FIPS_SIG})) { + print "$ENV{FIPS_SIG} $fips_target\n"; + system "$ENV{FIPS_SIG} $fips_target"; + die "$ENV{FIPS_SIG} $fips_target failed" if $? != 0; + exit; +} print "$fips_premain_dso $fips_target\n"; system("$fips_premain_dso $fips_target >$fips_target.sha1"); @@ -57,11 +67,26 @@ print "$fips_cc -DHMAC_SHA1_SIG=\\\"$fips_hash\\\" $fips_cc_args $fips_libdir/fi system "$fips_cc -DHMAC_SHA1_SIG=\\\"$fips_hash\\\" $fips_cc_args $fips_libdir/fips_premain.c"; die "Second stage Compile failure" if $? != 0; - print "$fips_link @ARGV\n"; system "$fips_link @ARGV"; die "Second stage Link failure" if $? != 0; +sub is_premain_linked + { + return 1 if (grep /fips_premain\.obj/,@_); + foreach (@_) + { + if (/^@(.*)/ && -f $1) + { + open FD,$1 or die "can't open $1"; + my $ret = (grep /fips_premain\.obj/,)?1:0; + close FD; + return $ret; + } + } + return 0; + } + sub check_hash { my ($sha1_exe, $filename) = @_; diff --git a/util/hmac_sha1.pl b/util/hmac_sha1.pl new file mode 100755 index 0000000000..494f7e8569 --- /dev/null +++ b/util/hmac_sha1.pl @@ -0,0 +1,196 @@ +#!/usr/bin/env perl +# +# Copyright (c) 2011 The OpenSSL Project. +# +###################################################################### +# +# SHA1 and HMAC in Perl by . 
+# +{ package SHA1; + use integer; + + { + ################################### SHA1 block code generator + my @V = ('$A','$B','$C','$D','$E'); + my $i; + + sub XUpdate { + my $ret; + $ret="(\$T=\$W[($i-16)%16]^\$W[($i-14)%16]^\$W[($i-8)%16]^\$W[($i-3)%16],\n\t"; + if ((1<<31)<<1) { + $ret.=" \$W[$i%16]=((\$T<<1)|(\$T>>31))&0xffffffff)\n\t "; + } else { + $ret.=" \$W[$i%16]=(\$T<<1)|((\$T>>31)&1))\n\t "; + } + } + sub tail { + my ($a,$b,$c,$d,$e)=@V; + my $ret; + if ((1<<31)<<1) { + $ret.="(($a<<5)|($a>>27));\n\t"; + $ret.="$b=($b<<30)|($b>>2); $e&=0xffffffff; #$b&=0xffffffff;\n\t"; + } else { + $ret.="(($a<<5)|($a>>27)&0x1f);\n\t"; + $ret.="$b=($b<<30)|($b>>2)&0x3fffffff;\n\t"; + } + $ret; + } + sub BODY_00_15 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=\$W[$i]+0x5a827999+((($c^$d)&$b)^$d)+".tail(); + } + sub BODY_16_19 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0x5a827999+((($c^$d)&$b)^$d)+".tail(); + } + sub BODY_20_39 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0x6ed9eba1+($b^$c^$d)+".tail(); + } + sub BODY_40_59 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0x8f1bbcdc+(($b&$c)|(($b|$c)&$d))+".tail(); + } + sub BODY_60_79 { + my ($a,$b,$c,$d,$e)=@V; + "$e+=".XUpdate()."+0xca62c1d6+($b^$c^$d)+".tail(); + } + + my $sha1_impl = + 'sub block { + my $self = @_[0]; + my @W = unpack("N16",@_[1]); + my ($A,$B,$C,$D,$E,$T) = @{$self->{H}}; + '; + + $sha1_impl.=' + $A &= 0xffffffff; + $B &= 0xffffffff; + ' if ((1<<31)<<1); + + for($i=0;$i<16;$i++){ $sha1_impl.=BODY_00_15(); unshift(@V,pop(@V)); } + for(;$i<20;$i++) { $sha1_impl.=BODY_16_19(); unshift(@V,pop(@V)); } + for(;$i<40;$i++) { $sha1_impl.=BODY_20_39(); unshift(@V,pop(@V)); } + for(;$i<60;$i++) { $sha1_impl.=BODY_40_59(); unshift(@V,pop(@V)); } + for(;$i<80;$i++) { $sha1_impl.=BODY_60_79(); unshift(@V,pop(@V)); } + + $sha1_impl.=' + $self->{H}[0]+=$A; $self->{H}[1]+=$B; $self->{H}[2]+=$C; + $self->{H}[3]+=$D; $self->{H}[4]+=$E; }'; + + #print $sha1_impl,"\n"; + eval($sha1_impl); # generate code + } + + sub 
Init { + my $class = shift; # multiple instances... + my $self = {}; + + bless $self,$class; + $self->{H} = [0x67452301,0xefcdab89,0x98badcfe,0x10325476,0xc3d2e1f0]; + $self->{N} = 0; + return $self; + } + + sub Update { + my $self = shift; + my $msg; + + foreach $msg (@_) { + my $len = length($msg); + my $num = length($self->{buf}); + my $off = 0; + + $self->{N} += $len; + + if (($num+$len)<64) + { $self->{buf} .= $msg; next; } + elsif ($num) + { $self->{buf} .= substr($msg,0,($off=64-$num)); + $self->block($self->{buf}); + } + + while(($off+64) <= $len) + { $self->block(substr($msg,$off,64)); + $off += 64; + } + + $self->{buf} = substr($msg,$off); + } + return $self; + } + + sub Final { + my $self = shift; + my $num = length($self->{buf}); + + $self->{buf} .= chr(0x80); $num++; + if ($num>56) + { $self->{buf} .= chr(0)x(64-$num); + $self->block($self->{buf}); + $self->{buf}=undef; + $num=0; + } + $self->{buf} .= chr(0)x(56-$num); + $self->{buf} .= pack("N2",($self->{N}>>29)&0x7,$self->{N}<<3); + $self->block($self->{buf}); + + return pack("N*",@{$self->{H}}); + } + + sub Selftest { + my $hash; + + $hash=SHA1->Init()->Update('abc')->Final(); + die "SHA1 test#1" if (unpack("H*",$hash) ne 'a9993e364706816aba3e25717850c26c9cd0d89d'); + + $hash=SHA1->Init()->Update('abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq')->Final(); + die "SHA1 test#2" if (unpack("H*",$hash) ne '84983e441c3bd26ebaae4aa1f95129e5e54670f1'); + + #$hash=SHA1->Init()->Update('a'x1000000)->Final(); + #die "SHA1 test#3" if (unpack("H*",$hash) ne '34aa973cd4c4daa4f61eeb2bdbad27316534016f'); + } +} + +{ package HMAC; + + sub Init { + my $class = shift; + my $key = shift; + my $self = {}; + + bless $self,$class; + + if (length($key)>64) { + $key = SHA1->Init()->Update($key)->Final(); + } + $key .= chr(0x00)x(64-length($key)); + + my @ikey = map($_^=0x36,unpack("C*",$key)); + ($self->{hash} = SHA1->Init())->Update(pack("C*",@ikey)); + $self->{okey} = pack("C*",map($_^=0x36^0x5c,@ikey)); + + 
return $self; + } + + sub Update { + my $self = shift; + $self->{hash}->Update(@_); + return $self; + } + + sub Final { + my $self = shift; + my $ihash = $self->{hash}->Final(); + return SHA1->Init()->Update($self->{okey},$ihash)->Final(); + } + + sub Selftest { + my $hmac; + + $hmac = HMAC->Init('0123456789:;<=>?@ABC')->Update('Sample #2')->Final(); + die "HMAC test" if (unpack("H*",$hmac) ne '0922d3405faa3d194f82a45830737d5cc6c75d24'); + } +} + +1; diff --git a/util/incore b/util/incore index 883f63ff56..bb765b1966 100755 --- a/util/incore +++ b/util/incore @@ -34,6 +34,7 @@ @e_ident{magic,class,data,version,osabi,abiver,pad}= unpack("a4C*",$elf); + $!=42; # signal fipsld to revert to two-step link die "not ELF file" if ($e_ident{magic} ne chr(0177)."ELF"); my $elf_bits = $e_ident{class}*32; # 32 or 64 @@ -377,11 +378,11 @@ $FIPS_text_endX = $exe->Lookup("FIPS_text_endX"); if (!$legacy_mode) { if (!$FIPS_text_startX || !$FIPS_text_endX) { print STDERR "@ARGV[$#ARGV] is not cross-compiler aware.\n"; - exit(1); + exit(42); # signal fipsld to revert to two-step link } $FINGERPRINT_ascii_value - = $exe->Lookup("FINGERPRINT_ascii_value") or die; + = $exe->Lookup("FINGERPRINT_ascii_value"); } if ($FIPS_text_startX && $FIPS_text_endX) { @@ -438,9 +439,12 @@ $fingerprint = FIPS_incore_fingerprint(); if ($legacy_mode) { print unpack("H*",$fingerprint); -} else { +} elsif (defined($FINGERPRINT_ascii_value)) { seek(FD,$FINGERPRINT_ascii_value->{st_offset},0) or die "$!"; print FD unpack("H*",$fingerprint) or die "$!"; +} else { + seek(FD,$FIPS_signature->{st_offset},0) or die "$!"; + print FD $fingerprint or die "$!"; } close (FD); diff --git a/util/mk1mf.pl b/util/mk1mf.pl index af039c78ac..5c4c50ab35 100755 --- a/util/mk1mf.pl +++ b/util/mk1mf.pl @@ -23,6 +23,7 @@ local $fips_canister_path = ""; my $fips_premain_dso_exe_path = ""; my $fips_premain_c_path = ""; my $fips_sha1_exe_path = ""; +my $fips_sha1_exe_build = 1; local $fipscanisterbuild = 0; @@ -248,6 +249,10 @@ 
elsif (($platform eq "netware-clib") || ($platform eq "netware-libc") || $BSDSOCK=1 if ($platform eq "netware-libc-bsdsock") || ($platform eq "netware-clib-bsdsock"); require 'netware.pl'; } +elsif ($platform =~ /^c64x/) + { + require "TI_CGTOOLS.pl"; + } else { require "unix.pl"; @@ -500,8 +505,16 @@ if ($fips) { if ($fips_sha1_exe_path eq "") { - $fips_sha1_exe_path = - "\$(BIN_D)${o}fips_standalone_sha1$exep"; + $fips_sha1_exe_path = $ENV{"FIPS_SHA1_PATH"}; + if (defined $fips_sha1_exe_path) + { + $fips_sha1_exe_build = 0; + } + else + { + $fips_sha1_exe_path = + "\$(BIN_D)${o}fips_standalone_sha1$exep"; + } } } else @@ -545,7 +558,7 @@ if ($fips) if ($fipscanisteronly) { - $build_targets = "\$(O_FIPSCANISTER) \$(T_EXE)"; + $build_targets = "\$(O_FIPSCANISTER)"; $libs_dep = ""; } @@ -567,9 +580,14 @@ if ($fipscanisteronly) \$(CP) \"fips${o}fips_premain.c.sha1\" \"\$(INSTALLTOP)${o}lib\" \$(CP) \"\$(INCO_D)${o}fips.h\" \"\$(INSTALLTOP)${o}include${o}openssl\" \$(CP) \"\$(INCO_D)${o}fips_rand.h\" \"\$(INSTALLTOP)${o}include${o}openssl\" - \$(CP) "\$(BIN_D)${o}fips_standalone_sha1$exep" \"\$(INSTALLTOP)${o}bin\" \$(CP) \"util${o}fipslink.pl\" \"\$(INSTALLTOP)${o}bin\" EOF + if ($fips_sha1_exe_build) + { + $extra_install .= <<"EOF"; + \$(CP) "\$(BIN_D)${o}fips_standalone_sha1$exep" \"\$(INSTALLTOP)${o}bin\" +EOF + } } elsif ($shlib) { @@ -716,7 +734,7 @@ LIBS_DEP=$libs_dep EOF $rules=<<"EOF"; -all: banner \$(TMP_D) \$(BIN_D) \$(TEST_D) \$(LIB_D) \$(INCO_D) headers \$(FIPS_SHA1_EXE) $build_targets +all: banner \$(TMP_D) \$(BIN_D) \$(TEST_D) \$(LIB_D) \$(INCO_D) headers $build_targets banner: $banner @@ -744,7 +762,11 @@ headers: \$(HEADER) \$(EXHEADER) lib: \$(LIBS_DEP) \$(E_SHLIB) -exe: \$(T_EXE) \$(BIN_D)$o\$(E_EXE)$exep +exe: \$(BIN_D)$o\$(E_EXE)$exep + +build_tests: \$(T_EXE) + +build_algvs: \$(T_SRC) \$(BIN_D)${o}fips_algvs$exep install: all \$(MKDIR) \"\$(INSTALLTOP)\" @@ -842,10 +864,13 @@ if ($fips) } 
$rules.=&cc_compile_target("\$(OBJ_D)${o}fips_standalone_sha1$obj", "fips${o}sha${o}fips_standalone_sha1.c", - "\$(SHLIB_CFLAGS)"); + "\$(APP_CFLAGS)"); $rules.=&cc_compile_target("\$(OBJ_D)${o}\$(E_PREMAIN_DSO)$obj", "fips${o}fips_premain.c", - "-DFINGERPRINT_PREMAIN_DSO_LOAD \$(SHLIB_CFLAGS)"); + "-DFINGERPRINT_PREMAIN_DSO_LOAD \$(APP_CFLAGS)"); + $rules.=&cc_compile_target("\$(OBJ_D)${o}fips_algvs$obj", + "test${o}fips_algvs.c", + "\$(APP_CFLAGS)"); } foreach (values %lib_nam) @@ -878,6 +903,7 @@ EOF } $defs.=&do_defs("T_EXE",$test,"\$(TEST_D)",$exep); +$defs.=&do_defs("T_SRC",$test,"\$(TMP_D)",".c"); foreach (split(/\s+/,$test)) { my $t_libs; @@ -899,8 +925,11 @@ foreach (split(/\s+/,$test)) $tt="\$(OBJ_D)${o}$t${obj}"; $rules.=&do_link_rule("\$(TEST_D)$o$t$exep",$tt,"\$(LIBS_DEP)","$t_libs \$(EX_LIBS)", $ltype); + $rules.=&do_copy_rule("\$(TMP_D)",$_,".c"); } + $rules.=&do_link_rule("\$(TEST_D)${o}fips_algvs$exep","\$(OBJ_D)${o}fips_algvs$obj","\$(LIBS_DEP)","\$(O_FIPSCANISTER) \$(EX_LIBS)", 2) if $fips; + $defs.=&do_defs("E_SHLIB",$engines . 
$otherlibs,"\$(ENG_D)",$shlibp); foreach (split(/\s+/,$engines)) @@ -955,20 +984,20 @@ if ($fips) "\$(OBJ_D)${o}fips_start$obj", "\$(FIPSOBJ)", "\$(OBJ_D)${o}fips_end$obj", - "\$(FIPS_SHA1_EXE)", ""); + ""); # FIXME $rules.=&do_link_rule("\$(FIPS_SHA1_EXE)", "\$(OBJ_D)${o}fips_standalone_sha1$obj \$(OBJ_D)${o}sha1dgst$obj $sha1_asm_obj", - "","\$(EX_LIBS)", 1); + "","\$(EX_LIBS)", 1) if $fips_sha1_exe_build; } else { $rules.=&do_link_rule("\$(FIPS_SHA1_EXE)", "\$(OBJ_D)${o}fips_standalone_sha1$obj \$(O_FIPSCANISTER)", - "","", 1); + "","", 1) if $fips_sha1_exe_build; } - $rules.=&do_link_rule("\$(PREMAIN_DSO_EXE)","\$(OBJ_D)${o}\$(E_PREMAIN_DSO)$obj \$(CRYPTOOBJ) \$(O_FIPSCANISTER)","","\$(EX_LIBS)", 1); + $rules.=&do_link_rule("\$(PREMAIN_DSO_EXE)","\$(OBJ_D)${o}\$(E_PREMAIN_DSO)$obj \$(CRYPTOOBJ) \$(O_FIPSCANISTER)","","\$(EX_LIBS)", 1) unless defined $ENV{"FIPS_SIG"}; } @@ -1192,6 +1221,10 @@ sub do_compile_rule { $ret.=&Sasm_compile_target("$to${o}$n$obj",$s,$n); } + elsif (-f ($s="${d}${o}asm${o}${n}.asm")) + { + $ret.=&cc_compile_target("$to${o}$n$obj","$s",$ex); + } else { die "no rule for $_"; } } return($ret); diff --git a/util/mklink.pl b/util/mklink.pl index 61db12c68f..72a562ecaf 100755 --- a/util/mklink.pl +++ b/util/mklink.pl @@ -52,6 +52,7 @@ my $to = join('/', @to_path); my $file; $symlink_exists=eval {symlink("",""); 1}; if ($^O eq "msys") { $symlink_exists=0 }; +if ($^O eq "MSWin32") { $symlink_exists=0 }; foreach $file (@files) { my $err = ""; if ($symlink_exists) { diff --git a/util/msincore b/util/msincore new file mode 100755 index 0000000000..08f81be8d5 --- /dev/null +++ b/util/msincore @@ -0,0 +1,169 @@ +#!/usr/bin/env perl +# +# Copyright (c) 2012 The OpenSSL Project. +# +# The script embeds fingerprint into Microsoft PE-COFF executable object. 
+ +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +unshift(@INC,$dir); +require "hmac_sha1.pl"; + +###################################################################### +# +# PE-COFF segment table parser by . +# +{ package PECOFF; + use FileHandle; + + sub dup { my %copy=map {$_} @_; return \%copy; } + + sub Load { + my $class = shift; + my $self = {}; + my $FD = FileHandle->new(); # autoclose + my $file = shift; + + bless $self,$class; + + sysopen($FD,$file,0) or die "$!"; + binmode($FD); + + ################################################# + # read IMAGE_DOS_HEADER + # + read($FD,my $mz,64) or die "$!"; + my @dos_header=unpack("a2C58V",$mz); + + $!=42; # signal fipsld to revert to two-step link + die "$file is not PE-COFF image" if (@dos_header[0] ne "MZ"); + + my $e_lfanew=pop(@dos_header); + seek($FD,$e_lfanew,0) or die "$!"; + read($FD,my $magic,4) or die "$!"; + + $!=42; # signal fipsld to revert to two-step link + die "$file is not PE-COFF image" if (unpack("V",$magic)!=0x4550); + + ################################################# + # read and parse COFF header... 
+ # + read($FD,my $coff,20) or die "$!"; + + my %coff_header; + @coff_header{machine,nsects,date,syms_off,nsyms,opt,flags}= + unpack("v2V3v2",$coff); + + my $strings; + my $symsize; + + ################################################# + # load strings table + # + if ($coff_header{syms_off}) { + seek($FD,$coff_header{syms_off}+18*$coff_header{nsyms},0) or die "$!"; + read($FD,$strings,4) or die "$!"; + $symsize = unpack("V",$strings); + read($FD,$strings,$symsize,4) or die "$!"; + } + + ################################################# + # read sections + # + my $i; + + # seek to section headers + seek($FD,$e_lfanew+24+@coff_header{opt},0) or die "$!"; + + for ($i=0;$i<$coff_header{nsects};$i++) { + my %coff_shdr; + my $name; + + read($FD,my $section,40) or die "$!"; + + @coff_shdr{sh_name,sh_vsize,sh_vaddr, + sh_rawsize,sh_offset,sh_relocs,sh_lines, + sh_nrelocls,sh_nlines,sh_flags} = + unpack("a8V6v2V",$section); + + $name = $coff_shdr{sh_name}; + # see if sh_name is an offset in $strings + my ($hi,$lo) = unpack("V2",$name); + if ($hi==0 && $lo<$symsize) { + $name = substr($strings,$lo,64); + } + $name = (split(chr(0),$name))[0]; + $coff_shdr{sh_name} = $name; + + $self->{sections}{$name} = dup(%coff_shdr); + } + + return $self; + } + + sub Lookup { + my $self = shift; + my $name = shift; + return $self->{sections}{$name}; + } +} + +###################################################################### +# +# main() +# +my $legacy_mode; + +if ($#ARGV<0 || ($#ARGV>0 && !($legacy_mode=(@ARGV[0] =~ /^\-(dso|exe)$/)))) { + print STDERR "usage: $0 [-dso|-exe] pe-coff-binary\n"; + exit(1); +} + +$exe = PECOFF->Load(@ARGV[$#ARGV]); + +sysopen(FD,@ARGV[$#ARGV],$legacy_mode?0:2) or die "$!"; # 2 is read/write +binmode(FD); + +sub FIPS_incore_fingerprint { + my $ctx = HMAC->Init("etaonrishdlcupfm"); + my ($beg,$end); + my $sect; + + $sect = $exe->Lookup("fipstx") or die "no fipstx section"; + + seek(FD,$sect->{sh_offset},0) or die "$!"; + read(FD,$blob,$sect->{sh_vsize}) or 
die "$!"; + + ($beg = index($blob,"SPIFxet_ts_tXtra")) >= 0 + or die "no FIPS_text_startX"; + ($end = rindex($blob,"SPIFxet_ne_t][Xd")) >= 0 + or die "no FIPS_text_endX"; + + $ctx->Update(substr($blob,$beg,$end-$beg)); + + $sect = $exe->Lookup("fipsro") or die "no fipsro section"; + + seek(FD,$sect->{sh_offset},0) or die "$!"; + read(FD,$blob,$sect->{sh_vsize}) or die "$!"; + + ($beg = index($blob,"SPIFdor__atarats",40)) >= 0 + or die "no FIPS_rodata_start"; + ($end = rindex($blob,"SPIFdor__ata[dne")) >= 0 + or die "no FIPS_rodata_end"; + + $ctx->Update(substr($blob,$beg,$end-$beg)); + + return $ctx->Final(); +} + +$fingerprint = FIPS_incore_fingerprint(); + +if ($legacy_mode) { + print unpack("H*",$fingerprint); +} else { + my $sect = $exe->Lookup("fipsro"); + seek(FD,$sect->{sh_offset},0) or die "$!"; + print FD unpack("H*",$fingerprint) or die "$!"; +} + +close (FD); diff --git a/util/pl/TI_CGTOOLS.pl b/util/pl/TI_CGTOOLS.pl new file mode 100644 index 0000000000..d12d318062 --- /dev/null +++ b/util/pl/TI_CGTOOLS.pl @@ -0,0 +1,274 @@ +#!/usr/local/bin/perl +# +# TI_CGTOOLS.pl, Texas Instruments CGTOOLS under Unix or MSYS. +# + +$ssl= "ssl"; +$crypto="crypto"; + +if ($fips && !$shlib) + { + $crypto="fips"; + $crypto_compat = "cryptocompat.lib"; + } +else + { + $crypto="crypto"; + } + +if ($fipscanisterbuild) + { + $fips_canister_path = "\$(LIB_D)/fipscanister.obj"; + } + +$o='/'; +$cp='cp'; +$cp2='$(PERL) util/copy.pl -stripcr'; +$mkdir='$(PERL) util/mkdir-p.pl'; +$rm='rm -f'; + +$zlib_lib="zlib1.lib"; + +# Santize -L options for ms link +$l_flags =~ s/-L("\[^"]+")/\/libpath:$1/g; +$l_flags =~ s/-L(\S+)/\/libpath:$1/g; + +# C compiler stuff +$cc='cl6x'; +$base_cflags= " $mf_cflag"; +my $f; +$opt_cflags=''; +$dbg_cflags=$f.' 
-g -DDEBUG -D_DEBUG'; +$lflags=''; + +*::cc_compile_target = sub { + my ($target,$source,$ex_flags)=@_; + my $ret; + + $ex_flags.=" -DMK1MF_BUILD" if ($source =~/cversion/); + $ret ="$target: \$(SRC_D)$o$source\n\t"; + if ($fipscanisterbuild && $source=~/\.asm$/) { + $ret.="\$(PERL) util${o}fipsas.pl . \$< norunasm \$(CFLAG)\n\t"; + } + $ret.="\$(CC) --obj_directory=\$(OBJ_D) $ex_flags -c \$(SRC_D)$o$source\n"; + $target =~ s/.*${o}([^${o}]+)/$1/; + $source =~ s/.*${o}([^${o}\.]+)\..*/$1${obj}/; + $ret.="\tmv \$(OBJ_D)${o}$source \$(OBJ_D)${o}$target\n" if ($target ne $source); + $ret.="\n"; + return($ret); +}; +*::perlasm_compile_target = sub { + my ($target,$source,$bname)=@_; + my $ret; + + $bname =~ s/(.*)\.[^\.]$/$1/; + $ret=<<___; +\$(TMP_D)$o$bname.asm: $source + \$(PERL) $source \$\@ +___ + $ret .= "\t\$(PERL) util${o}fipsas.pl . \$@ norunasm \$(CFLAG)\n" if $fipscanisterbuild; + + $ret.=<<___; + +$target: \$(TMP_D)$o$bname.asm + \$(ASM) --obj_directory=\$(OBJ_D) \$(TMP_D)$o$bname.asm + +___ +}; + +$mlflags=''; + +$out_def ="c6x"; +$tmp_def ="$out_def/tmp"; +$inc_def="$out_def/inc"; + +if ($debug) + { + $cflags=$dbg_cflags.$base_cflags; + } +else + { + $cflags=$opt_cflags.$base_cflags; + } + +$obj='.obj'; +$asm_suffix='.asm'; +$ofile=""; + +# EXE linking stuff +$link='$(CC) -z'; +$efile="-o "; +$exep='.out'; +$ex_libs=''; + +# static library stuff +$mklib='ar6x'; +$ranlib=''; +$plib=""; +$libp=".lib"; +$shlibp=($shlib)?".dll":".lib"; +$lfile='-o '; + +$shlib_ex_obj=""; +$asm='$(CC) $(CFLAG) -c'; + +$bn_asm_obj=''; +$bn_asm_src=''; +$des_enc_obj=''; +$des_enc_src=''; +$bf_enc_obj=''; +$bf_enc_src=''; + +if (!$no_asm) + { + import_asm($mf_bn_asm, "bn", \$bn_asm_obj, \$bn_asm_src); + import_asm($mf_aes_asm, "aes", \$aes_asm_obj, \$aes_asm_src); + import_asm($mf_des_asm, "des", \$des_enc_obj, \$des_enc_src); + import_asm($mf_bf_asm, "bf", \$bf_enc_obj, \$bf_enc_src); + import_asm($mf_cast_asm, "cast", \$cast_enc_obj, \$cast_enc_src); + import_asm($mf_rc4_asm, 
"rc4", \$rc4_enc_obj, \$rc4_enc_src); + import_asm($mf_rc5_asm, "rc5", \$rc5_enc_obj, \$rc5_enc_src); + import_asm($mf_md5_asm, "md5", \$md5_asm_obj, \$md5_asm_src); + import_asm($mf_sha_asm, "sha", \$sha1_asm_obj, \$sha1_asm_src); + import_asm($mf_rmd_asm, "ripemd", \$rmd160_asm_obj, \$rmd160_asm_src); + import_asm($mf_wp_asm, "whrlpool", \$whirlpool_asm_obj, \$whirlpool_asm_src); + import_asm($mf_modes_asm, "modes", \$modes_asm_obj, \$modes_asm_src); + import_asm($mf_cpuid_asm, "", \$cpuid_asm_obj, \$cpuid_asm_src); + $perl_asm = 1; + } + +sub do_lib_rule + { + my($objs,$target,$name,$shlib,$ign,$base_addr) = @_; + local($ret); + + $taget =~ s/\//$o/g if $o ne '/'; + my $base_arg; + if ($base_addr ne "") + { + $base_arg= " /base:$base_addr"; + } + else + { + $base_arg = ""; + } + if ($name ne "") + { + $name =~ tr/a-z/A-Z/; + $name = "/def:ms/${name}.def"; + } + +# $target="\$(LIB_D)$o$target"; +# $ret.="$target: $objs\n"; + if (!$shlib) + { +# $ret.="\t\$(RM) \$(O_$Name)\n"; + $ret.="$target: $objs\n"; + $ret.="\t\$(MKLIB) $lfile$target $objs\n"; + } + else + { + local($ex)=($target =~ /O_CRYPTO/)?'':' $(L_CRYPTO)'; + $ex.=" $zlib_lib" if $zlib_opt == 1 && $target =~ /O_CRYPTO/; + + if ($fips && $target =~ /O_CRYPTO/) + { + $ret.="$target: $objs \$(PREMAIN_DSO_EXE)"; + $ret.="\n\tFIPS_LINK=\"\$(LINK)\" \\\n"; + $ret.="\tFIPS_CC=\$(CC)\\\n"; + $ret.="\tFIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\\\n"; + $ret.="\tPREMAIN_DSO_EXE=\$(PREMAIN_DSO_EXE)\\\n"; + $ret.="\tFIPS_SHA1_EXE=\$(FIPS_SHA1_EXE)\\\n"; + $ret.="\tFIPS_TARGET=$target\\\n"; + $ret.="\tFIPSLIB_D=\$(FIPSLIB_D)\\\n"; + $ret.="\t\$(FIPSLINK) \$(MLFLAGS) /map $base_arg $efile$target "; + $ret.="$name \$(SHLIB_EX_OBJ) $objs \$(EX_LIBS) "; + $ret.="\$(OBJ_D)${o}fips_premain.obj $ex\n"; + } + else + { + $ret.="$target: $objs"; + $ret.="\n\t\$(LINK) \$(MLFLAGS) $efile$target $name \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n"; + } + + $ret.="\tIF EXIST \$@.manifest mt -nologo -manifest 
\$@.manifest -outputresource:\$@;2\n\n"; + } + $ret.="\n"; + return($ret); + } + +sub do_link_rule + { + my($target,$files,$dep_libs,$libs,$standalone)=@_; + local($ret,$_); + $file =~ s/\//$o/g if $o ne '/'; + $n=&bname($targer); + $ret.="$target: $files $dep_libs\n"; + if ($standalone == 1) + { + $ret.=" \$(LINK) \$(LFLAGS) $efile$target "; + $ret.= "\$(EX_LIBS) " if ($files =~ /O_FIPSCANISTER/ && !$fipscanisterbuild); + $ret.="$files $libs\n"; + } + elsif ($standalone == 2) + { + $ret.="\t\$(LINK) \$(LFLAGS) $efile$target $files \$(O_FIPSCANISTER) $out_def/application.cmd\n"; + $ret.="\t$out_def/incore6x $target\n\n"; + } + else + { + $ret.="\t\$(LINK) \$(LFLAGS) $efile$target "; + $ret.="\t\$(APP_EX_OBJ) $files $libs\n"; + } + return($ret); + } + +sub do_rlink_rule + { + local($target,$rl_start, $rl_mid, $rl_end,$dep_libs,$libs)=@_; + local($ret,$_); + my $files = "$rl_start $rl_mid $rl_end"; + + $file =~ s/\//$o/g if $o ne '/'; + $n=&bname($target); + $ret.="$target: $files $dep_libs\n"; + $ret.="\t\$(LINK) -r $lfile$target $files $out_def/fipscanister.cmd\n"; + $ret.="\t\$(PERL) $out_def${o}fips_standalone_sha1 $target > ${target}.sha1\n"; + $ret.="\t\$(PERL) util${o}copy.pl -stripcr fips${o}fips_premain.c \$(LIB_D)${o}fips_premain.c\n"; + $ret.="\t\$(CP) fips${o}fips_premain.c.sha1 \$(LIB_D)${o}fips_premain.c.sha1\n"; + $ret.="\n"; + return($ret); + } + +sub import_asm + { + my ($mf_var, $asm_name, $oref, $sref) = @_; + my $asm_dir; + if ($asm_name eq "") + { + $asm_dir = "crypto$o"; + } + else + { + $asm_dir = "crypto$o$asm_name$oasm$o"; + } + + $$oref = ""; + $$sref = ""; + $mf_var =~ s/\.o//g; + + foreach (split(/ /, $mf_var)) + { + $$sref .= $asm_dir . $_ . ".asm "; + } + foreach (split(/ /, $mf_var)) + { + $$oref .= "\$(TMP_D)\\" . $_ . 
".obj "; + } + $$oref =~ s/ $//; + $$sref =~ s/ $//; + + } + + +1; diff --git a/util/pl/VC-32.pl b/util/pl/VC-32.pl index e98eb1e1b9..db9113fcbb 100644 --- a/util/pl/VC-32.pl +++ b/util/pl/VC-32.pl @@ -49,8 +49,7 @@ if ($FLAVOR =~ /WIN64/) # considered safe to ignore. # $base_cflags= " $mf_cflag"; - my $f = $shlib?' /MD':' /MT'; - $lib_cflag='/Zl' if (!$shlib); # remove /DEFAULTLIBs from static lib + my $f = ($shlib and !$fipscanisterbuild)?' /MD':' /MT'; $opt_cflags=$f.' /Ox'; $dbg_cflags=$f.'d /Od -DDEBUG -D_DEBUG'; $lflags="/nologo /subsystem:console /opt:ref"; @@ -123,23 +122,28 @@ elsif ($FLAVOR =~ /CE/) } $cc='$(CC)'; - $base_cflags=' /W3 /WX /GF /Gy /nologo -DUNICODE -D_UNICODE -DOPENSSL_SYSNAME_WINCE -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DDSO_WIN32 -DNO_CHMOD -DOPENSSL_SMALL_FOOTPRINT'; + $base_cflags=' /W3 /GF /Gy /nologo -DUNICODE -D_UNICODE -DOPENSSL_SYSNAME_WINCE -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DDSO_WIN32 -DNO_CHMOD -DOPENSSL_SMALL_FOOTPRINT'; $base_cflags.=" $wcecdefs"; $base_cflags.=' -I$(WCECOMPAT)/include' if (defined($ENV{'WCECOMPAT'})); $base_cflags.=' -I$(PORTSDK_LIBPATH)/../../include' if (defined($ENV{'PORTSDK_LIBPATH'})); - $opt_cflags=' /MC /O1i'; # optimize for space, but with intrinsics... - $dbg_clfags=' /MC /Od -DDEBUG -D_DEBUG'; + if (`cl 2>&1` =~ /Version 1[4-9]\./) { + $base_cflags.=($shlib and !$fipscanisterbuild)?' /MD':' /MT'; + } else { + $base_cflags.=' /MC'; + } + $opt_cflags=' /O1i'; # optimize for space, but with intrinsics... + $dbg_cflags=' /Od -DDEBUG -D_DEBUG'; $lflags="/nologo /opt:ref $wcelflag"; } else # Win32 { $base_cflags= " $mf_cflag"; - my $f = $shlib?' /MD':' /MT'; - $lib_cflag='/Zl' if (!$shlib); # remove /DEFAULTLIBs from static lib + my $f = ($shlib and !$fipscanisterbuild)?' /MD':' /MT'; $opt_cflags=$f.' 
/Ox /O2 /Ob2'; $dbg_cflags=$f.'d /Od -DDEBUG -D_DEBUG'; $lflags="/nologo /subsystem:console /opt:ref"; } +$lib_cflag='/Zl' if (!$shlib or $fipscanisterbuild); # remove /DEFAULTLIBs $mlflags=''; $out_def ="out32"; $out_def.="dll" if ($shlib); @@ -174,12 +178,12 @@ $rsc="rc"; $efile="/out:"; $exep='.exe'; if ($no_sock) { $ex_libs=''; } -elsif ($FLAVOR =~ /CE/) { $ex_libs='winsock.lib'; } +elsif ($FLAVOR =~ /CE/) { $ex_libs='ws2.lib'; } else { $ex_libs='ws2_32.lib'; } if ($FLAVOR =~ /CE/) { - $ex_libs.=' $(WCECOMPAT)/lib/wcecompatex.lib' if (defined($ENV{'WCECOMPAT'})); + $ex_libs.=' $(WCECOMPAT)/lib/wcecompatex.lib crypt32.lib coredll.lib corelibc.lib' if (defined($ENV{'WCECOMPAT'})); $ex_libs.=' $(PORTSDK_LIBPATH)/portlib.lib' if (defined($ENV{'PORTSDK_LIBPATH'})); $ex_libs.=' /nodefaultlib:oldnames.lib coredll.lib corelibc.lib' if ($ENV{'TARGETCPU'} eq "X86"); } @@ -284,7 +288,8 @@ elsif ($shlib && $FLAVOR =~ /CE/) { $mlflags.=" $lflags /dll"; $lflags.=' /entry:mainCRTstartup' if(defined($ENV{'PORTSDK_LIBPATH'})); - $lib_cflag.=" -D_WINDLL -D_DLL"; + $lib_cflag.=" -D_WINDLL"; + $lib_cflag.=" -D_DLL" if (!$fipscanisterbuild); } sub do_lib_rule @@ -389,8 +394,9 @@ sub do_rlink_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($targer); - $ret.="$target: $files $dep_libs \$(FIPS_SHA1_EXE)\n"; - $ret.="\t\$(PERL) ms\\segrenam.pl \$\$a $rl_start\n"; + $ret.="$target: $files $dep_libs"; + $ret.=" \$(FIPS_SHA1_EXE)" unless defined $ENV{"FIPS_SHA1_PATH"}; + $ret.="\n\t\$(PERL) ms\\segrenam.pl \$\$a $rl_start\n"; $ret.="\t\$(PERL) ms\\segrenam.pl \$\$b $rl_mid\n"; $ret.="\t\$(PERL) ms\\segrenam.pl \$\$c $rl_end\n"; $ret.="\t\$(MKLIB) $lfile$target @<<\n\t$files\n<<\n"; diff --git a/util/point.sh b/util/point.sh index da39899cb1..22daf0e8c5 100755 --- a/util/point.sh +++ b/util/point.sh @@ -1,7 +1,7 @@ #!/bin/sh rm -f "$2" -if test "$OSTYPE" = msdosdjgpp || test "x$PLATFORM" = xmingw ; then +if test "$OSTYPE" = msdosdjgpp || test "x$PLATFORM" = xmingw || test "x$OS" = 
xWindows_NT ; then cp "$1" "$2" else ln -s "$1" "$2"