diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..ad8728ec2a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,70 @@
+# Object files
+*.o
+
+# Top level excludes
+/Makefile.bak
+/Makefile
+/*.a
+/include
+/*.pc
+/rehash.time
+
+# Most *.c files under test/ are symlinks
+/test/*.c
+# Apart from these
+!/test/asn1test.c
+!/test/methtest.c
+!/test/dummytest.c
+!/test/igetest.c
+!/test/r160test.c
+!/test/fips_algvs.c
+
+# Certificate symbolic links
+*.0
+
+# Links under apps
+/apps/CA.pl
+/apps/md4.c
+
+
+# Auto generated headers
+/crypto/buildinf.h
+/crypto/opensslconf.h
+
+# Auto generated assembly language source files
+*.s
+!/crypto/bn/asm/pa-risc2.s
+!/crypto/bn/asm/pa-risc2W.s
+
+# Executables
+/apps/openssl
+/test/sha256t
+/test/sha512t
+/test/*test
+/test/fips_aesavs
+/test/fips_desmovs
+/test/fips_dhvs
+/test/fips_drbgvs
+/test/fips_dssvs
+/test/fips_ecdhvs
+/test/fips_ecdsavs
+/test/fips_rngvs
+/test/fips_test_suite
+*.so*
+*.dylib*
+*.dll*
+# Exceptions
+!/test/bctest
+!/crypto/des/times/486-50.sol
+
+# Misc auto generated files
+/tools/c_rehash
+/test/evptests.txt
+lib
+Makefile.save
+*.bak
+# FIPS module specific files.
+/fips/fips_auth.h
+/fips/fips_standalone_sha1
+/fips/fipscanister.o.sha1
+
diff --git a/CHANGES b/CHANGES
index 0d70e034da..8c8a09ce2b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,47 @@
 
  Changes between 1.0.1 and 1.1.0  [xx XXX xxxx]
 
+  *) Add perl scripts to calculate FIPS signatures for Windows
+     exectuables including WinCE. 
+     [Andy Polyakov]
+
+  *) Don't attempt to insert current time into AES/3DES tests, we should
+     be just copying input line across and this breaks some systems lacking
+     ctime. 
+     [Steve Henson]
+
+  *) Update Windows build system for FIPS. Don't compile algorithm test
+     utilties by default: the target build_tests is needed for that. Add
+     support for building fips_algvs with the build_algvs target.
+     [Steve Henson]
+
+  *) Add initial cross compilation support for Windows build. The following
+     environment variables should be set:
+
+     FIPS_SHA1_PATH: path to fips_standalone_sha1 exectutable which will
+     be used explicitly and not built.
+     FIPS_SIG: similar to other builds: path to a "get signature" script
+     which is used to obtain the signature of the target instead of
+     executing it on the host.
+     [Steve Henson]
+
+  *) Add flag to EC_KEY to use cofactor ECDH if set.
+     [Steve Henson]
+
+  *) Update fips_test_suite to support multiple command line options. New
+     test to induce all self test errors in sequence and check expected
+     failures.
+     [Steve Henson]
+
+  *) Add FIPS_{rsa,dsa,ecdsa}_{sign,verify} functions which digest and
+     sign or verify all in one operation.
+     [Steve Henson]
+
+  *) Add fips_algvs: a multicall fips utility incorporaing all the algorithm
+     test programs and fips_test_suite. Includes functionality to parse
+     the minimal script output of fipsalgest.pl directly.
+     [Steve Henson]
+
   *) Add authorisation parameter to FIPS_module_mode_set().
      [Steve Henson]
 
diff --git a/Configure b/Configure
index 10ef8cd115..679252e415 100755
--- a/Configure
+++ b/Configure
@@ -132,14 +132,17 @@ my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
-my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+# EXTREME: original asm spec was missing colon and final term.
+#my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::::void";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
 my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
-my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::";
+my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
 my $no_asm=":::::::::::::::void";
 
 # As for $BSDthreads. Idea is to maintain "collective" set of flags,
@@ -341,6 +344,8 @@ my %table=(
 # *-generic* is endian-neutral target, but ./config is free to
 # throw in -D[BL]_ENDIAN, whichever appropriate...
 "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+#### Extreme add linux-mips32be
+"linux-mips32be","gcc:-DB_ENDIAN -DTERMIO -O3 -march=mips32 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips32_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ppc",	"gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc32_asm}:linux32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # It's believed that majority of ARM toolchains predefine appropriate -march.
 # If you compiler does not, do complement config command line with one!
@@ -401,7 +406,8 @@ my %table=(
 # Android: linux-* but without -DTERMIO and pointers to headers and libs.
 "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "android-x86","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:".eval{my $asm=${x86_elf_asm};$asm=~s/:elf/:android/;$asm}.":dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-pie%-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"android64-aarch64","gcc:-mandroid -fPIC -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -Wall::-D_REENTRANT::-pie%-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 
 #### *BSD [do see comment about ${BSDthreads} above!]
 "BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -409,6 +415,8 @@ my %table=(
 "BSD-x86-elf",	"gcc:-DL_ENDIAN -DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "debug-BSD-x86-elf",	"gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall -g::${BSDthreads}:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "BSD-sparcv8",	"gcc:-DB_ENDIAN -DTERMIOS -O3 -mv8 -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${sparcv8_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"BSD-ppc85xx","gcc:-DTERMIOS -O3 -fomit-frame-pointer -msoft-float -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debug-BSD-ppc85xx","gcc:-DTERMIOS -O0 -fomit-frame-pointer -msoft-float -Wall -g::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 
 "BSD-generic64","gcc:-DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # -DMD32_REG_T=int doesn't actually belong in sparc64 target, it
@@ -461,8 +469,8 @@ my %table=(
 "aix64-gcc","gcc:-maix64 -O -DB_ENDIAN::-pthread:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-maix64 -shared -Wl,-G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X64",
 # Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
 # at build time. $OBJECT_MODE is respected at ./config stage!
-"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded:AIX::BN_LLONG RC4_CHAR:${ppc32_asm}:aix32:dlfcn:aix-shared::-q32 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
-"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-q64 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
+"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded -D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR:${ppc32_asm}:aix32:dlfcn:aix-shared::-q32 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
+"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst::-qthreaded -D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR:${ppc64_asm}:aix64:dlfcn:aix-shared::-q64 -G:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
 
 #
 # Cray T90 and similar (SDSC)
@@ -578,6 +586,24 @@ my %table=(
 "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "darwin64-x86_64-cc","cc:-arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+# iPhoneOS/iOS
+#
+# It takes three prior-set environment variables to make it work:
+#
+# CROSS_COMPILE=/where/toolchain/is/usr/bin/ [note ending slash]
+# CROSS_TOP=/where/SDKs/are
+# CROSS_SDK=iPhoneOSx.y.sdk
+#
+# Exact paths vary with Xcode releases, but for couple of last ones
+# they would look like this:
+#
+# CROSS_COMPILE=`xcode-select --print-path`/Toolchains/XcodeDefault.xctoolchain/usr/bin/
+# CROSS_TOP=`xcode-select --print-path`/Platforms/iPhoneOS.platform/Developer
+# CROSS_SDK=iPhoneOS7.0.sdk
+#
+"iphoneos-cross","cc:-O3 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fomit-frame-pointer -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"ios-cross","cc:-O3 -arch armv7 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:armcap.o armv4cpuid_ios.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::ios32:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"ios64-cross","cc:-O3 -arch arm64 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR -RC4_CHUNK DES_INT DES_UNROLL -BF_PTR:${aarch64_asm}:ios64:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 
 ##### A/UX
 "aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
@@ -594,6 +620,7 @@ my %table=(
 ##### VxWorks for various targets
 "vxworks-ppc60x","ccppc:-D_REENTRANT -mrtp -mhard-float -mstrict-align -fno-implicit-fp -DPPC32_fp60x -O2 -fstrength-reduce -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/usr/h/wrn/coreip:::VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/ppc/PPC32/common:::::",
 "vxworks-ppcgen","ccppc:-D_REENTRANT -mrtp -msoft-float -mstrict-align -O1 -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/usr/h/wrn/coreip:::VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/ppc/PPC32/sfcommon:::::",
+"vxworks-ppcgen-kernel","ccppc:-D_REENTRANT -msoft-float -mstrict-align -O1 -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/h -I\$(WIND_BASE)/target/h/wrn/coreip:::VXWORKS::::::",
 "vxworks-ppc405","ccppc:-g -msoft-float -mlongcall -DCPU=PPC405 -I\$(WIND_BASE)/target/h:::VXWORKS:-r:::::",
 "vxworks-ppc750","ccppc:-ansi -nostdinc -DPPC750 -D_REENTRANT -fvolatile -fno-builtin -fno-for-scope -fsigned-char -Wall -msoft-float -mlongcall -DCPU=PPC604 -I\$(WIND_BASE)/target/h \$(DEBUG_FLAG):::VXWORKS:-r:::::",
 "vxworks-ppc750-debug","ccppc:-ansi -nostdinc -DPPC750 -D_REENTRANT -fvolatile -fno-builtin -fno-for-scope -fsigned-char -Wall -msoft-float -mlongcall -DCPU=PPC604 -I\$(WIND_BASE)/target/h -DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -DDEBUG -g:::VXWORKS:-r:::::",
@@ -608,12 +635,15 @@ my %table=(
 "uClinux-dist","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):BN_LLONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",
 "uClinux-dist64","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):SIXTY_FOUR_BIT_LONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",
 
+"c64xplus","cl6x:-mv6400+ -o2 -ox -ms -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:",
+"c64x","cl6x:-mv6400 -o2 -ox -ms -as -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS:::c64xcpuid.o:::aes-c64x.o aes_cbc.o aes_ctr.o:::sha1-c64x.o sha256-c64x.o sha512-c64x.o:::::::::void:",
+
 );
 
 my @MK1MF_Builds=qw(VC-WIN64I VC-WIN64A
 		    debug-VC-WIN64I debug-VC-WIN64A
 		    VC-NT VC-CE VC-WIN32 debug-VC-WIN32
-		    BC-32 
+		    BC-32 c64xplus c64x
 		    netware-clib netware-clib-bsdsock
 		    netware-libc netware-libc-bsdsock);
 
@@ -906,6 +936,7 @@ EOF
 				}
 			elsif (/^-[^-]/ or /^\+/)
 				{
+				$_ =~ s/%([0-9a-f]{1,2})/chr(hex($1))/gei;
 				$flags.=$_." ";
 				}
 			elsif (/^--prefix=(.*)$/)
@@ -1553,7 +1584,7 @@ if ($rmd160_obj =~ /\.o$/)
 	}
 if ($aes_obj =~ /\.o$/)
 	{
-	$cflags.=" -DAES_ASM";
+	 $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/);
 	# aes_ctr.o is not a real file, only indication that assembler
 	# module implements AES_ctr32_encrypt...
 	$cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes_ctr\.o//);
@@ -1574,7 +1605,7 @@ else	{
 	$wp_obj="wp_block.o";
 	}
 $cmll_obj=$cmll_enc	unless ($cmll_obj =~ /.o$/);
-if ($modes_obj =~ /ghash/)
+if ($modes_obj =~ /ghash\-/)
 	{
 	$cflags.=" -DGHASH_ASM";
 	}
diff --git a/Makefile.fips b/Makefile.fips
index 703c9f9228..74db06574b 100644
--- a/Makefile.fips
+++ b/Makefile.fips
@@ -186,7 +186,7 @@ SHARED_LDFLAGS=
 GENERAL=        Makefile
 BASENAME=       openssl
 NAME=           $(BASENAME)-$(VERSION)
-TARFILE=        openssl-fips-2.0-test.tar
+TARFILE=        openssl-fips-2.0.tar
 WTARFILE=       $(NAME)-win.tar
 EXHEADER=       e_os2.h
 HEADER=         e_os.h
@@ -387,6 +387,8 @@ build_apps:
 	@dir=apps; target=all; $(BUILD_ONE_CMD)
 build_tests:
 	@dir=test; target=fipsexe; $(BUILD_ONE_CMD)
+build_algvs:
+	@dir=test; target=fipsalgvs; $(BUILD_ONE_CMD)
 build_tools:
 	@dir=tools; target=all; $(BUILD_ONE_CMD)
 
@@ -522,8 +524,8 @@ files:
 links:
 	@$(PERL) $(TOP)/util/mkdir-p.pl include/openssl
 	@$(PERL) $(TOP)/util/mklink.pl include/openssl $(EXHEADER)
-	@set -e; dir=fips target=links; $(RECURSIVE_BUILD_CMD)
-	@(cd crypto ; SDIRS='$(LINKDIRS)' $(MAKE) -e links)
+	@set -e; dir=fips target=links; $(BUILD_ONE_CMD)
+	@(cd crypto ; TEST='' SDIRS='$(LINKDIRS)' $(MAKE) -e links)
 
 gentests:
 	@(cd test && echo "generating dummy tests (if needed)..." && \
@@ -536,9 +538,7 @@ dclean:
 test:   tests
 
 tests:
-	@(cd test && echo "testing..." && \
-	$(CLEARENV) && $(MAKE) -e $(BUILDENV) TOP=.. TESTS='$(TESTS)' OPENSSL_DEBUG_MEMORY=on OPENSSL_CONF=../apps/openssl.cnf tests );
-	OPENSSL_CONF=apps/openssl.cnf util/opensslwrap.sh version -a
+	@echo "Not implemented in FIPS build" ; false
 
 report:
 	@$(PERL) util/selftest.pl
diff --git a/README.FIPS b/README.FIPS
index c41bab9930..87253f6bfb 100644
--- a/README.FIPS
+++ b/README.FIPS
@@ -1,4 +1,4 @@
-Preliminary status and build information for FIPS module v2.0
+Preliminary status and build information for FIPS module v2.0 
 
 NB: if you are cross compiling you now need to use the latest "incore" script
 this can be found at util/incore in the tarballs.
diff --git a/README.wishlist b/README.wishlist
new file mode 100644
index 0000000000..111ee3ce75
--- /dev/null
+++ b/README.wishlist
@@ -0,0 +1,31 @@
+A "wish list" of changes we'd like to make to the FIPS module if we could.
+Note the CMVP requires retesting of all previously tested platforms
+("Operational Environments") to implement any changes considered "cryptographically
+significant". Since the OpenSSL FIPS module v2.0 has some 250 such formally
+tested platforms (and counting), retesting just isn't logistically or economically
+feasible.
+
+--------
+https://github.com/openssl/openssl/pull/4157
+From 2017-08-14, Fix GCM MAC computation for AES-GCM by	srahul123
+cryptographically significant, not fixable
+
+--------
+Andy Polyakov: harmonize with __thumb__ clause in FIPS_ref_point() (#3354),
+https://patch-diff.githubusercontent.com/raw/openssl/openssl/pull/3354.patch
+https://github.com/openssl/openssl/pull/3354#pullrequestreview-36086406
+May be possible to introduce in future change letter
+
+--------
+CVE-2016-0701
+cryptographically significant, not fixable
+
+--------
+CVE-2014-0076
+cryptographically significant, not fixable
+
+--------
+"Lucky 13", CVE-2013-0169
+cryptographically significant, not fixable
+
+--------
diff --git a/TABLE b/TABLE
index c15ac01cb4..8bdf72045d 100644
--- a/TABLE
+++ b/TABLE
@@ -862,7 +862,7 @@ $multilib     =
 $cc           = cc
 $cflags       = -q32 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst
 $unistd       = 
-$thread_cflag = -qthreaded
+$thread_cflag = -qthreaded -D_THREAD_SAFE
 $sys_id       = AIX
 $lflags       = 
 $bn_ops       = BN_LLONG RC4_CHAR
@@ -961,7 +961,7 @@ $multilib     =
 $cc           = cc
 $cflags       = -q64 -O -DB_ENDIAN -qmaxmem=16384 -qro -qroconst
 $unistd       = 
-$thread_cflag = -qthreaded
+$thread_cflag = -qthreaded -D_THREAD_SAFE
 $sys_id       = AIX
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR
@@ -3465,6 +3465,73 @@ $ranlib       =
 $arflags      = 
 $multilib     = 
 
+*** ios64-cross
+$cc           = cc
+$cflags       = -O3 -arch arm64 -mios-version-min=7.0.0 -isysroot $(CROSS_TOP)/SDKs/$(CROSS_SDK) -fno-common
+$unistd       = 
+$thread_cflag = -D_REENTRANT
+$sys_id       = iOS
+$lflags       = -Wl,-search_paths_first%
+$bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR -RC4_CHUNK DES_INT DES_UNROLL -BF_PTR
+$cpuid_obj    = 
+$bn_obj       = 
+$ec_obj       = 
+$des_obj      = 
+$aes_obj      = 
+$bf_obj       = 
+$md5_obj      = 
+$sha1_obj     = 
+$cast_obj     = 
+$rc4_obj      = 
+$rmd160_obj   = 
+$rc5_obj      = 
+$wp_obj       = 
+$cmll_obj     = 
+$modes_obj    = 
+$engines_obj  = 
+$perlasm_scheme = void
+$dso_scheme   = dlfcn
+$shared_target= darwin-shared
+$shared_cflag = -fPIC -fno-common
+$shared_ldflag = -dynamiclib
+$shared_extension = .$(SHLIB_MAJOR).$(SHLIB_MINOR).dylib
+$ranlib       = 
+$arflags      = 
+$multilib     = 
+
+*** iphoneos-cross
+$cc           = cc
+$cflags       = -O3 -isysroot $(CROSS_TOP)/SDKs/$(CROSS_SDK) -fomit-frame-pointer -fno-common
+$unistd       = 
+$thread_cflag = -D_REENTRANT
+$sys_id       = iOS
+$lflags       = -Wl,-search_paths_first%
+$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
+$cpuid_obj    = 
+$bn_obj       = 
+$des_obj      = 
+$aes_obj      = 
+$bf_obj       = 
+$md5_obj      = 
+$sha1_obj     = 
+$cast_obj     = 
+$rc4_obj      = 
+$rmd160_obj   = 
+$rc5_obj      = 
+$wp_obj       = 
+$cmll_obj     = 
+$modes_obj    = 
+$engines_obj  = 
+$perlasm_scheme = void
+$dso_scheme   = dlfcn
+$shared_target= darwin-shared
+$shared_cflag = -fPIC -fno-common
+$shared_ldflag = -dynamiclib
+$shared_extension = .$(SHLIB_MAJOR).$(SHLIB_MINOR).dylib
+$ranlib       = 
+$arflags      = 
+$multilib     = 
+
 *** irix-cc
 $cc           = cc
 $cflags       = -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN
diff --git a/c6x/do_fips b/c6x/do_fips
new file mode 100755
index 0000000000..4045e605ce
--- /dev/null
+++ b/c6x/do_fips
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+if ! which cl6x > /dev/null 2>&1; then
+	echo 'fatal: cl6x is not on $PATH'
+	exit 1
+fi
+
+perl Configure ${C6XPLATFORM:-c64xplus} fipscanisteronly no-engine
+perl util/mkfiles.pl > MINFO
+perl util/mk1mf.pl auto > c6x/fips.mak
+make -f c6x/fips.mak
+make -f c6x/fips_algvs.mak
diff --git a/c6x/env b/c6x/env
new file mode 100644
index 0000000000..543d33081e
--- /dev/null
+++ b/c6x/env
@@ -0,0 +1,7 @@
+# MSYS-style PATH
+export PATH=/c/CCStudio_v3.3/c6000/cgtools/bin:/c/Program\ Files/ActivePerl58/bin:$PATH
+
+# Windows-style variables
+export C6X_C_DIR='C:\CCStudio_v3.3\c6000\cgtools\include;C:\CCStudio_v3.3\c6000\cgtools\lib'
+
+export PERL5LIB=C:/CCStudio_v3.3/bin/utilities/ccs_scripting
diff --git a/c6x/fips_algvs.mak b/c6x/fips_algvs.mak
new file mode 100644
index 0000000000..7f67927fbd
--- /dev/null
+++ b/c6x/fips_algvs.mak
@@ -0,0 +1,14 @@
+CC=cl6x
+CFLAGS=-mv$${C6XSILICON:-6400+} -o2 -I. -Ic6x/inc -Ifips -DNO_SYS_TYPES_H
+OBJ_D=c6x/tmp
+OUT_D=c6x
+
+all:	$(OUT_D)/fips_algvs.out
+
+$(OBJ_D)/fips_algvs.obj:	test/fips_algvs.c
+	$(CC) --obj_directory=$(OBJ_D) $(CFLAGS) -c $<
+
+$(OUT_D)/fips_algvs.out:	$(OBJ_D)/fips_algvs.obj $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+	$(OUT_D)/fips_standalone_sha1 -verify $(OUT_D)/fipscanister.obj
+	$(CC) -z -o $@ -m $(OUT_D)/fips_algvs.map $< $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+	$(OUT_D)/incore6x $@ || rm $@
diff --git a/c6x/fips_standalone_sha1 b/c6x/fips_standalone_sha1
new file mode 100755
index 0000000000..ea2268cb4e
--- /dev/null
+++ b/c6x/fips_standalone_sha1
@@ -0,0 +1,32 @@
+#!/usr/bin/env perl
+#
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+unshift(@INC,$dir);
+require "hmac_sha1.pl";
+
+(!@ARV[0] && -f @ARGV[$#ARGV]) || die "usage: $0 [-verify] file";
+
+$verify=shift	if (@ARGV[0] eq "-verify");
+
+sysopen(FD,@ARGV[0],0) || die "$!";
+binmode(FD);
+
+my $ctx = HMAC->Init("etaonrishdlcupfm");
+
+while (read(FD,$blob,4*1024)) { $ctx->Update($blob); }
+
+close(FD);
+
+my $signature = unpack("H*",$ctx->Final());
+
+print "HMAC-SHA1(@ARGV[0])= $signature\n";
+
+if ($verify) {
+	open(FD,"<@ARGV[0].sha1") || die "$!";
+	$line = <FD>;
+	close(FD);
+	exit(0)	if ($line =~ /HMAC\-SHA1\([^\)]*\)=\s*([0-9a-f]+)/i &&
+				$1 eq $signature);
+	die "signature mismatch";
+}
diff --git a/c6x/fipscanister.cmd b/c6x/fipscanister.cmd
new file mode 100644
index 0000000000..a06ee15cb3
--- /dev/null
+++ b/c6x/fipscanister.cmd
@@ -0,0 +1,19 @@
+SECTIONS
+{
+    .text:
+    {
+	*(.fips_text:start)
+	*(.text)
+	*(.const:aes_asm)
+	*(.const:sha_asm)
+	*(.const:des_sptrans)
+	*(.switch)
+	*(.fips_text:end)
+    }
+    .const:
+    {
+	*(.fips_const:start)
+	*(.const)
+	*(.fips_const:end)
+    }
+}
diff --git a/c6x/hmac_sha1.pl b/c6x/hmac_sha1.pl
new file mode 100644
index 0000000000..494f7e8569
--- /dev/null
+++ b/c6x/hmac_sha1.pl
@@ -0,0 +1,196 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2011 The OpenSSL Project.
+#
+######################################################################
+#
+# SHA1 and HMAC in Perl by <appro@openssl.org>.
+#
+{ package SHA1;
+  use integer;
+
+    {
+    ################################### SHA1 block code generator
+    my @V = ('$A','$B','$C','$D','$E');
+    my $i;
+
+    sub XUpdate {
+      my $ret;
+	$ret="(\$T=\$W[($i-16)%16]^\$W[($i-14)%16]^\$W[($i-8)%16]^\$W[($i-3)%16],\n\t";
+	if ((1<<31)<<1) {
+	    $ret.="    \$W[$i%16]=((\$T<<1)|(\$T>>31))&0xffffffff)\n\t  ";
+	} else {
+	    $ret.="    \$W[$i%16]=(\$T<<1)|((\$T>>31)&1))\n\t  ";
+	}
+    }
+    sub tail {
+      my ($a,$b,$c,$d,$e)=@V;
+      my $ret;
+	if ((1<<31)<<1) {
+	    $ret.="(($a<<5)|($a>>27));\n\t";
+	    $ret.="$b=($b<<30)|($b>>2);	$e&=0xffffffff;	#$b&=0xffffffff;\n\t";
+	} else {
+	    $ret.="(($a<<5)|($a>>27)&0x1f);\n\t";
+	    $ret.="$b=($b<<30)|($b>>2)&0x3fffffff;\n\t";
+	}
+      $ret;
+    }
+    sub BODY_00_15 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=\$W[$i]+0x5a827999+((($c^$d)&$b)^$d)+".tail();
+    }
+    sub BODY_16_19 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x5a827999+((($c^$d)&$b)^$d)+".tail();
+    }
+    sub BODY_20_39 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x6ed9eba1+($b^$c^$d)+".tail();
+    }
+    sub BODY_40_59 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x8f1bbcdc+(($b&$c)|(($b|$c)&$d))+".tail();
+    }
+    sub BODY_60_79 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0xca62c1d6+($b^$c^$d)+".tail();
+    }
+
+    my $sha1_impl =
+    'sub block {
+	my $self = @_[0];
+	my @W    = unpack("N16",@_[1]);
+	my ($A,$B,$C,$D,$E,$T) = @{$self->{H}};
+	';
+
+	$sha1_impl.='
+	$A &= 0xffffffff;
+	$B &= 0xffffffff;
+	' if ((1<<31)<<1);
+
+	for($i=0;$i<16;$i++){ $sha1_impl.=BODY_00_15(); unshift(@V,pop(@V)); }
+	for(;$i<20;$i++)    { $sha1_impl.=BODY_16_19(); unshift(@V,pop(@V)); }
+	for(;$i<40;$i++)    { $sha1_impl.=BODY_20_39(); unshift(@V,pop(@V)); }
+	for(;$i<60;$i++)    { $sha1_impl.=BODY_40_59(); unshift(@V,pop(@V)); }
+	for(;$i<80;$i++)    { $sha1_impl.=BODY_60_79(); unshift(@V,pop(@V)); }
+
+	$sha1_impl.='
+	$self->{H}[0]+=$A;	$self->{H}[1]+=$B;	$self->{H}[2]+=$C;
+	$self->{H}[3]+=$D;	$self->{H}[4]+=$E;	}';
+
+    #print $sha1_impl,"\n";
+    eval($sha1_impl);		# generate code
+    }
+
+    sub Init {
+	my $class = shift;	# multiple instances...
+	my $self  = {};
+
+	bless $self,$class;
+	$self->{H} = [0x67452301,0xefcdab89,0x98badcfe,0x10325476,0xc3d2e1f0];
+	$self->{N} = 0;
+	return $self;
+    }
+
+    sub Update {
+	my $self = shift;
+	my $msg;
+
+	foreach $msg (@_) {
+	    my $len  = length($msg);
+	    my $num  = length($self->{buf});
+	    my $off  = 0;
+
+	    $self->{N} += $len;
+
+	    if (($num+$len)<64)
+	    {	$self->{buf} .= $msg; next;	}
+	    elsif ($num)
+	    {	$self->{buf} .= substr($msg,0,($off=64-$num));
+		$self->block($self->{buf});
+	    }
+
+	    while(($off+64) <= $len)
+	    {	$self->block(substr($msg,$off,64));
+		$off += 64;
+	    }
+
+	    $self->{buf} = substr($msg,$off);
+	}
+	return $self;
+    }
+
+    sub Final {
+	my $self = shift;
+	my $num  = length($self->{buf});
+
+	$self->{buf} .= chr(0x80); $num++;
+	if ($num>56)
+	{   $self->{buf} .= chr(0)x(64-$num);
+	    $self->block($self->{buf});
+	    $self->{buf}=undef;
+	    $num=0;
+	}
+	$self->{buf} .= chr(0)x(56-$num);
+	$self->{buf} .= pack("N2",($self->{N}>>29)&0x7,$self->{N}<<3);
+	$self->block($self->{buf});
+
+	return pack("N*",@{$self->{H}});
+    }
+
+    sub Selftest {
+	my $hash;
+
+	$hash=SHA1->Init()->Update('abc')->Final();
+	die "SHA1 test#1" if (unpack("H*",$hash) ne 'a9993e364706816aba3e25717850c26c9cd0d89d');
+
+	$hash=SHA1->Init()->Update('abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq')->Final();
+	die "SHA1 test#2" if (unpack("H*",$hash) ne '84983e441c3bd26ebaae4aa1f95129e5e54670f1');
+
+	#$hash=SHA1->Init()->Update('a'x1000000)->Final();
+	#die "SHA1 test#3" if (unpack("H*",$hash) ne '34aa973cd4c4daa4f61eeb2bdbad27316534016f');
+    }
+}
+
+{ package HMAC;
+
+    sub Init {
+	my $class = shift;
+	my $key   = shift;
+	my $self  = {};
+
+	bless $self,$class;
+
+	if (length($key)>64) {
+	    $key = SHA1->Init()->Update($key)->Final();
+	}
+	$key .= chr(0x00)x(64-length($key));
+
+	my @ikey = map($_^=0x36,unpack("C*",$key));
+	($self->{hash} = SHA1->Init())->Update(pack("C*",@ikey));
+	 $self->{okey} = pack("C*",map($_^=0x36^0x5c,@ikey));
+
+	return $self;
+    }
+
+    sub Update {
+	my $self = shift;
+	$self->{hash}->Update(@_);
+	return $self;
+    }
+
+    sub Final {
+	my $self  = shift;
+	my $ihash = $self->{hash}->Final();
+	return SHA1->Init()->Update($self->{okey},$ihash)->Final();
+    }
+
+    sub Selftest {
+	my $hmac;
+
+	$hmac = HMAC->Init('0123456789:;<=>?@ABC')->Update('Sample #2')->Final();
+	die "HMAC test" if (unpack("H*",$hmac) ne '0922d3405faa3d194f82a45830737d5cc6c75d24');
+    }
+}
+
+1;
diff --git a/c6x/incore6x b/c6x/incore6x
new file mode 100755
index 0000000000..be73aca2d9
--- /dev/null
+++ b/c6x/incore6x
@@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2011 The OpenSSL Project.
+#
+# The script embeds fingerprint into TI-COFF executable object.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+unshift(@INC,$dir);
+require "hmac_sha1.pl";
+
+######################################################################
+#
+# COFF symbol table parser by <appro@openssl.org>. The table entries
+# are extended with offset within executable file...
+#
+{ package COFF;
+  use FileHandle;
+
+    sub dup  { my %copy=map {$_} @_; return \%copy; }
+
+    sub Load {
+	my $class = shift;
+	my $self  = {};
+	my $FD    = FileHandle->new();	# autoclose
+
+	bless $self,$class;
+
+	sysopen($FD,shift,0) or die "$!";
+	binmode($FD);
+
+	#################################################
+	# read and parse COFF header...
+	#
+	read($FD,my $coff,22) or die "$!";
+
+	my %coff_header;
+	@coff_header{version,nsects,date,syms_off,nsyms,opt,flags,magic}=
+		unpack("v2V3v3",$coff);
+
+	$!=42;		# signal fipsld to revert to two-step link
+	die "not TI-COFF file" if ($coff_header{version} != 0xC2);
+
+	my $big_endian = ($coff_header{flags}>>9)&1;	# 0 or 1
+
+	my $strings;
+	my $symsize;
+
+	#################################################
+	# load strings table
+	#
+	seek($FD,$coff_header{syms_off}+18*$coff_header{nsyms},0) or die "$!";
+	read($FD,$strings,4) or die "$!";
+	$symsize = unpack("V",$strings);
+	read($FD,$strings,$symsize,4) or die "$!";
+
+	#################################################
+	# read sections
+	#
+	my $i;
+	my @sections;
+
+	# seek to section headers
+	seek($FD,22+@coff_header{opt},0) or die "$!";
+	for ($i=0;$i<$coff_header{nsects};$i++) {
+	    my %coff_shdr;
+	    my $name;
+
+	    read($FD,my $section,48) or die "$!";
+
+	    @coff_shdr{sh_name,sh_phaddr,sh_vaddr,
+			sh_size,sh_offset,sh_relocs,sh_reserved,
+			sh_relocoff,sh_lines,sh_flags} =
+		unpack("a8V9",$section);
+
+	    $name = $coff_shdr{sh_name};
+	    # see if sh_name is a an offset in $strings
+	    my ($hi,$lo) = unpack("V2",$name);
+	    if ($hi==0 && $lo<$symsize) {
+		$name = substr($strings,$lo,64);
+	    }
+	    $coff_shdr{sh_name} = (split(chr(0),$name))[0];
+
+	    push(@sections,dup(%coff_shdr));
+	}
+
+	#################################################
+	# load symbols table
+	#
+	seek($FD,$coff_header{syms_off},0) or die "$!";
+	for ($i=0;$i<$coff_header{nsyms};$i++) {
+	    my %coff_sym;
+	    my $name;
+
+	    read($FD,my $blob,18) or die "$!";
+
+	    @coff_sym{st_name,st_value,st_shndx,reserved,class,aux} =
+		unpack("a8Vv2C2",$blob);
+
+	    # skip aux entries
+	    if ($coff_sym{aux}) {
+		seek($FD,18*$coff_sym{aux},1) or die "$!";
+		$i+=$coff_sym{aux};
+	    }
+
+	    $name = $coff_sym{st_name};
+	    # see if st_name is a an offset in $strings
+	    my ($hi,$lo) = unpack("V2",$name);
+	    if ($hi==0 && $lo<$symsize) {
+		$name = substr($strings,$lo,64);
+	    }
+	    $coff_sym{st_name} = $name = (split(chr(0),$name))[0];
+
+	    my $st_secn = $coff_sym{st_shndx}-1;
+	    if ($st_secn>=0 && $st_secn<=$#sections
+		&& @sections[$st_secn]->{sh_offset}
+		&& $name =~ m/^_[a-z]+/i) {
+		# synthesize st_offset, ...
+		$coff_sym{st_offset} = $coff_sym{st_value}
+				- @sections[$st_secn]->{sh_vaddr}
+				+ @sections[$st_secn]->{sh_offset};
+		$coff_sym{st_section} = @sections[$st_secn]->{sh_name};
+		# ... and add to lookup table
+		$self->{symbols}{$name} = dup(%coff_sym);
+	    }
+	}
+
+	return $self;
+    }
+
+    sub Lookup {
+	my $self = shift;
+	my $name = shift;
+	return $self->{symbols}{"_$name"};
+    }
+
+    sub Traverse {
+	my $self = shift;
+	my $code = shift;
+
+	if (ref($code) eq 'CODE') {
+	    for (keys(%{$self->{symbols}})) { &$code($self->{symbols}{$_}); }
+	}
+    }
+}
+
+######################################################################
+#
+# main()
+#
+my $legacy_mode;
+
+if ($#ARGV<0 || ($#ARGV>0 && !($legacy_mode=(@ARGV[0] =~ /^\-(dso|exe)$/)))) {
+	print STDERR "usage: $0 [-dso|-exe] ti-coff-binary\n";
+	exit(1);
+}
+
+$exe = COFF->Load(@ARGV[$#ARGV]);
+
+$FIPS_text_start	= $exe->Lookup("FIPS_text_start")		or die;
+$FIPS_text_end		= $exe->Lookup("FIPS_text_end")			or die;
+$FIPS_rodata_start	= $exe->Lookup("FIPS_rodata_start")		or die;
+$FIPS_rodata_end	= $exe->Lookup("FIPS_rodata_end")		or die;
+$FIPS_signature		= $exe->Lookup("FIPS_signature")		or die;
+
+# new cross-compile support
+$FIPS_text_startX	= $exe->Lookup("FIPS_text_startX");
+$FIPS_text_endX		= $exe->Lookup("FIPS_text_endX");
+
+if (!$legacy_mode) {
+    if (!$FIPS_text_startX || !$FIPS_text_endX) {
+	print STDERR "@ARGV[$#ARGV] is not cross-compiler aware.\n";
+	exit(42);	# signal fipsld to revert to two-step link
+    }
+
+    $FINGERPRINT_ascii_value
+			= $exe->Lookup("FINGERPRINT_ascii_value");
+}
+if ($FIPS_text_startX && $FIPS_text_endX) {
+    $FIPS_text_start = $FIPS_text_startX;
+    $FIPS_text_end   = $FIPS_text_endX;
+}
+
+sysopen(FD,@ARGV[$#ARGV],$legacy_mode?0:2) or die "$!";	# 2 is read/write
+binmode(FD);
+
+sub HMAC_Update {
+  my ($hmac,$off,$len) = @_;
+  my $blob;
+
+    seek(FD,$off,0)	or die "$!";
+    read(FD,$blob,$len)	or die "$!";
+    $$hmac->Update($blob);
+}
+
+# fips/fips.c:FIPS_incore_fingerprint's Perl twin
+#
+sub FIPS_incore_fingerprint {
+  my $p1  = $FIPS_text_start->{st_offset};
+  my $p2  = $FIPS_text_end->{st_offset};
+  my $p3  = $FIPS_rodata_start->{st_offset};
+  my $p4  = $FIPS_rodata_end->{st_offset};
+  my $sig = $FIPS_signature->{st_offset};
+  my $ctx = HMAC->Init("etaonrishdlcupfm");
+
+    # detect overlapping regions
+    if ($p1<=$p3 && $p2>=$p3) {
+	$p3 = $p1; $p4 = $p2>$p4?$p2:$p4; $p1 = 0; $p2 = 0;
+    } elsif ($p3<=$p1 && $p4>=$p1) {
+	$p3 = $p3; $p4 = $p2>$p4?$p2:$p4; $p1 = 0; $p2 = 0;
+    }
+
+    if ($p1) {
+	HMAC_Update (\$ctx,$p1,$p2-$p1);
+    }
+
+    if ($sig>=$p3 && $sig<$p4) {
+	# "punch" hole
+	HMAC_Update(\$ctx,$p3,$sig-$p3);
+	$p3 = $sig+20;
+	HMAC_Update(\$ctx,$p3,$p4-$p3);
+    } else {
+	HMAC_Update(\$ctx,$p3,$p4-$p3);
+    }
+
+    return $ctx->Final();
+}
+
+$fingerprint = FIPS_incore_fingerprint();
+
+if ($legacy_mode) {
+    print unpack("H*",$fingerprint);
+} elsif ($FINGERPRINT_ascii_value) {
+    seek(FD,$FINGERPRINT_ascii_value->{st_offset},0)	or die "$!";
+    print FD unpack("H*",$fingerprint)			or die "$!";
+} else {
+    seek(FD,$FIPS_signature->{st_offset},0)		or die "$!";
+    print FD $fingerprint				or die "$!";
+}
+
+close (FD);
diff --git a/c6x/run6x b/c6x/run6x
new file mode 100755
index 0000000000..aecfabeb04
--- /dev/null
+++ b/c6x/run6x
@@ -0,0 +1,43 @@
+#!/usr/bin/env perl
+
+$exe  = @ARGV[0];
+$exe .= ".out" if (! -f $exe);
+die if (! -f $exe);
+
+use CCS_SCRIPTING_PERL;
+
+my $studio=new CCS_SCRIPTING_PERL::CCS_Scripting();
+
+$studio->CCSOpenNamed("*","*",1);	# connect to board
+$studio->TargetReset();
+
+print "loading $exe\n";
+$studio->ProgramLoad($exe);
+
+sub write_string {
+    my ($studio,$addr,$str) = @_;
+    my $len = length($str);
+    my $i;
+
+    for ($i=0; $i<$len; $i++) {
+	$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+$i,8,vec($str,$i,8));
+    }
+    $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+$i,8,0);
+
+    return $i+1;
+}
+
+$addr= $studio->SymbolGetAddress("__c_args");
+printf "setting up __c_args at 0x%X\n",$addr;#\n";
+
+$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr,32,$#ARGV+1);
+
+for ($i=0,$strings=$addr+($#ARGV+3)*4; $i<=$#ARGV; $i++) {
+    $off = write_string($studio,$strings,@ARGV[$i]);
+    $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+4*($i+1),32,$strings);
+    $strings += $off;
+}
+$studio->MemoryWrite($SCC_SCRIPTING_PERL::PAGE_DATA,$addr+4*($i+1),32,0);
+
+print "running...\n";
+$studio->TargetRun();
diff --git a/c6x/run6x.js b/c6x/run6x.js
new file mode 100755
index 0000000000..6d94949751
--- /dev/null
+++ b/c6x/run6x.js
@@ -0,0 +1,91 @@
+#!/usr/bin/env dss.sh
+//
+// Debug Server Scripting C6x launcher.
+//
+
+importPackage(Packages.com.ti.debug.engine.scripting);
+importPackage(Packages.com.ti.ccstudio.scripting.environment);
+importPackage(Packages.java.lang);
+
+if (arguments.length == 0) {
+    // Extract script name from eclipse
+    var regex = new RegExp("-dss\\.rhinoArgs\n(.*)");
+    var matches = regex.exec(environment["eclipse.commands"]);
+
+    System.err.println("Usage: " + matches[1] + " executable [args]");
+    System.err.println();
+    System.err.println("You're also required to set CCSTARGETCONFIG " +
+                       "environment variable to appoint");
+    System.err.println("proper .ccxml file, customarily one of " +
+                       "$HOME/ti/CCSTargetConfigurations/*.ccxml");
+    quit(1);
+}
+
+try {
+    var prog = arguments[0];
+    var script = ScriptingEnvironment.instance();
+
+    var debugServer = script.getServer("DebugServer.1");
+
+    // CCSTARGETCONFIG environment variable should point at proper .ccxml,
+    // customarily one of $HOME/ti/CCSTargetConfigurations/*.ccxml.
+    debugServer.setConfig(System.getenv("CCSTARGETCONFIG"));
+
+    var debugSession = debugServer.openSession("*", "*");
+
+    // Redirect GEL output to |prog|.gel file, so that it doesn't clobber
+    // standard output from the program...
+    var dot = prog.lastIndexOf(".");
+    var gel_out = prog + ".gel";
+    if (dot > 0) {
+        gel_out = prog.substr(0,dot) + ".gel";
+    }
+    debugSession.expression.evaluate('GEL_EnableFileOutput("'
+                                      + gel_out + '", 0, 0)');
+
+    debugSession.target.connect();
+
+    // It should be noted that "current working directory" for program
+    // executed on the target system is one where |prog| resides, and
+    // not where script executed [as one would expect]...
+    debugSession.memory.loadProgram(prog, arguments);
+
+    // Pull exit()'s address and set breakpoint, then just execute till
+    // it's reached...
+    var exitAddr = debugSession.symbol.getAddress("exit");
+    debugSession.breakpoint.add(exitAddr);
+
+    while (1) {
+        debugSession.target.run();
+
+        var PC = debugSession.expression.evaluate("PC");
+        if (PC == exitAddr) {
+            break;
+        }
+    }
+
+    // Snatch value passed to exit(), so that it can be passed down to
+    // shell as exit code from this script...
+    var exitCode = debugSession.expression.evaluate("A4");
+
+    // Last run to termination...
+    debugSession.target.run();
+    // Clean up...
+    debugSession.terminate();
+    debugServer.stop();
+
+    // It should be noted that there is kind of a bug in C6x run-time.
+    // Return value from main() is not passed to last implicit exit()
+    // call [as it would on other systems], but instead constant 1 is
+    // passed, which conventionally indicates an error. So that if one
+    // wants to pass specific exit code, or even 0 indicating "success",
+    // one has to call exit() explicitly instead of relying on value
+    // returned by main()...
+    quit(exitCode);
+
+} catch (e) {
+    // We catch everything, because default handler terminates script with
+    // "success" exit code upon exception...
+    System.err.println(e.rhinoException);
+    quit(139);
+}
diff --git a/config b/config
index d2b155aa44..36ab9f2ef6 100755
--- a/config
+++ b/config
@@ -219,7 +219,11 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
 	;;
 
     NetBSD:*:*:*386*)
-        echo "`(/usr/sbin/sysctl -n hw.model || /sbin/sysctl -n hw.model) | sed 's,.*\(.\)86-class.*,i\186,'`-whatever-netbsd"; exit 0
+	if [ -z ${CROSS_COMPILE} ]; then
+           echo "`(/usr/sbin/sysctl -n hw.model || /sbin/sysctl -n hw.model) | sed 's,.*\(.\)86-class.*,i\186,'`-whatever-netbsd"; exit 0
+        else
+           echo "${MACHINE}-whatever-netbsd"; exit 0
+	fi
 	;;
 
     NetBSD:*)
@@ -371,6 +375,10 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
        echo "nsr-tandem-nsk"; exit 0;
        ;;
 
+    vxworks:kernel*)
+       echo "${MACHINE}-kernel-vxworks"; exit 0;
+       ;;
+
     vxworks*)
        echo "${MACHINE}-whatever-vxworks"; exit 0;
        ;;
@@ -535,10 +543,14 @@ case "$GUESSOS" in
         #fi
 	OUT="irix-mips3-$CC"
 	;;
+  mips32be-*-linux2)
+	OUT=linux-mips32be
+	options="$options threads shared zlib-dynamic"
+	;;
   ppc-apple-rhapsody) OUT="rhapsody-ppc-cc" ;;
   ppc-apple-darwin*)
 	ISA64=`(sysctl -n hw.optional.64bitops) 2>/dev/null`
-	if [ "$ISA64" = "1" ]; then
+	if [ "$ISA64" = "1" -a -z "$KERNEL_BITS" ]; then
 	    echo "WARNING! If you wish to build 64-bit library, then you have to"
 	    echo "         invoke './Configure darwin64-ppc-cc' *manually*."
 	    if [ "$TEST" = "false" -a -t 1 ]; then
@@ -546,10 +558,14 @@ case "$GUESSOS" in
 	      (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
 	    fi
 	fi
-	OUT="darwin-ppc-cc" ;;
+	if [ "$ISA64" = "1" -a "$KERNEL_BITS" = "64" ]; then
+	    OUT="darwin64-ppc-cc"
+	else
+	    OUT="darwin-ppc-cc"
+	fi ;;
   i?86-apple-darwin*)
 	ISA64=`(sysctl -n hw.optional.x86_64) 2>/dev/null`
-	if [ "$ISA64" = "1" ]; then
+	if [ "$ISA64" = "1" -a -z "$KERNEL_BITS" ]; then
 	    echo "WARNING! If you wish to build 64-bit library, then you have to"
 	    echo "         invoke './Configure darwin64-x86_64-cc' *manually*."
 	    if [ "$TEST" = "false" -a -t 1 ]; then
@@ -557,7 +573,21 @@ case "$GUESSOS" in
 	      (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
 	    fi
 	fi
-	OUT="darwin-i386-cc" ;;
+	if [ "$ISA64" = "1" -a "$KERNEL_BITS" = "64" ]; then
+	    OUT="darwin64-x86_64-cc"
+	else
+	    OUT="darwin-i386-cc"
+	fi ;;
+  armv6+7-*-iphoneos)
+	options="$options -arch%20armv6 -arch%20armv7"
+	OUT="iphoneos-cross" ;;
+  *-*-iphoneos)
+	options="$options -arch%20${MACHINE}"
+	OUT="iphoneos-cross" ;;
+  armv7-*-ios)
+	OUT="ios-cross" ;;
+  arm64-*-ios*)
+	OUT="ios64-cross" ;;
   alpha-*-linux2)
         ISA=`awk '/cpu model/{print$4;exit(0);}' /proc/cpuinfo`
 	case ${ISA:-generic} in
@@ -583,6 +613,7 @@ case "$GUESSOS" in
 	;;
   ppc-*-linux2) OUT="linux-ppc" ;;
   ppc60x-*-vxworks*) OUT="vxworks-ppc60x" ;;
+  ppcgen-kernel-vxworks*) OUT="vxworks-ppcgen-kernel" ;;
   ppcgen-*-vxworks*) OUT="vxworks-ppcgen" ;;
   pentium-*-vxworks*) OUT="vxworks-pentium" ;;
   simlinux-*-vxworks*) OUT="vxworks-simlinux" ;;
@@ -664,7 +695,7 @@ case "$GUESSOS" in
   sun4[uv]*-*-solaris2)
 	OUT="solaris-sparcv9-$CC"
 	ISA64=`(isalist) 2>/dev/null | grep sparcv9`
-	if [ "$ISA64" != "" ]; then
+	if [ "$ISA64" != "" -a "$KERNEL_BITS" = "" ]; then
 	    if [ "$CC" = "cc" -a $CCVER -ge 50 ]; then
 		echo "WARNING! If you wish to build 64-bit library, then you have to"
 		echo "         invoke './Configure solaris64-sparcv9-cc' *manually*."
@@ -694,13 +725,16 @@ case "$GUESSOS" in
 		fi
 	    fi
 	fi
+	if [ "$ISA64" != "" -a "$KERNEL_BITS" = "64" ]; then
+	    OUT="solaris64-sparcv9-$CC"
+	fi
 	;;
   sun4m-*-solaris2)	OUT="solaris-sparcv8-$CC" ;;
   sun4d-*-solaris2)	OUT="solaris-sparcv8-$CC" ;;
   sun4*-*-solaris2)	OUT="solaris-sparcv7-$CC" ;;
   *86*-*-solaris2)
 	ISA64=`(isalist) 2>/dev/null | grep amd64`
-	if [ "$ISA64" != "" ]; then
+	if [ "$ISA64" != "" -a ${KERNEL_BITS:-64} -eq 64 ]; then
 	    OUT="solaris64-x86_64-$CC"
 	else
 	    OUT="solaris-x86-$CC"
@@ -717,17 +751,23 @@ case "$GUESSOS" in
   sparc64-*-*bsd*)	OUT="BSD-sparc64" ;;
   ia64-*-*bsd*)		OUT="BSD-ia64" ;;
   amd64-*-*bsd*)	OUT="BSD-x86_64" ;;
-  *86*-*-*bsd*)		# mimic ld behaviour when it's looking for libc...
-			if [ -L /usr/lib/libc.so ]; then	# [Free|Net]BSD
-			    libc=/usr/lib/libc.so
-			else					# OpenBSD
-			    # ld searches for highest libc.so.* and so do we
-			    libc=`(ls /usr/lib/libc.so.* | tail -1) 2>/dev/null`
-			fi
-			case "`(file -L $libc) 2>/dev/null`" in
-			*ELF*)	OUT="BSD-x86-elf" ;;
-			*)	OUT="BSD-x86"; options="$options no-sse2" ;;
-			esac ;;
+  *86*-*-*bsd*)		if [ -z ${CROSS_COMPILE} ]; then 
+			   # mimic ld behaviour when it's looking for libc...
+			   if [ -L /usr/lib/libc.so ]; then	# [Free|Net]BSD
+			       libc=/usr/lib/libc.so
+			   else					# OpenBSD
+			       # ld searches for highest libc.so.* and so do we
+			       libc=`(ls /usr/lib/libc.so.* | tail -1) 2>/dev/null`
+			   fi
+			   echo "libc = $libc"
+			   case "`(file -L $libc) 2>/dev/null`" in
+			   *ELF*)	OUT="BSD-x86-elf" ;;
+			   *)	OUT="BSD-x86"; options="$options no-sse2" ;;
+			  esac 
+			else
+			   OUT="BSD-x86-elf"
+			fi;;
+  ppc85xx-*-*bsd*)      OUT="BSD-ppc85xx" ;;  # MPC85XX has no hardware FP accelerator
   *-*-*bsd*)		OUT="BSD-generic32" ;;
 
   *-*-osf)		OUT="osf1-alpha-cc" ;;
@@ -825,6 +865,7 @@ case "$GUESSOS" in
   *-*-qnx6) OUT="QNX6" ;;
   x86-*-android|i?86-*-android) OUT="android-x86" ;;
   armv[7-9]*-*-android) OUT="android-armv7" ;;
+  aarch64-*-android) OUT="android64-aarch64" ;;
   *) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;;
 esac
 
diff --git a/crypto/Makefile b/crypto/Makefile
index 22cb2a5013..7304684f76 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -87,6 +87,7 @@ ppccpuid.s:	ppccpuid.pl;	$(PERL) ppccpuid.pl $(PERLASM_SCHEME) $@
 pariscid.s:	pariscid.pl;	$(PERL) pariscid.pl $(PERLASM_SCHEME) $@
 alphacpuid.s:	alphacpuid.pl
 	$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
+arm64cpuid.S:	arm64cpuid.pl;	$(PERL) arm64cpuid.pl $(PERLASM_SCHEME) > $@
 
 subdirs:
 	@target=all; $(RECURSIVE_MAKE)
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index 8edd358bd3..34760d174a 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -71,6 +71,8 @@ aes-sparcv9.s: asm/aes-sparcv9.pl
 
 aes-ppc.s:	asm/aes-ppc.pl
 	$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
+aesp8-ppc.s:	asm/aesp8-ppc.pl
+	$(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@
 
 aes-parisc.s:	asm/aes-parisc.pl
 	$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
@@ -78,6 +80,10 @@ aes-parisc.s:	asm/aes-parisc.pl
 aes-mips.S:	asm/aes-mips.pl
 	$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
 
+aesv8-armx.S:	asm/aesv8-armx.pl
+	$(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
+aesv8-armx.o:	aesv8-armx.S
+
 # GNU make "catch all"
 aes-%.S:	asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 aes-armv4.o:	aes-armv4.S
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 55b6e04b67..ed5125827b 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -32,8 +32,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 16%
 # improvement on Cortex A8 core and ~21.5 cycles per byte.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $s0="r0";
 $s1="r1";
@@ -171,7 +183,12 @@ AES_encrypt:
 	stmdb   sp!,{r1,r4-r12,lr}
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
+#ifdef	__APPLE__
+	mov	$tbl,#AES_encrypt-AES_Te
+	sub	$tbl,r3,$tbl			@ Te
+#else
 	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
+#endif
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
@@ -425,7 +442,12 @@ AES_set_encrypt_key:
 	bne	.Labrt
 
 .Lok:	stmdb   sp!,{r4-r12,lr}
+#ifdef	__APPLE__
+	mov	$tbl,#AES_set_encrypt_key-AES_Te-1024
+	sub	$tbl,r3,$tbl					@ Te4
+#else
 	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
+#endif
 
 	mov	$rounds,r0		@ inp
 	mov	lr,r1			@ bits
@@ -886,7 +908,12 @@ AES_decrypt:
 	stmdb   sp!,{r1,r4-r12,lr}
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
+#ifdef	__APPLE__
+	mov	$tbl,#AES_decrypt-AES_Td
+	sub	$tbl,r3,$tbl				@ Td
+#else
 	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td
+#endif
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
diff --git a/crypto/aes/asm/aes-c64x.pl b/crypto/aes/asm/aes-c64x.pl
new file mode 100644
index 0000000000..0817128c1b
--- /dev/null
+++ b/crypto/aes/asm/aes-c64x.pl
@@ -0,0 +1,1375 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x.
+#
+# Even though loops are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*)	This means that there might be subtle correlation between data
+#	and timing and one can wonder if it can be ... attacked:-(
+#	On the other hand this also means that *if* one chooses to
+#	implement *4* T-tables variant [instead of 1 T-table as in
+#	this implementation, or in addition to], then one ought to
+#	*interleave* them. Even though it complicates addressing,
+#	references to interleaved tables would be guaranteed not to
+#	clash. I reckon that it should be possible to break 8 cycles
+#	per byte "barrier," i.e. improve by ~20%, naturally at the
+#	cost of 8x increased pressure on L1D. 8x because you'd have
+#	to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	AES_encrypt,_AES_encrypt
+	.asg	AES_decrypt,_AES_decrypt
+	.asg	AES_set_encrypt_key,_AES_set_encrypt_key
+	.asg	AES_set_decrypt_key,_AES_set_decrypt_key
+	.asg	AES_ctr32_encrypt,_AES_ctr32_encrypt
+	.endif
+
+	.asg	B3,RA
+	.asg	A4,INP
+	.asg	B4,OUT
+	.asg	A6,KEY
+	.asg	A4,RET
+	.asg	B15,SP
+
+	.eval	24,EXT0
+	.eval	16,EXT1
+	.eval	8,EXT2
+	.eval	0,EXT3
+	.eval	8,TBL1
+	.eval	16,TBL2
+	.eval	24,TBL3
+
+	.if	.BIG_ENDIAN
+	.eval	24-EXT0,EXT0
+	.eval	24-EXT1,EXT1
+	.eval	24-EXT2,EXT2
+	.eval	24-EXT3,EXT3
+	.eval	32-TBL1,TBL1
+	.eval	32-TBL2,TBL2
+	.eval	32-TBL3,TBL3
+	.endif
+
+	.global	_AES_encrypt
+_AES_encrypt:
+	.asmfunc
+	MVK	1,B2
+__encrypt:
+	.if	__TI_EABI__
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.else
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Te-__encrypt),$TEA
+||	ADDKPC	__encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Te-__encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.endif
+	LDW	*$KPA++[2],$Te0[0]		; zero round key
+||	LDW	*$KPB++[2],$Te0[1]
+||	MVK	60,A0
+||	ADD	B0,$TEA,$TEA			; AES_Te
+	LDW	*KEY[A0],B0			; rounds
+||	MVK	1024,A0				; sizeof(AES_Te)
+	LDW	*$KPA++[2],$Te0[2]
+||	LDW	*$KPB++[2],$Te0[3]
+||	MV	$TEA,$TEB
+	NOP
+	.if	.BIG_ENDIAN
+	MV	A9,$s[0]
+||	MV	A8,$s[1]
+||	MV	B9,$s[2]
+||	MV	B8,$s[3]
+	.else
+	MV	A8,$s[0]
+||	MV	A9,$s[1]
+||	MV	B8,$s[2]
+||	MV	B9,$s[3]
+	.endif
+	XOR	$Te0[0],$s[0],$s[0]
+||	XOR	$Te0[1],$s[1],$s[1]
+||	LDW	*$KPA++[2],$K[0]		; 1st round key
+||	LDW	*$KPB++[2],$K[1]
+
+	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+||	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+||	SUB	B0,1,B0
+;;====================================================================
+enc_loop?:
+	LDW	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
+||	LDW	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDW	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],	t2
+||	LDW	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],	t3
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDW	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],	t0
+||	LDW	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],	t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDW	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],	t0
+||	LDW	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],	t1
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDW	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],	t2
+||	LDW	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],	t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDW	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],	t2
+||	LDW	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],	t3
+||	ROTL	$Te1[1],TBL1,$Te3[0]		; t0
+||	ROTL	$Te3[0],TBL3,$Te1[1]		; t1
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDW	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],	t0
+||	LDW	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],	t1
+||	ROTL	$Te3[1],TBL3,$Te1[0]		; t2
+||	ROTL	$Te1[0],TBL1,$Te3[1]		; t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+|| [B0]	SUB	B0,1,B0
+	LDW	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
+||	LDW	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
+||	ROTL	$Te2[2],TBL2,$Te2[2]		; t0
+||	ROTL	$Te2[3],TBL2,$Te2[3]		; t1
+||	XOR	$K[0],$Te3[0],$s[0]
+||	XOR	$K[1],$Te1[1],$s[1]
+|| [B0]	BNOP	enc_loop?
+	ROTL	$Te3[3],TBL3,$Te1[2]		; t0
+||	ROTL	$Te1[2],TBL1,$Te3[3]		; t1
+||	XOR	$K[2],$Te1[0],$s[2]
+||	XOR	$K[3],$Te3[1],$s[3]
+||	LDW	*$KPA++[2],$K[0]		; next round key
+||	LDW	*$KPB++[2],$K[1]
+	ROTL	$Te2[0],TBL2,$Te2[0]		; t2
+||	ROTL	$Te2[1],TBL2,$Te2[1]		; t3
+||	XOR	$s[0],$Te2[2],$s[0]
+||	XOR	$s[1],$Te2[3],$s[1]
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+	ROTL	$Te1[3],TBL1,$Te3[2]		; t2
+||	ROTL	$Te3[2],TBL3,$Te1[3]		; t3
+||	XOR	$s[0],$Te1[2],$s[0]
+||	XOR	$s[1],$Te3[3],$s[1]
+	XOR	$s[2],$Te2[0],$s[2]
+||	XOR	$s[3],$Te2[1],$s[3]
+||	XOR	$s[0],$Te0[0],$s[0]
+||	XOR	$s[1],$Te0[1],$s[1]
+	XOR	$s[2],$Te3[2],$s[2]
+||	XOR	$s[3],$Te1[3],$s[3]
+||	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+||[!B0]	ADD	${TEA},A0,${TEA}		; point to Te4
+||[!B0]	ADD	${TEB},A0,${TEB}
+;;====================================================================
+	LDBU	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
+||	LDBU	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDBU	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],	t0
+||	LDBU	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],	t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDBU	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],	t0
+||	LDBU	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],	t1
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDBU	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],	t0
+||	LDBU	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],	t1
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDBU	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],	t2
+||	LDBU	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],	t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDBU	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],	t2
+||	LDBU	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],	t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDBU	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
+||	LDBU	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDBU	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],	t2
+||	LDBU	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],	t3
+
+	.if	.BIG_ENDIAN
+	PACK2	$Te0[0],$Te1[1],$Te0[0]
+||	PACK2	$Te0[1],$Te1[2],$Te0[1]
+	PACK2	$Te2[2],$Te3[3],$Te2[2]
+||	PACK2	$Te2[3],$Te3[0],$Te2[3]
+	PACKL4	$Te0[0],$Te2[2],$Te0[0]
+||	PACKL4	$Te0[1],$Te2[3],$Te0[1]
+	XOR	$K[0],$Te0[0],$Te0[0]		; s[0]
+||	XOR	$K[1],$Te0[1],$Te0[1]		; s[1]
+
+	PACK2	$Te0[2],$Te1[3],$Te0[2]
+||	PACK2	$Te0[3],$Te1[0],$Te0[3]
+	PACK2	$Te2[0],$Te3[1],$Te2[0]
+||	PACK2	$Te2[1],$Te3[2],$Te2[1]
+||	BNOP	RA
+	PACKL4	$Te0[2],$Te2[0],$Te0[2]
+||	PACKL4	$Te0[3],$Te2[1],$Te0[3]
+	XOR	$K[2],$Te0[2],$Te0[2]		; s[2]
+||	XOR	$K[3],$Te0[3],$Te0[3]		; s[3]
+
+	MV	$Te0[0],A9
+||	MV	$Te0[1],A8
+	MV	$Te0[2],B9
+||	MV	$Te0[3],B8
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.else
+	PACK2	$Te1[1],$Te0[0],$Te1[1]
+||	PACK2	$Te1[2],$Te0[1],$Te1[2]
+	PACK2	$Te3[3],$Te2[2],$Te3[3]
+||	PACK2	$Te3[0],$Te2[3],$Te3[0]
+	PACKL4	$Te3[3],$Te1[1],$Te1[1]
+||	PACKL4	$Te3[0],$Te1[2],$Te1[2]
+	XOR	$K[0],$Te1[1],$Te1[1]		; s[0]
+||	XOR	$K[1],$Te1[2],$Te1[2]		; s[1]
+
+	PACK2	$Te1[3],$Te0[2],$Te1[3]
+||	PACK2	$Te1[0],$Te0[3],$Te1[0]
+	PACK2	$Te3[1],$Te2[0],$Te3[1]
+||	PACK2	$Te3[2],$Te2[1],$Te3[2]
+||	BNOP	RA
+	PACKL4	$Te3[1],$Te1[3],$Te1[3]
+||	PACKL4	$Te3[2],$Te1[0],$Te1[0]
+	XOR	$K[2],$Te1[3],$Te1[3]		; s[2]
+||	XOR	$K[3],$Te1[0],$Te1[0]		; s[3]
+
+	MV	$Te1[1],A8
+||	MV	$Te1[2],A9
+	MV	$Te1[3],B8
+||	MV	$Te1[0],B9
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.endif
+	.endasmfunc
+
+	.global	_AES_decrypt
+_AES_decrypt:
+	.asmfunc
+	MVK	1,B2
+__decrypt:
+	.if	__TI_EABI__
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	\$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||	ADDKPC	__decrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	\$PCR_OFFSET(AES_Td,__decrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.else
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Td-__decrypt),$TEA
+||	ADDKPC	__decrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Td-__decrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	.endif
+	LDW	*$KPA++[2],$Td0[0]		; zero round key
+||	LDW	*$KPB++[2],$Td0[1]
+||	MVK	60,A0
+||	ADD	B0,$TEA,$TEA			; AES_Td
+	LDW	*KEY[A0],B0			; rounds
+||	MVK	1024,A0				; sizeof(AES_Td)
+	LDW	*$KPA++[2],$Td0[2]
+||	LDW	*$KPB++[2],$Td0[3]
+||	MV	$TEA,$TEB
+	NOP
+	.if	.BIG_ENDIAN
+	MV	A9,$s[0]
+||	MV	A8,$s[1]
+||	MV	B9,$s[2]
+||	MV	B8,$s[3]
+	.else
+	MV	A8,$s[0]
+||	MV	A9,$s[1]
+||	MV	B8,$s[2]
+||	MV	B9,$s[3]
+	.endif
+	XOR	$Td0[0],$s[0],$s[0]
+||	XOR	$Td0[1],$s[1],$s[1]
+||	LDW	*$KPA++[2],$K[0]		; 1st round key
+||	LDW	*$KPB++[2],$K[1]
+
+	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+||	EXTU	$s[1],EXT3,24,$Td3[1]
+||	EXTU	$s[0],EXT1,24,$Td1[0]
+||	SUB	B0,1,B0
+;;====================================================================
+dec_loop?:
+	LDW	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
+||	LDW	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
+||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Td0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[1],EXT1,24,$Td1[1]
+||	EXTU	$s[0],EXT3,24,$Td3[0]
+	LDW	*${TEB}[$Td1[1]],$Td1[1]	; Td1[s1>>8],	t2
+||	LDW	*${TEA}[$Td3[0]],$Td3[0]	; Td3[s0>>24],	t3
+||	EXTU	$s[2],EXT2,24,$Td2[2]
+||	EXTU	$s[3],EXT2,24,$Td2[3]
+	LDW	*${TEA}[$Td2[2]],$Td2[2]	; Td2[s2>>16],	t0
+||	LDW	*${TEB}[$Td2[3]],$Td2[3]	; Td2[s3>>16],	t1
+||	EXTU	$s[3],EXT1,24,$Td1[3]
+||	EXTU	$s[2],EXT3,24,$Td3[2]
+	LDW	*${TEB}[$Td1[3]],$Td1[3]	; Td1[s3>>8],	t0
+||	LDW	*${TEA}[$Td3[2]],$Td3[2]	; Td3[s2>>24],	t1
+||	EXTU	$s[0],EXT2,24,$Td2[0]
+||	EXTU	$s[1],EXT2,24,$Td2[1]
+	LDW	*${TEA}[$Td2[0]],$Td2[0]	; Td2[s0>>16],	t2
+||	LDW	*${TEB}[$Td2[1]],$Td2[1]	; Td2[s1>>16],	t3
+||	EXTU	$s[3],EXT3,24,$Td3[3]
+||	EXTU	$s[2],EXT1,24,$Td1[2]
+	LDW	*${TEB}[$Td3[3]],$Td3[3]	; Td3[s3>>24],	t2
+||	LDW	*${TEA}[$Td1[2]],$Td1[2]	; Td1[s2>>8],	t3
+||	ROTL	$Td3[1],TBL3,$Td1[0]		; t0
+||	ROTL	$Td1[0],TBL1,$Td3[1]		; t1
+||	EXTU	$s[0],EXT0,24,$Td0[0]
+||	EXTU	$s[1],EXT0,24,$Td0[1]
+	LDW	*${TEA}[$Td0[0]],$Td0[0]	; Td0[s0],	t0
+||	LDW	*${TEB}[$Td0[1]],$Td0[1]	; Td0[s1],	t1
+||	ROTL	$Td1[1],TBL1,$Td3[0]		; t2
+||	ROTL	$Td3[0],TBL3,$Td1[1]		; t3
+||	EXTU	$s[2],EXT0,24,$Td0[2]
+||	EXTU	$s[3],EXT0,24,$Td0[3]
+|| [B0]	SUB	B0,1,B0
+	LDW	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
+||	LDW	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3
+||	ROTL	$Td2[2],TBL2,$Td2[2]		; t0
+||	ROTL	$Td2[3],TBL2,$Td2[3]		; t1
+||	XOR	$K[0],$Td1[0],$s[0]
+||	XOR	$K[1],$Td3[1],$s[1]
+|| [B0]	BNOP	dec_loop?
+	ROTL	$Td1[3],TBL1,$Td3[2]		; t0
+||	ROTL	$Td3[2],TBL3,$Td1[3]		; t1
+||	XOR	$K[2],$Td3[0],$s[2]
+||	XOR	$K[3],$Td1[1],$s[3]
+||	LDW	*$KPA++[2],$K[0]		; next round key
+||	LDW	*$KPB++[2],$K[1]
+	ROTL	$Td2[0],TBL2,$Td2[0]		; t2
+||	ROTL	$Td2[1],TBL2,$Td2[1]		; t3
+||	XOR	$s[0],$Td2[2],$s[0]
+||	XOR	$s[1],$Td2[3],$s[1]
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+	ROTL	$Td3[3],TBL3,$Td1[2]		; t2
+||	ROTL	$Td1[2],TBL1,$Td3[3]		; t3
+||	XOR	$s[0],$Td3[2],$s[0]
+||	XOR	$s[1],$Td1[3],$s[1]
+	XOR	$s[2],$Td2[0],$s[2]
+||	XOR	$s[3],$Td2[1],$s[3]
+||	XOR	$s[0],$Td0[0],$s[0]
+||	XOR	$s[1],$Td0[1],$s[1]
+	XOR	$s[2],$Td1[2],$s[2]
+||	XOR	$s[3],$Td3[3],$s[3]
+||	EXTU	$s[1],EXT3,24,$Td3[1]
+||	EXTU	$s[0],EXT1,24,$Td1[0]
+||[!B0]	ADD	${TEA},A0,${TEA}		; point to Td4
+||[!B0]	ADD	${TEB},A0,${TEB}
+;;====================================================================
+	LDBU	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
+||	LDBU	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
+||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Td0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[0],EXT0,24,$Td0[0]
+||	EXTU	$s[1],EXT0,24,$Td0[1]
+	LDBU	*${TEA}[$Td0[0]],$Td0[0]	; Td0[s0],	t0
+||	LDBU	*${TEB}[$Td0[1]],$Td0[1]	; Td0[s1],	t1
+||	EXTU	$s[2],EXT2,24,$Td2[2]
+||	EXTU	$s[3],EXT2,24,$Td2[3]
+	LDBU	*${TEA}[$Td2[2]],$Td2[2]	; Td2[s2>>16],	t0
+||	LDBU	*${TEB}[$Td2[3]],$Td2[3]	; Td2[s3>>16],	t1
+||	EXTU	$s[3],EXT1,24,$Td1[3]
+||	EXTU	$s[2],EXT3,24,$Td3[2]
+	LDBU	*${TEB}[$Td1[3]],$Td1[3]	; Td1[s3>>8],	t0
+||	LDBU	*${TEA}[$Td3[2]],$Td3[2]	; Td3[s2>>24],	t1
+||	EXTU	$s[1],EXT1,24,$Td1[1]
+||	EXTU	$s[0],EXT3,24,$Td3[0]
+	LDBU	*${TEB}[$Td1[1]],$Td1[1]	; Td1[s1>>8],	t2
+||	LDBU	*${TEA}[$Td3[0]],$Td3[0]	; Td3[s0>>24],	t3
+||	EXTU	$s[0],EXT2,24,$Td2[0]
+||	EXTU	$s[1],EXT2,24,$Td2[1]
+	LDBU	*${TEA}[$Td2[0]],$Td2[0]	; Td2[s0>>16],	t2
+||	LDBU	*${TEB}[$Td2[1]],$Td2[1]	; Td2[s1>>16],	t3
+||	EXTU	$s[3],EXT3,24,$Td3[3]
+||	EXTU	$s[2],EXT1,24,$Td1[2]
+	LDBU	*${TEB}[$Td3[3]],$Td3[3]	; Td3[s3>>24],	t2
+||	LDBU	*${TEA}[$Td1[2]],$Td1[2]	; Td1[s2>>8],	t3
+||	EXTU	$s[2],EXT0,24,$Td0[2]
+||	EXTU	$s[3],EXT0,24,$Td0[3]
+	LDBU	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
+||	LDBU	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3
+
+	.if	.BIG_ENDIAN
+	PACK2	$Td0[0],$Td1[3],$Td0[0]
+||	PACK2	$Td0[1],$Td1[0],$Td0[1]
+	PACK2	$Td2[2],$Td3[1],$Td2[2]
+||	PACK2	$Td2[3],$Td3[2],$Td2[3]
+	PACKL4	$Td0[0],$Td2[2],$Td0[0]
+||	PACKL4	$Td0[1],$Td2[3],$Td0[1]
+	XOR	$K[0],$Td0[0],$Td0[0]		; s[0]
+||	XOR	$K[1],$Td0[1],$Td0[1]		; s[1]
+
+	PACK2	$Td0[2],$Td1[1],$Td0[2]
+||	PACK2	$Td0[3],$Td1[2],$Td0[3]
+	PACK2	$Td2[0],$Td3[3],$Td2[0]
+||	PACK2	$Td2[1],$Td3[0],$Td2[1]
+||	BNOP	RA
+	PACKL4	$Td0[2],$Td2[0],$Td0[2]
+||	PACKL4	$Td0[3],$Td2[1],$Td0[3]
+	XOR	$K[2],$Td0[2],$Td0[2]		; s[2]
+||	XOR	$K[3],$Td0[3],$Td0[3]		; s[3]
+
+	MV	$Td0[0],A9
+||	MV	$Td0[1],A8
+	MV	$Td0[2],B9
+||	MV	$Td0[3],B8
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.else
+	PACK2	$Td1[3],$Td0[0],$Td1[3]
+||	PACK2	$Td1[0],$Td0[1],$Td1[0]
+	PACK2	$Td3[1],$Td2[2],$Td3[1]
+||	PACK2	$Td3[2],$Td2[3],$Td3[2]
+	PACKL4	$Td3[1],$Td1[3],$Td1[3]
+||	PACKL4	$Td3[2],$Td1[0],$Td1[0]
+	XOR	$K[0],$Td1[3],$Td1[3]		; s[0]
+||	XOR	$K[1],$Td1[0],$Td1[0]		; s[1]
+
+	PACK2	$Td1[1],$Td0[2],$Td1[1]
+||	PACK2	$Td1[2],$Td0[3],$Td1[2]
+	PACK2	$Td3[3],$Td2[0],$Td3[3]
+||	PACK2	$Td3[0],$Td2[1],$Td3[0]
+||	BNOP	RA
+	PACKL4	$Td3[3],$Td1[1],$Td1[1]
+||	PACKL4	$Td3[0],$Td1[2],$Td1[2]
+	XOR	$K[2],$Td1[1],$Td1[1]		; s[2]
+||	XOR	$K[3],$Td1[2],$Td1[2]		; s[3]
+
+	MV	$Td1[3],A8
+||	MV	$Td1[0],A9
+	MV	$Td1[1],B8
+||	MV	$Td1[2],B9
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.endif
+	.endasmfunc
+___
+{
+my @K=(@K,@s);			# extended key
+my @Te4=map("B$_",(16..19));
+
+my @Kx9=@Te0;			# used in AES_set_decrypt_key
+my @KxB=@Te1;
+my @KxD=@Te2;
+my @KxE=@Te3;
+
+$code.=<<___;
+	.asg	OUT,BITS
+
+	.global	_AES_set_encrypt_key
+_AES_set_encrypt_key:
+__set_encrypt_key:
+	.asmfunc
+	MV	INP,A0
+||	SHRU	BITS,5,BITS			; 128-192-256 -> 4-6-8
+||	MV	KEY,A1
+  [!A0]	B	RA
+||[!A0]	MVK	-1,RET
+||[!A0]	MVK	1,A1				; only one B RA
+  [!A1]	B	RA
+||[!A1]	MVK	-1,RET
+||[!A1]	MVK	0,A0
+||	MVK	0,B0
+||	MVK	0,A1
+   [A0]	LDNDW	*INP++,A9:A8
+|| [A0]	CMPEQ	4,BITS,B0
+|| [A0]	CMPLT	3,BITS,A1
+   [B0]	B	key128?
+|| [A1]	LDNDW	*INP++,B9:B8
+|| [A0]	CMPEQ	6,BITS,B0
+|| [A0]	CMPLT	5,BITS,A1
+   [B0]	B	key192?
+|| [A1]	LDNDW	*INP++,B17:B16
+|| [A0]	CMPEQ	8,BITS,B0
+|| [A0]	CMPLT	7,BITS,A1
+   [B0]	B	key256?
+|| [A1]	LDNDW	*INP++,B19:B18
+
+	.if	__TI_EABI__
+   [A0]	ADD	0,KEY,$KPA
+|| [A0]	ADD	4,KEY,$KPB
+|| [A0]	MVKL	\$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+|| [A0]	ADDKPC	__set_encrypt_key,B6
+   [A0]	MVKH	\$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
+   [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
+	.else
+   [A0]	ADD	0,KEY,$KPA
+|| [A0]	ADD	4,KEY,$KPB
+|| [A0]	MVKL	(AES_Te4-__set_encrypt_key),$TEA
+|| [A0]	ADDKPC	__set_encrypt_key,B6
+   [A0]	MVKH	(AES_Te4-__set_encrypt_key),$TEA
+   [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
+	.endif
+	NOP
+	NOP
+
+	BNOP	RA,5
+||	MVK	-2,RET				; unknown bit length
+||	MVK	0,B0				; redundant
+;;====================================================================
+;;====================================================================
+key128?:
+	.if	.BIG_ENDIAN
+	MV	A9,$K[0]
+||	MV	A8,$K[1]
+||	MV	B9,$Te4[2]
+||	MV	B8,$K[3]
+	.else
+	MV	A8,$K[0]
+||	MV	A9,$K[1]
+||	MV	B8,$Te4[2]
+||	MV	B9,$K[3]
+	.endif
+
+	MVK	256,A0
+||	MVK	8,B0
+
+	MV	$TEA,$TEB
+||	ADD	$TEA,A0,A30			; rcon
+;;====================================================================
+loop128?:
+	LDW	*A30++[1],A31			; rcon[i]
+||	MV	$Te4[2],$K[2]
+||	EXTU	$K[3],EXT1,24,$Te4[0]
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[3],A0
+||	EXTU	$K[3],EXT2,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT3,24,A0
+||	EXTU	$K[3],EXT0,24,$Te4[3]
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	.endif
+
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+
+	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
+	.if	.BIG_ENDIAN
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+||	BDEC	loop128?,B0
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+	.else
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+||	BDEC	loop128?,B0
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+	.endif
+	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
+	XOR	$Te4[0],$K[1],$K[1]		; K[1]
+	MV	$Te4[0],$K[0]
+||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
+	XOR	$Te4[2],$K[3],$K[3]		; K[3]
+;;====================================================================
+	BNOP	RA
+	MV	$Te4[2],$K[2]
+||	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	MVK	10,B0				; rounds
+	STW	B0,*++${KPB}[15]
+	MVK	0,RET
+;;====================================================================
+;;====================================================================
+key192?:
+	.if	.BIG_ENDIAN
+	MV	A9,$K[0]
+||	MV	A8,$K[1]
+||	MV	B9,$K[2]
+||	MV	B8,$K[3]
+	MV	B17,$Te4[2]
+||	MV	B16,$K[5]
+	.else
+	MV	A8,$K[0]
+||	MV	A9,$K[1]
+||	MV	B8,$K[2]
+||	MV	B9,$K[3]
+	MV	B16,$Te4[2]
+||	MV	B17,$K[5]
+	.endif
+
+	MVK	256,A0
+||	MVK	6,B0
+	MV	$TEA,$TEB
+||	ADD	$TEA,A0,A30			; rcon
+;;====================================================================
+loop192?:
+	LDW	*A30++[1],A31			; rcon[i]
+||	MV	$Te4[2],$K[4]
+||	EXTU	$K[5],EXT1,24,$Te4[0]
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[5],A0
+||	EXTU	$K[5],EXT2,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT3,24,A0
+||	EXTU	$K[5],EXT0,24,$Te4[3]
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	.endif
+
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	STW	$K[4],*$KPA++[2]
+||	STW	$K[5],*$KPB++[2]
+
+	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
+	.if	.BIG_ENDIAN
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+	.else
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+	.endif
+	BDEC	loop192?,B0
+||	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
+	XOR	$Te4[0],$K[1],$K[1]		; K[1]
+	MV	$Te4[0],$K[0]
+||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
+	XOR	$Te4[2],$K[3],$K[3]		; K[3]
+	MV	$Te4[2],$K[2]
+||	XOR	$K[3],$K[4],$Te4[2]		; K[4]
+	XOR	$Te4[2],$K[5],$K[5]		; K[5]
+;;====================================================================
+	BNOP	RA
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	MVK	12,B0				; rounds
+	STW	B0,*++${KPB}[7]
+	MVK	0,RET
+;;====================================================================
+;;====================================================================
+key256?:
+	.if	.BIG_ENDIAN
+	MV	A9,$K[0]
+||	MV	A8,$K[1]
+||	MV	B9,$K[2]
+||	MV	B8,$K[3]
+	MV	B17,$K[4]
+||	MV	B16,$K[5]
+||	MV	B19,$Te4[2]
+||	MV	B18,$K[7]
+	.else
+	MV	A8,$K[0]
+||	MV	A9,$K[1]
+||	MV	B8,$K[2]
+||	MV	B9,$K[3]
+	MV	B16,$K[4]
+||	MV	B17,$K[5]
+||	MV	B18,$Te4[2]
+||	MV	B19,$K[7]
+	.endif
+
+	MVK	256,A0
+||	MVK	6,B0
+	MV	$TEA,$TEB
+||	ADD	$TEA,A0,A30			; rcon
+;;====================================================================
+loop256?:
+	LDW	*A30++[1],A31			; rcon[i]
+||	MV	$Te4[2],$K[6]
+||	EXTU	$K[7],EXT1,24,$Te4[0]
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[7],A0
+||	EXTU	$K[7],EXT2,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT3,24,A0
+||	EXTU	$K[7],EXT0,24,$Te4[3]
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	.endif
+
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	STW	$K[4],*$KPA++[2]
+||	STW	$K[5],*$KPB++[2]
+	STW	$K[6],*$KPA++[2]
+||	STW	$K[7],*$KPB++[2]
+||	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
+	.if	.BIG_ENDIAN
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+||[!B0]	B	done256?
+	.else
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+||[!B0]	B	done256?
+	.endif
+	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
+	XOR	$Te4[0],$K[1],$K[1]		; K[1]
+	MV	$Te4[0],$K[0]
+||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
+	XOR	$Te4[2],$K[3],$K[3]		; K[3]
+
+	MV	$Te4[2],$K[2]
+|| [B0]	EXTU	$K[3],EXT0,24,$Te4[0]
+|| [B0]	SUB	B0,1,B0
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[3],A0
+||	EXTU	$K[3],EXT1,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT2,24,A0
+||	EXTU	$K[3],EXT3,24,$Te4[3]
+
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	NOP	3
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+||	B	loop256?
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	NOP	3
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+||	B	loop256?
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+	.endif
+
+	XOR	$Te4[3],$K[4],$Te4[0]		; K[4]
+	XOR	$Te4[0],$K[5],$K[5]		; K[5]
+	MV	$Te4[0],$K[4]
+||	XOR	$K[5],$K[6],$Te4[2]		; K[6]
+	XOR	$Te4[2],$K[7],$K[7]		; K[7]
+;;====================================================================
+done256?:
+	BNOP	RA
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	MVK	14,B0				; rounds
+	STW	B0,*--${KPB}[1]
+	MVK	0,RET
+	.endasmfunc
+
+	.global	_AES_set_decrypt_key
+_AES_set_decrypt_key:
+	.asmfunc
+	B	__set_encrypt_key		; guarantee local call
+	MV	KEY,B30				; B30 is not modified
+	MV	RA, B31				; B31 is not modified
+	ADDKPC	ret?,RA,2
+ret?:						; B0 holds rounds or zero
+  [!B0]	BNOP	B31				; return if zero
+   [B0]	SHL	B0,4,A0				; offset to last round key
+   [B0]	SHRU	B0,1,B2
+   [B0]	SUB	B2,2,B2
+|| [B0]	MVK	0x0000001B,B3			; AES polynomial
+   [B0]	MVKH	0x07000000,B3
+|| [B0]	MV	B30,$KPA
+   [B0]	ADD	B30,A0,$KPB
+|| [B0]	MVK	16,A0				; sizeof(round key)
+;;====================================================================
+flip_loop?:
+	LDW	*${KPA}[0],A16
+||	LDW	*${KPB}[0],B16
+	LDW	*${KPA}[1],A17
+||	LDW	*${KPB}[1],B17
+	LDW	*${KPA}[2],A18
+||	LDW	*${KPB}[2],B18
+	LDW	*${KPA}[3],A19
+||	ADD	$KPA,A0,$KPA
+||	LDW	*${KPB}[3],B19
+||	SUB	$KPB,A0,$KPB
+||	BDEC	flip_loop?,B2
+	NOP
+	STW	B16,*${KPA}[-4]
+||	STW	A16,*${KPB}[4]
+	STW	B17,*${KPA}[-3]
+||	STW	A17,*${KPB}[5]
+	STW	B18,*${KPA}[-2]
+||	STW	A18,*${KPB}[6]
+	STW	B19,*${KPA}[-1]
+||	STW	A19,*${KPB}[7]
+;;====================================================================
+	SUB	B0,1,B0				; skip last round
+||	ADD	B30,A0,$KPA			; skip first round
+||	ADD	B30,A0,$KPB
+||	MVC	GFPGFR,B30			; save GFPGFR
+	LDW	*${KPA}[0],$K[0]
+||	LDW	*${KPB}[1],$K[1]
+||	MVC	B3,GFPGFR
+	LDW	*${KPA}[2],$K[2]
+||	LDW	*${KPB}[3],$K[3]
+	MVK	0x00000909,A24
+||	MVK	0x00000B0B,B24
+	MVKH	0x09090000,A24
+||	MVKH	0x0B0B0000,B24
+	SUB	B0,1,B0
+
+	GMPY4	$K[0],A24,$Kx9[0]		; ·0x09
+||	GMPY4	$K[1],A24,$Kx9[1]
+||	MVK	0x00000D0D,A25
+||	MVK	0x00000E0E,B25
+	GMPY4	$K[2],A24,$Kx9[2]
+||	GMPY4	$K[3],A24,$Kx9[3]
+||	MVKH	0x0D0D0000,A25
+||	MVKH	0x0E0E0000,B25
+
+	GMPY4	$K[0],B24,$KxB[0]		; ·0x0B
+||	GMPY4	$K[1],B24,$KxB[1]
+	GMPY4	$K[2],B24,$KxB[2]
+||	GMPY4	$K[3],B24,$KxB[3]
+
+;;====================================================================
+invmix_loop?:
+	GMPY4	$K[0],A25,$KxD[0]		; ·0x0D
+||	GMPY4	$K[1],A25,$KxD[1]
+||	SWAP2	$Kx9[0],$Kx9[0]			; rotate by 16
+||	SWAP2	$Kx9[1],$Kx9[1]
+||	MV	$K[0],$s[0]			; this or DINT
+||	MV	$K[1],$s[1]
+|| [B0]	LDW	*${KPA}[4],$K[0]
+|| [B0]	LDW	*${KPB}[5],$K[1]
+	GMPY4	$K[2],A25,$KxD[2]
+||	GMPY4	$K[3],A25,$KxD[3]
+||	SWAP2	$Kx9[2],$Kx9[2]
+||	SWAP2	$Kx9[3],$Kx9[3]
+||	MV	$K[2],$s[2]
+||	MV	$K[3],$s[3]
+|| [B0]	LDW	*${KPA}[6],$K[2]
+|| [B0]	LDW	*${KPB}[7],$K[3]
+
+	GMPY4	$s[0],B25,$KxE[0]		; ·0x0E
+||	GMPY4	$s[1],B25,$KxE[1]
+||	XOR	$Kx9[0],$KxB[0],$KxB[0]
+||	XOR	$Kx9[1],$KxB[1],$KxB[1]
+	GMPY4	$s[2],B25,$KxE[2]
+||	GMPY4	$s[3],B25,$KxE[3]
+||	XOR	$Kx9[2],$KxB[2],$KxB[2]
+||	XOR	$Kx9[3],$KxB[3],$KxB[3]
+
+	ROTL	$KxB[0],TBL3,$KxB[0]
+||	ROTL	$KxB[1],TBL3,$KxB[1]
+||	SWAP2	$KxD[0],$KxD[0]			; rotate by 16
+||	SWAP2	$KxD[1],$KxD[1]
+	ROTL	$KxB[2],TBL3,$KxB[2]
+||	ROTL	$KxB[3],TBL3,$KxB[3]
+||	SWAP2	$KxD[2],$KxD[2]
+||	SWAP2	$KxD[3],$KxD[3]
+|| [B0]	B	invmix_loop?
+
+	XOR	$KxE[0],$KxD[0],$KxE[0]
+||	XOR	$KxE[1],$KxD[1],$KxE[1]
+|| [B0]	GMPY4	$K[0],A24,$Kx9[0]		; ·0x09
+|| [B0]	GMPY4	$K[1],A24,$Kx9[1]
+||	ADDAW	$KPA,4,$KPA
+	XOR	$KxE[2],$KxD[2],$KxE[2]
+||	XOR	$KxE[3],$KxD[3],$KxE[3]
+|| [B0]	GMPY4	$K[2],A24,$Kx9[2]
+|| [B0]	GMPY4	$K[3],A24,$Kx9[3]
+||	ADDAW	$KPB,4,$KPB
+
+	XOR	$KxB[0],$KxE[0],$KxE[0]
+||	XOR	$KxB[1],$KxE[1],$KxE[1]
+|| [B0]	GMPY4	$K[0],B24,$KxB[0]		; ·0x0B
+|| [B0]	GMPY4	$K[1],B24,$KxB[1]
+	XOR	$KxB[2],$KxE[2],$KxE[2]
+||	XOR	$KxB[3],$KxE[3],$KxE[3]
+|| [B0]	GMPY4	$K[2],B24,$KxB[2]
+|| [B0]	GMPY4	$K[3],B24,$KxB[3]
+||	STW	$KxE[0],*${KPA}[-4]
+||	STW	$KxE[1],*${KPB}[-3]
+	STW	$KxE[2],*${KPA}[-2]
+||	STW	$KxE[3],*${KPB}[-1]
+|| [B0]	SUB	B0,1,B0
+;;====================================================================
+	BNOP	B31,3
+	MVC	B30,GFPGFR			; restore GFPGFR(*)
+	MVK	0,RET
+	.endasmfunc
+___
+# (*)	Even though ABI doesn't specify GFPGFR as non-volatile, there
+#	are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+	.global	_AES_ctr32_encrypt
+_AES_ctr32_encrypt:
+	.asmfunc
+	LDNDW	*${ivp}[0],A31:A30	; load counter value
+||	MV	$blocks,A2		; reassign $blocks
+||	MV	RA,B27			; reassign RA
+||	MV	$key,B26		; reassign $key
+	LDNDW	*${ivp}[1],B31:B30
+||	MVK	0,B2			; don't let __encrypt load input
+||	MVK	0,A1			; and postpone writing output
+	.if	.BIG_ENDIAN
+	NOP
+	.else
+	NOP	4
+	SWAP2	B31,B31			; keep least significant 32 bits
+	SWAP4	B31,B31			; in host byte order
+	.endif
+ctr32_loop?:
+   [A2]	BNOP	__encrypt
+|| [A1]	XOR	A29,A9,A9		; input^Ek(counter)
+|| [A1]	XOR	A28,A8,A8
+|| [A2]	LDNDW	*INP++,A29:A28		; load input
+  [!A2]	BNOP	B27			; return
+|| [A1]	XOR	B29,B9,B9
+|| [A1]	XOR	B28,B8,B8
+|| [A2]	LDNDW	*INP++,B29:B28
+	.if	.BIG_ENDIAN
+   [A1]	STNDW	A9:A8,*OUT++		; save output
+|| [A2]	MV	A31,A9			; pass counter value to __encrypt
+|| [A2]	MV	A30,A8			; pass counter value to __encrypt
+   [A1]	STNDW	B9:B8,*OUT++
+|| [A2]	DMV	B31,B30,B9:B8
+|| [A2]	ADD	B30,1,B30		; counter++
+	.else
+   [A1]	STNDW	A9:A8,*OUT++		; save output
+|| [A2]	MV	A31,A9
+|| [A2]	MV	A30,A8
+|| [A2]	SWAP2	B31,B0
+|| [A2]	ADD	B31,1,B31		; counter++
+   [A1]	STNDW	B9:B8,*OUT++
+|| [A2]	MV	B30,B8
+|| [A2]	SWAP4	B0,B9
+	.endif
+   [A2]	ADDKPC	ctr32_loop?,RA		; return to ctr32_loop?
+|| [A2]	MV	B26,KEY			; pass $key
+|| [A2]	SUB	A2,1,A2			; $blocks--
+||[!A1]	MVK	1,A1
+	NOP
+	NOP
+	.endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+	.if	__TI_EABI__
+	.sect	".text:aes_asm.const"
+	.else
+	.sect	".const:aes_asm"
+	.endif
+	.align	128
+AES_Te:
+	.byte	0xc6,0x63,0x63,0xa5,	0xf8,0x7c,0x7c,0x84
+	.byte	0xee,0x77,0x77,0x99,	0xf6,0x7b,0x7b,0x8d
+	.byte	0xff,0xf2,0xf2,0x0d,	0xd6,0x6b,0x6b,0xbd
+	.byte	0xde,0x6f,0x6f,0xb1,	0x91,0xc5,0xc5,0x54
+	.byte	0x60,0x30,0x30,0x50,	0x02,0x01,0x01,0x03
+	.byte	0xce,0x67,0x67,0xa9,	0x56,0x2b,0x2b,0x7d
+	.byte	0xe7,0xfe,0xfe,0x19,	0xb5,0xd7,0xd7,0x62
+	.byte	0x4d,0xab,0xab,0xe6,	0xec,0x76,0x76,0x9a
+	.byte	0x8f,0xca,0xca,0x45,	0x1f,0x82,0x82,0x9d
+	.byte	0x89,0xc9,0xc9,0x40,	0xfa,0x7d,0x7d,0x87
+	.byte	0xef,0xfa,0xfa,0x15,	0xb2,0x59,0x59,0xeb
+	.byte	0x8e,0x47,0x47,0xc9,	0xfb,0xf0,0xf0,0x0b
+	.byte	0x41,0xad,0xad,0xec,	0xb3,0xd4,0xd4,0x67
+	.byte	0x5f,0xa2,0xa2,0xfd,	0x45,0xaf,0xaf,0xea
+	.byte	0x23,0x9c,0x9c,0xbf,	0x53,0xa4,0xa4,0xf7
+	.byte	0xe4,0x72,0x72,0x96,	0x9b,0xc0,0xc0,0x5b
+	.byte	0x75,0xb7,0xb7,0xc2,	0xe1,0xfd,0xfd,0x1c
+	.byte	0x3d,0x93,0x93,0xae,	0x4c,0x26,0x26,0x6a
+	.byte	0x6c,0x36,0x36,0x5a,	0x7e,0x3f,0x3f,0x41
+	.byte	0xf5,0xf7,0xf7,0x02,	0x83,0xcc,0xcc,0x4f
+	.byte	0x68,0x34,0x34,0x5c,	0x51,0xa5,0xa5,0xf4
+	.byte	0xd1,0xe5,0xe5,0x34,	0xf9,0xf1,0xf1,0x08
+	.byte	0xe2,0x71,0x71,0x93,	0xab,0xd8,0xd8,0x73
+	.byte	0x62,0x31,0x31,0x53,	0x2a,0x15,0x15,0x3f
+	.byte	0x08,0x04,0x04,0x0c,	0x95,0xc7,0xc7,0x52
+	.byte	0x46,0x23,0x23,0x65,	0x9d,0xc3,0xc3,0x5e
+	.byte	0x30,0x18,0x18,0x28,	0x37,0x96,0x96,0xa1
+	.byte	0x0a,0x05,0x05,0x0f,	0x2f,0x9a,0x9a,0xb5
+	.byte	0x0e,0x07,0x07,0x09,	0x24,0x12,0x12,0x36
+	.byte	0x1b,0x80,0x80,0x9b,	0xdf,0xe2,0xe2,0x3d
+	.byte	0xcd,0xeb,0xeb,0x26,	0x4e,0x27,0x27,0x69
+	.byte	0x7f,0xb2,0xb2,0xcd,	0xea,0x75,0x75,0x9f
+	.byte	0x12,0x09,0x09,0x1b,	0x1d,0x83,0x83,0x9e
+	.byte	0x58,0x2c,0x2c,0x74,	0x34,0x1a,0x1a,0x2e
+	.byte	0x36,0x1b,0x1b,0x2d,	0xdc,0x6e,0x6e,0xb2
+	.byte	0xb4,0x5a,0x5a,0xee,	0x5b,0xa0,0xa0,0xfb
+	.byte	0xa4,0x52,0x52,0xf6,	0x76,0x3b,0x3b,0x4d
+	.byte	0xb7,0xd6,0xd6,0x61,	0x7d,0xb3,0xb3,0xce
+	.byte	0x52,0x29,0x29,0x7b,	0xdd,0xe3,0xe3,0x3e
+	.byte	0x5e,0x2f,0x2f,0x71,	0x13,0x84,0x84,0x97
+	.byte	0xa6,0x53,0x53,0xf5,	0xb9,0xd1,0xd1,0x68
+	.byte	0x00,0x00,0x00,0x00,	0xc1,0xed,0xed,0x2c
+	.byte	0x40,0x20,0x20,0x60,	0xe3,0xfc,0xfc,0x1f
+	.byte	0x79,0xb1,0xb1,0xc8,	0xb6,0x5b,0x5b,0xed
+	.byte	0xd4,0x6a,0x6a,0xbe,	0x8d,0xcb,0xcb,0x46
+	.byte	0x67,0xbe,0xbe,0xd9,	0x72,0x39,0x39,0x4b
+	.byte	0x94,0x4a,0x4a,0xde,	0x98,0x4c,0x4c,0xd4
+	.byte	0xb0,0x58,0x58,0xe8,	0x85,0xcf,0xcf,0x4a
+	.byte	0xbb,0xd0,0xd0,0x6b,	0xc5,0xef,0xef,0x2a
+	.byte	0x4f,0xaa,0xaa,0xe5,	0xed,0xfb,0xfb,0x16
+	.byte	0x86,0x43,0x43,0xc5,	0x9a,0x4d,0x4d,0xd7
+	.byte	0x66,0x33,0x33,0x55,	0x11,0x85,0x85,0x94
+	.byte	0x8a,0x45,0x45,0xcf,	0xe9,0xf9,0xf9,0x10
+	.byte	0x04,0x02,0x02,0x06,	0xfe,0x7f,0x7f,0x81
+	.byte	0xa0,0x50,0x50,0xf0,	0x78,0x3c,0x3c,0x44
+	.byte	0x25,0x9f,0x9f,0xba,	0x4b,0xa8,0xa8,0xe3
+	.byte	0xa2,0x51,0x51,0xf3,	0x5d,0xa3,0xa3,0xfe
+	.byte	0x80,0x40,0x40,0xc0,	0x05,0x8f,0x8f,0x8a
+	.byte	0x3f,0x92,0x92,0xad,	0x21,0x9d,0x9d,0xbc
+	.byte	0x70,0x38,0x38,0x48,	0xf1,0xf5,0xf5,0x04
+	.byte	0x63,0xbc,0xbc,0xdf,	0x77,0xb6,0xb6,0xc1
+	.byte	0xaf,0xda,0xda,0x75,	0x42,0x21,0x21,0x63
+	.byte	0x20,0x10,0x10,0x30,	0xe5,0xff,0xff,0x1a
+	.byte	0xfd,0xf3,0xf3,0x0e,	0xbf,0xd2,0xd2,0x6d
+	.byte	0x81,0xcd,0xcd,0x4c,	0x18,0x0c,0x0c,0x14
+	.byte	0x26,0x13,0x13,0x35,	0xc3,0xec,0xec,0x2f
+	.byte	0xbe,0x5f,0x5f,0xe1,	0x35,0x97,0x97,0xa2
+	.byte	0x88,0x44,0x44,0xcc,	0x2e,0x17,0x17,0x39
+	.byte	0x93,0xc4,0xc4,0x57,	0x55,0xa7,0xa7,0xf2
+	.byte	0xfc,0x7e,0x7e,0x82,	0x7a,0x3d,0x3d,0x47
+	.byte	0xc8,0x64,0x64,0xac,	0xba,0x5d,0x5d,0xe7
+	.byte	0x32,0x19,0x19,0x2b,	0xe6,0x73,0x73,0x95
+	.byte	0xc0,0x60,0x60,0xa0,	0x19,0x81,0x81,0x98
+	.byte	0x9e,0x4f,0x4f,0xd1,	0xa3,0xdc,0xdc,0x7f
+	.byte	0x44,0x22,0x22,0x66,	0x54,0x2a,0x2a,0x7e
+	.byte	0x3b,0x90,0x90,0xab,	0x0b,0x88,0x88,0x83
+	.byte	0x8c,0x46,0x46,0xca,	0xc7,0xee,0xee,0x29
+	.byte	0x6b,0xb8,0xb8,0xd3,	0x28,0x14,0x14,0x3c
+	.byte	0xa7,0xde,0xde,0x79,	0xbc,0x5e,0x5e,0xe2
+	.byte	0x16,0x0b,0x0b,0x1d,	0xad,0xdb,0xdb,0x76
+	.byte	0xdb,0xe0,0xe0,0x3b,	0x64,0x32,0x32,0x56
+	.byte	0x74,0x3a,0x3a,0x4e,	0x14,0x0a,0x0a,0x1e
+	.byte	0x92,0x49,0x49,0xdb,	0x0c,0x06,0x06,0x0a
+	.byte	0x48,0x24,0x24,0x6c,	0xb8,0x5c,0x5c,0xe4
+	.byte	0x9f,0xc2,0xc2,0x5d,	0xbd,0xd3,0xd3,0x6e
+	.byte	0x43,0xac,0xac,0xef,	0xc4,0x62,0x62,0xa6
+	.byte	0x39,0x91,0x91,0xa8,	0x31,0x95,0x95,0xa4
+	.byte	0xd3,0xe4,0xe4,0x37,	0xf2,0x79,0x79,0x8b
+	.byte	0xd5,0xe7,0xe7,0x32,	0x8b,0xc8,0xc8,0x43
+	.byte	0x6e,0x37,0x37,0x59,	0xda,0x6d,0x6d,0xb7
+	.byte	0x01,0x8d,0x8d,0x8c,	0xb1,0xd5,0xd5,0x64
+	.byte	0x9c,0x4e,0x4e,0xd2,	0x49,0xa9,0xa9,0xe0
+	.byte	0xd8,0x6c,0x6c,0xb4,	0xac,0x56,0x56,0xfa
+	.byte	0xf3,0xf4,0xf4,0x07,	0xcf,0xea,0xea,0x25
+	.byte	0xca,0x65,0x65,0xaf,	0xf4,0x7a,0x7a,0x8e
+	.byte	0x47,0xae,0xae,0xe9,	0x10,0x08,0x08,0x18
+	.byte	0x6f,0xba,0xba,0xd5,	0xf0,0x78,0x78,0x88
+	.byte	0x4a,0x25,0x25,0x6f,	0x5c,0x2e,0x2e,0x72
+	.byte	0x38,0x1c,0x1c,0x24,	0x57,0xa6,0xa6,0xf1
+	.byte	0x73,0xb4,0xb4,0xc7,	0x97,0xc6,0xc6,0x51
+	.byte	0xcb,0xe8,0xe8,0x23,	0xa1,0xdd,0xdd,0x7c
+	.byte	0xe8,0x74,0x74,0x9c,	0x3e,0x1f,0x1f,0x21
+	.byte	0x96,0x4b,0x4b,0xdd,	0x61,0xbd,0xbd,0xdc
+	.byte	0x0d,0x8b,0x8b,0x86,	0x0f,0x8a,0x8a,0x85
+	.byte	0xe0,0x70,0x70,0x90,	0x7c,0x3e,0x3e,0x42
+	.byte	0x71,0xb5,0xb5,0xc4,	0xcc,0x66,0x66,0xaa
+	.byte	0x90,0x48,0x48,0xd8,	0x06,0x03,0x03,0x05
+	.byte	0xf7,0xf6,0xf6,0x01,	0x1c,0x0e,0x0e,0x12
+	.byte	0xc2,0x61,0x61,0xa3,	0x6a,0x35,0x35,0x5f
+	.byte	0xae,0x57,0x57,0xf9,	0x69,0xb9,0xb9,0xd0
+	.byte	0x17,0x86,0x86,0x91,	0x99,0xc1,0xc1,0x58
+	.byte	0x3a,0x1d,0x1d,0x27,	0x27,0x9e,0x9e,0xb9
+	.byte	0xd9,0xe1,0xe1,0x38,	0xeb,0xf8,0xf8,0x13
+	.byte	0x2b,0x98,0x98,0xb3,	0x22,0x11,0x11,0x33
+	.byte	0xd2,0x69,0x69,0xbb,	0xa9,0xd9,0xd9,0x70
+	.byte	0x07,0x8e,0x8e,0x89,	0x33,0x94,0x94,0xa7
+	.byte	0x2d,0x9b,0x9b,0xb6,	0x3c,0x1e,0x1e,0x22
+	.byte	0x15,0x87,0x87,0x92,	0xc9,0xe9,0xe9,0x20
+	.byte	0x87,0xce,0xce,0x49,	0xaa,0x55,0x55,0xff
+	.byte	0x50,0x28,0x28,0x78,	0xa5,0xdf,0xdf,0x7a
+	.byte	0x03,0x8c,0x8c,0x8f,	0x59,0xa1,0xa1,0xf8
+	.byte	0x09,0x89,0x89,0x80,	0x1a,0x0d,0x0d,0x17
+	.byte	0x65,0xbf,0xbf,0xda,	0xd7,0xe6,0xe6,0x31
+	.byte	0x84,0x42,0x42,0xc6,	0xd0,0x68,0x68,0xb8
+	.byte	0x82,0x41,0x41,0xc3,	0x29,0x99,0x99,0xb0
+	.byte	0x5a,0x2d,0x2d,0x77,	0x1e,0x0f,0x0f,0x11
+	.byte	0x7b,0xb0,0xb0,0xcb,	0xa8,0x54,0x54,0xfc
+	.byte	0x6d,0xbb,0xbb,0xd6,	0x2c,0x16,0x16,0x3a
+AES_Te4:
+	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+rcon:
+	.byte	0x01,0x00,0x00,0x00,	0x02,0x00,0x00,0x00
+	.byte	0x04,0x00,0x00,0x00,	0x08,0x00,0x00,0x00
+	.byte	0x10,0x00,0x00,0x00,	0x20,0x00,0x00,0x00
+	.byte	0x40,0x00,0x00,0x00,	0x80,0x00,0x00,0x00
+	.byte	0x1B,0x00,0x00,0x00,	0x36,0x00,0x00,0x00
+	.align	128
+AES_Td:
+	.byte	0x51,0xf4,0xa7,0x50,	0x7e,0x41,0x65,0x53
+	.byte	0x1a,0x17,0xa4,0xc3,	0x3a,0x27,0x5e,0x96
+	.byte	0x3b,0xab,0x6b,0xcb,	0x1f,0x9d,0x45,0xf1
+	.byte	0xac,0xfa,0x58,0xab,	0x4b,0xe3,0x03,0x93
+	.byte	0x20,0x30,0xfa,0x55,	0xad,0x76,0x6d,0xf6
+	.byte	0x88,0xcc,0x76,0x91,	0xf5,0x02,0x4c,0x25
+	.byte	0x4f,0xe5,0xd7,0xfc,	0xc5,0x2a,0xcb,0xd7
+	.byte	0x26,0x35,0x44,0x80,	0xb5,0x62,0xa3,0x8f
+	.byte	0xde,0xb1,0x5a,0x49,	0x25,0xba,0x1b,0x67
+	.byte	0x45,0xea,0x0e,0x98,	0x5d,0xfe,0xc0,0xe1
+	.byte	0xc3,0x2f,0x75,0x02,	0x81,0x4c,0xf0,0x12
+	.byte	0x8d,0x46,0x97,0xa3,	0x6b,0xd3,0xf9,0xc6
+	.byte	0x03,0x8f,0x5f,0xe7,	0x15,0x92,0x9c,0x95
+	.byte	0xbf,0x6d,0x7a,0xeb,	0x95,0x52,0x59,0xda
+	.byte	0xd4,0xbe,0x83,0x2d,	0x58,0x74,0x21,0xd3
+	.byte	0x49,0xe0,0x69,0x29,	0x8e,0xc9,0xc8,0x44
+	.byte	0x75,0xc2,0x89,0x6a,	0xf4,0x8e,0x79,0x78
+	.byte	0x99,0x58,0x3e,0x6b,	0x27,0xb9,0x71,0xdd
+	.byte	0xbe,0xe1,0x4f,0xb6,	0xf0,0x88,0xad,0x17
+	.byte	0xc9,0x20,0xac,0x66,	0x7d,0xce,0x3a,0xb4
+	.byte	0x63,0xdf,0x4a,0x18,	0xe5,0x1a,0x31,0x82
+	.byte	0x97,0x51,0x33,0x60,	0x62,0x53,0x7f,0x45
+	.byte	0xb1,0x64,0x77,0xe0,	0xbb,0x6b,0xae,0x84
+	.byte	0xfe,0x81,0xa0,0x1c,	0xf9,0x08,0x2b,0x94
+	.byte	0x70,0x48,0x68,0x58,	0x8f,0x45,0xfd,0x19
+	.byte	0x94,0xde,0x6c,0x87,	0x52,0x7b,0xf8,0xb7
+	.byte	0xab,0x73,0xd3,0x23,	0x72,0x4b,0x02,0xe2
+	.byte	0xe3,0x1f,0x8f,0x57,	0x66,0x55,0xab,0x2a
+	.byte	0xb2,0xeb,0x28,0x07,	0x2f,0xb5,0xc2,0x03
+	.byte	0x86,0xc5,0x7b,0x9a,	0xd3,0x37,0x08,0xa5
+	.byte	0x30,0x28,0x87,0xf2,	0x23,0xbf,0xa5,0xb2
+	.byte	0x02,0x03,0x6a,0xba,	0xed,0x16,0x82,0x5c
+	.byte	0x8a,0xcf,0x1c,0x2b,	0xa7,0x79,0xb4,0x92
+	.byte	0xf3,0x07,0xf2,0xf0,	0x4e,0x69,0xe2,0xa1
+	.byte	0x65,0xda,0xf4,0xcd,	0x06,0x05,0xbe,0xd5
+	.byte	0xd1,0x34,0x62,0x1f,	0xc4,0xa6,0xfe,0x8a
+	.byte	0x34,0x2e,0x53,0x9d,	0xa2,0xf3,0x55,0xa0
+	.byte	0x05,0x8a,0xe1,0x32,	0xa4,0xf6,0xeb,0x75
+	.byte	0x0b,0x83,0xec,0x39,	0x40,0x60,0xef,0xaa
+	.byte	0x5e,0x71,0x9f,0x06,	0xbd,0x6e,0x10,0x51
+	.byte	0x3e,0x21,0x8a,0xf9,	0x96,0xdd,0x06,0x3d
+	.byte	0xdd,0x3e,0x05,0xae,	0x4d,0xe6,0xbd,0x46
+	.byte	0x91,0x54,0x8d,0xb5,	0x71,0xc4,0x5d,0x05
+	.byte	0x04,0x06,0xd4,0x6f,	0x60,0x50,0x15,0xff
+	.byte	0x19,0x98,0xfb,0x24,	0xd6,0xbd,0xe9,0x97
+	.byte	0x89,0x40,0x43,0xcc,	0x67,0xd9,0x9e,0x77
+	.byte	0xb0,0xe8,0x42,0xbd,	0x07,0x89,0x8b,0x88
+	.byte	0xe7,0x19,0x5b,0x38,	0x79,0xc8,0xee,0xdb
+	.byte	0xa1,0x7c,0x0a,0x47,	0x7c,0x42,0x0f,0xe9
+	.byte	0xf8,0x84,0x1e,0xc9,	0x00,0x00,0x00,0x00
+	.byte	0x09,0x80,0x86,0x83,	0x32,0x2b,0xed,0x48
+	.byte	0x1e,0x11,0x70,0xac,	0x6c,0x5a,0x72,0x4e
+	.byte	0xfd,0x0e,0xff,0xfb,	0x0f,0x85,0x38,0x56
+	.byte	0x3d,0xae,0xd5,0x1e,	0x36,0x2d,0x39,0x27
+	.byte	0x0a,0x0f,0xd9,0x64,	0x68,0x5c,0xa6,0x21
+	.byte	0x9b,0x5b,0x54,0xd1,	0x24,0x36,0x2e,0x3a
+	.byte	0x0c,0x0a,0x67,0xb1,	0x93,0x57,0xe7,0x0f
+	.byte	0xb4,0xee,0x96,0xd2,	0x1b,0x9b,0x91,0x9e
+	.byte	0x80,0xc0,0xc5,0x4f,	0x61,0xdc,0x20,0xa2
+	.byte	0x5a,0x77,0x4b,0x69,	0x1c,0x12,0x1a,0x16
+	.byte	0xe2,0x93,0xba,0x0a,	0xc0,0xa0,0x2a,0xe5
+	.byte	0x3c,0x22,0xe0,0x43,	0x12,0x1b,0x17,0x1d
+	.byte	0x0e,0x09,0x0d,0x0b,	0xf2,0x8b,0xc7,0xad
+	.byte	0x2d,0xb6,0xa8,0xb9,	0x14,0x1e,0xa9,0xc8
+	.byte	0x57,0xf1,0x19,0x85,	0xaf,0x75,0x07,0x4c
+	.byte	0xee,0x99,0xdd,0xbb,	0xa3,0x7f,0x60,0xfd
+	.byte	0xf7,0x01,0x26,0x9f,	0x5c,0x72,0xf5,0xbc
+	.byte	0x44,0x66,0x3b,0xc5,	0x5b,0xfb,0x7e,0x34
+	.byte	0x8b,0x43,0x29,0x76,	0xcb,0x23,0xc6,0xdc
+	.byte	0xb6,0xed,0xfc,0x68,	0xb8,0xe4,0xf1,0x63
+	.byte	0xd7,0x31,0xdc,0xca,	0x42,0x63,0x85,0x10
+	.byte	0x13,0x97,0x22,0x40,	0x84,0xc6,0x11,0x20
+	.byte	0x85,0x4a,0x24,0x7d,	0xd2,0xbb,0x3d,0xf8
+	.byte	0xae,0xf9,0x32,0x11,	0xc7,0x29,0xa1,0x6d
+	.byte	0x1d,0x9e,0x2f,0x4b,	0xdc,0xb2,0x30,0xf3
+	.byte	0x0d,0x86,0x52,0xec,	0x77,0xc1,0xe3,0xd0
+	.byte	0x2b,0xb3,0x16,0x6c,	0xa9,0x70,0xb9,0x99
+	.byte	0x11,0x94,0x48,0xfa,	0x47,0xe9,0x64,0x22
+	.byte	0xa8,0xfc,0x8c,0xc4,	0xa0,0xf0,0x3f,0x1a
+	.byte	0x56,0x7d,0x2c,0xd8,	0x22,0x33,0x90,0xef
+	.byte	0x87,0x49,0x4e,0xc7,	0xd9,0x38,0xd1,0xc1
+	.byte	0x8c,0xca,0xa2,0xfe,	0x98,0xd4,0x0b,0x36
+	.byte	0xa6,0xf5,0x81,0xcf,	0xa5,0x7a,0xde,0x28
+	.byte	0xda,0xb7,0x8e,0x26,	0x3f,0xad,0xbf,0xa4
+	.byte	0x2c,0x3a,0x9d,0xe4,	0x50,0x78,0x92,0x0d
+	.byte	0x6a,0x5f,0xcc,0x9b,	0x54,0x7e,0x46,0x62
+	.byte	0xf6,0x8d,0x13,0xc2,	0x90,0xd8,0xb8,0xe8
+	.byte	0x2e,0x39,0xf7,0x5e,	0x82,0xc3,0xaf,0xf5
+	.byte	0x9f,0x5d,0x80,0xbe,	0x69,0xd0,0x93,0x7c
+	.byte	0x6f,0xd5,0x2d,0xa9,	0xcf,0x25,0x12,0xb3
+	.byte	0xc8,0xac,0x99,0x3b,	0x10,0x18,0x7d,0xa7
+	.byte	0xe8,0x9c,0x63,0x6e,	0xdb,0x3b,0xbb,0x7b
+	.byte	0xcd,0x26,0x78,0x09,	0x6e,0x59,0x18,0xf4
+	.byte	0xec,0x9a,0xb7,0x01,	0x83,0x4f,0x9a,0xa8
+	.byte	0xe6,0x95,0x6e,0x65,	0xaa,0xff,0xe6,0x7e
+	.byte	0x21,0xbc,0xcf,0x08,	0xef,0x15,0xe8,0xe6
+	.byte	0xba,0xe7,0x9b,0xd9,	0x4a,0x6f,0x36,0xce
+	.byte	0xea,0x9f,0x09,0xd4,	0x29,0xb0,0x7c,0xd6
+	.byte	0x31,0xa4,0xb2,0xaf,	0x2a,0x3f,0x23,0x31
+	.byte	0xc6,0xa5,0x94,0x30,	0x35,0xa2,0x66,0xc0
+	.byte	0x74,0x4e,0xbc,0x37,	0xfc,0x82,0xca,0xa6
+	.byte	0xe0,0x90,0xd0,0xb0,	0x33,0xa7,0xd8,0x15
+	.byte	0xf1,0x04,0x98,0x4a,	0x41,0xec,0xda,0xf7
+	.byte	0x7f,0xcd,0x50,0x0e,	0x17,0x91,0xf6,0x2f
+	.byte	0x76,0x4d,0xd6,0x8d,	0x43,0xef,0xb0,0x4d
+	.byte	0xcc,0xaa,0x4d,0x54,	0xe4,0x96,0x04,0xdf
+	.byte	0x9e,0xd1,0xb5,0xe3,	0x4c,0x6a,0x88,0x1b
+	.byte	0xc1,0x2c,0x1f,0xb8,	0x46,0x65,0x51,0x7f
+	.byte	0x9d,0x5e,0xea,0x04,	0x01,0x8c,0x35,0x5d
+	.byte	0xfa,0x87,0x74,0x73,	0xfb,0x0b,0x41,0x2e
+	.byte	0xb3,0x67,0x1d,0x5a,	0x92,0xdb,0xd2,0x52
+	.byte	0xe9,0x10,0x56,0x33,	0x6d,0xd6,0x47,0x13
+	.byte	0x9a,0xd7,0x61,0x8c,	0x37,0xa1,0x0c,0x7a
+	.byte	0x59,0xf8,0x14,0x8e,	0xeb,0x13,0x3c,0x89
+	.byte	0xce,0xa9,0x27,0xee,	0xb7,0x61,0xc9,0x35
+	.byte	0xe1,0x1c,0xe5,0xed,	0x7a,0x47,0xb1,0x3c
+	.byte	0x9c,0xd2,0xdf,0x59,	0x55,0xf2,0x73,0x3f
+	.byte	0x18,0x14,0xce,0x79,	0x73,0xc7,0x37,0xbf
+	.byte	0x53,0xf7,0xcd,0xea,	0x5f,0xfd,0xaa,0x5b
+	.byte	0xdf,0x3d,0x6f,0x14,	0x78,0x44,0xdb,0x86
+	.byte	0xca,0xaf,0xf3,0x81,	0xb9,0x68,0xc4,0x3e
+	.byte	0x38,0x24,0x34,0x2c,	0xc2,0xa3,0x40,0x5f
+	.byte	0x16,0x1d,0xc3,0x72,	0xbc,0xe2,0x25,0x0c
+	.byte	0x28,0x3c,0x49,0x8b,	0xff,0x0d,0x95,0x41
+	.byte	0x39,0xa8,0x01,0x71,	0x08,0x0c,0xb3,0xde
+	.byte	0xd8,0xb4,0xe4,0x9c,	0x64,0x56,0xc1,0x90
+	.byte	0x7b,0xcb,0x84,0x61,	0xd5,0x32,0xb6,0x70
+	.byte	0x48,0x6c,0x5c,0x74,	0xd0,0xb8,0x57,0x42
+AES_Td4:
+	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+	.cstring "AES for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64xplus.pl
new file mode 100644
index 0000000000..206d7dce88
--- /dev/null
+++ b/crypto/aes/asm/aes-c64xplus.pl
@@ -0,0 +1,1329 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x+.
+#
+# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*)	This means that there might be subtle correlation between data
+#	and timing and one can wonder if it can be ... attacked:-(
+#	On the other hand this also means that *if* one chooses to
+#	implement *4* T-tables variant [instead of 1 T-table as in
+#	this implementation, or in addition to], then one ought to
+#	*interleave* them. Even though it complicates addressing,
+#	references to interleaved tables would be guaranteed not to
+#	clash. I reckon that it should be possible to break 8 cycles
+#	per byte "barrier," i.e. improve by ~20%, naturally at the
+#	cost of 8x increased pressure on L1D. 8x because you'd have
+#	to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A4,INP
+	.asg	B4,OUT
+	.asg	A6,KEY
+	.asg	A4,RET
+	.asg	B15,SP
+
+	.eval	24,EXT0
+	.eval	16,EXT1
+	.eval	8,EXT2
+	.eval	0,EXT3
+	.eval	8,TBL1
+	.eval	16,TBL2
+	.eval	24,TBL3
+
+	.if	.BIG_ENDIAN
+	.eval	24-EXT0,EXT0
+	.eval	24-EXT1,EXT1
+	.eval	24-EXT2,EXT2
+	.eval	24-EXT3,EXT3
+	.eval	32-TBL1,TBL1
+	.eval	32-TBL2,TBL2
+	.eval	32-TBL3,TBL3
+	.endif
+
+	.global	_AES_encrypt
+_AES_encrypt:
+	.asmfunc
+	MVK	1,B2
+__encrypt:
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Te-_AES_encrypt),$TEA
+||	ADDKPC	_AES_encrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Te-_AES_encrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	LDW	*$KPA++[2],$Te0[0]		; zero round key
+||	LDW	*$KPB++[2],$Te0[1]
+||	MVK	60,A0
+||	ADD	B0,$TEA,$TEA			; AES_Te
+	LDW	*KEY[A0],B0			; rounds
+||	MVK	1024,A0				; sizeof(AES_Te)
+	LDW	*$KPA++[2],$Te0[2]
+||	LDW	*$KPB++[2],$Te0[3]
+||	MV	$TEA,$TEB
+	NOP
+	.if	.BIG_ENDIAN
+	MV	A9,$s[0]
+||	MV	A8,$s[1]
+||	MV	B9,$s[2]
+||	MV	B8,$s[3]
+	.else
+	MV	A8,$s[0]
+||	MV	A9,$s[1]
+||	MV	B8,$s[2]
+||	MV	B9,$s[3]
+	.endif
+	XOR	$Te0[0],$s[0],$s[0]
+||	XOR	$Te0[1],$s[1],$s[1]
+||	LDW	*$KPA++[2],$K[0]		; 1st round key
+||	LDW	*$KPB++[2],$K[1]
+	SUB	B0,2,B0
+
+	SPLOOPD	13
+||	MVC	B0,ILC
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+;;====================================================================
+	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+	LDW	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
+||	LDW	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDW	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],	t2
+||	LDW	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],	t3
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDW	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],	t0
+||	LDW	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],	t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDW	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],	t0
+||	LDW	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],	t1
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDW	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],	t2
+||	LDW	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],	t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDW	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],	t2
+||	LDW	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],	t3
+||	ROTL	$Te1[1],TBL1,$Te3[0]		; t0
+||	ROTL	$Te3[0],TBL3,$Te1[1]		; t1
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDW	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],	t0
+||	LDW	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],	t1
+||	ROTL	$Te3[1],TBL3,$Te1[0]		; t2
+||	ROTL	$Te1[0],TBL1,$Te3[1]		; t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDW	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
+||	LDW	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
+||	ROTL	$Te2[2],TBL2,$Te2[2]		; t0
+||	ROTL	$Te2[3],TBL2,$Te2[3]		; t1
+||	XOR	$K[0],$Te3[0],$s[0]
+||	XOR	$K[1],$Te1[1],$s[1]
+	ROTL	$Te3[3],TBL3,$Te1[2]		; t0
+||	ROTL	$Te1[2],TBL1,$Te3[3]		; t1
+||	XOR	$K[2],$Te1[0],$s[2]
+||	XOR	$K[3],$Te3[1],$s[3]
+||	LDW	*$KPA++[2],$K[0]		; next round key
+||	LDW	*$KPB++[2],$K[1]
+	ROTL	$Te2[0],TBL2,$Te2[0]		; t2
+||	ROTL	$Te2[1],TBL2,$Te2[1]		; t3
+||	XOR	$s[0],$Te2[2],$s[0]
+||	XOR	$s[1],$Te2[3],$s[1]
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+	ROTL	$Te1[3],TBL1,$Te3[2]		; t2
+||	ROTL	$Te3[2],TBL3,$Te1[3]		; t3
+||	XOR	$s[0],$Te1[2],$s[0]
+||	XOR	$s[1],$Te3[3],$s[1]
+	XOR	$s[2],$Te2[0],$s[2]
+||	XOR	$s[3],$Te2[1],$s[3]
+||	XOR	$s[0],$Te0[0],$s[0]
+||	XOR	$s[1],$Te0[1],$s[1]
+	SPKERNEL
+||	XOR.L	$s[2],$Te3[2],$s[2]
+||	XOR.L	$s[3],$Te1[3],$s[3]
+;;====================================================================
+	ADD.D	${TEA},A0,${TEA}		; point to Te4
+||	ADD.D	${TEB},A0,${TEB}
+||	EXTU	$s[1],EXT1,24,$Te1[1]
+||	EXTU	$s[0],EXT3,24,$Te3[0]
+	LDBU	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
+||	LDBU	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
+||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[0],EXT0,24,$Te0[0]
+||	EXTU	$s[1],EXT0,24,$Te0[1]
+	LDBU	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],	t0
+||	LDBU	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],	t1
+||	EXTU	$s[3],EXT3,24,$Te3[3]
+||	EXTU	$s[2],EXT1,24,$Te1[2]
+	LDBU	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],	t0
+||	LDBU	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],	t1
+||	EXTU	$s[2],EXT2,24,$Te2[2]
+||	EXTU	$s[3],EXT2,24,$Te2[3]
+	LDBU	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],	t0
+||	LDBU	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],	t1
+||	EXTU	$s[1],EXT3,24,$Te3[1]
+||	EXTU	$s[0],EXT1,24,$Te1[0]
+	LDBU	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],	t2
+||	LDBU	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],	t3
+||	EXTU	$s[3],EXT1,24,$Te1[3]
+||	EXTU	$s[2],EXT3,24,$Te3[2]
+	LDBU	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],	t2
+||	LDBU	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],	t3
+||	EXTU	$s[2],EXT0,24,$Te0[2]
+||	EXTU	$s[3],EXT0,24,$Te0[3]
+	LDBU	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
+||	LDBU	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
+||	EXTU	$s[0],EXT2,24,$Te2[0]
+||	EXTU	$s[1],EXT2,24,$Te2[1]
+	LDBU	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],	t2
+||	LDBU	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],	t3
+
+	.if	.BIG_ENDIAN
+	PACK2	$Te0[0],$Te1[1],$Te0[0]
+||	PACK2	$Te0[1],$Te1[2],$Te0[1]
+	PACK2	$Te2[2],$Te3[3],$Te2[2]
+||	PACK2	$Te2[3],$Te3[0],$Te2[3]
+	PACKL4	$Te0[0],$Te2[2],$Te0[0]
+||	PACKL4	$Te0[1],$Te2[3],$Te0[1]
+	XOR	$K[0],$Te0[0],$Te0[0]		; s[0]
+||	XOR	$K[1],$Te0[1],$Te0[1]		; s[1]
+
+	PACK2	$Te0[2],$Te1[3],$Te0[2]
+||	PACK2	$Te0[3],$Te1[0],$Te0[3]
+	PACK2	$Te2[0],$Te3[1],$Te2[0]
+||	PACK2	$Te2[1],$Te3[2],$Te2[1]
+||	BNOP	RA
+	PACKL4	$Te0[2],$Te2[0],$Te0[2]
+||	PACKL4	$Te0[3],$Te2[1],$Te0[3]
+	XOR	$K[2],$Te0[2],$Te0[2]		; s[2]
+||	XOR	$K[3],$Te0[3],$Te0[3]		; s[3]
+
+	MV	$Te0[0],A9
+||	MV	$Te0[1],A8
+	MV	$Te0[2],B9
+||	MV	$Te0[3],B8
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.else
+	PACK2	$Te1[1],$Te0[0],$Te1[1]
+||	PACK2	$Te1[2],$Te0[1],$Te1[2]
+	PACK2	$Te3[3],$Te2[2],$Te3[3]
+||	PACK2	$Te3[0],$Te2[3],$Te3[0]
+	PACKL4	$Te3[3],$Te1[1],$Te1[1]
+||	PACKL4	$Te3[0],$Te1[2],$Te1[2]
+	XOR	$K[0],$Te1[1],$Te1[1]		; s[0]
+||	XOR	$K[1],$Te1[2],$Te1[2]		; s[1]
+
+	PACK2	$Te1[3],$Te0[2],$Te1[3]
+||	PACK2	$Te1[0],$Te0[3],$Te1[0]
+	PACK2	$Te3[1],$Te2[0],$Te3[1]
+||	PACK2	$Te3[2],$Te2[1],$Te3[2]
+||	BNOP	RA
+	PACKL4	$Te3[1],$Te1[3],$Te1[3]
+||	PACKL4	$Te3[2],$Te1[0],$Te1[0]
+	XOR	$K[2],$Te1[3],$Te1[3]		; s[2]
+||	XOR	$K[3],$Te1[0],$Te1[0]		; s[3]
+
+	MV	$Te1[1],A8
+||	MV	$Te1[2],A9
+	MV	$Te1[3],B8
+||	MV	$Te1[0],B9
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.endif
+	.endasmfunc
+
+	.global	_AES_decrypt
+_AES_decrypt:
+	.asmfunc
+	MVK	1,B2
+__decrypt:
+   [B2]	LDNDW	*INP++,A9:A8			; load input
+||	MVKL	(AES_Td-_AES_decrypt),$TEA
+||	ADDKPC	_AES_decrypt,B0
+   [B2]	LDNDW	*INP++,B9:B8
+||	MVKH	(AES_Td-_AES_decrypt),$TEA
+||	ADD	0,KEY,$KPA
+||	ADD	4,KEY,$KPB
+	LDW	*$KPA++[2],$Td0[0]		; zero round key
+||	LDW	*$KPB++[2],$Td0[1]
+||	MVK	60,A0
+||	ADD	B0,$TEA,$TEA			; AES_Td
+	LDW	*KEY[A0],B0			; rounds
+||	MVK	1024,A0				; sizeof(AES_Td)
+	LDW	*$KPA++[2],$Td0[2]
+||	LDW	*$KPB++[2],$Td0[3]
+||	MV	$TEA,$TEB
+	NOP
+	.if	.BIG_ENDIAN
+	MV	A9,$s[0]
+||	MV	A8,$s[1]
+||	MV	B9,$s[2]
+||	MV	B8,$s[3]
+	.else
+	MV	A8,$s[0]
+||	MV	A9,$s[1]
+||	MV	B8,$s[2]
+||	MV	B9,$s[3]
+	.endif
+	XOR	$Td0[0],$s[0],$s[0]
+||	XOR	$Td0[1],$s[1],$s[1]
+||	LDW	*$KPA++[2],$K[0]		; 1st round key
+||	LDW	*$KPB++[2],$K[1]
+	SUB	B0,2,B0
+
+	SPLOOPD	13
+||	MVC	B0,ILC
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+;;====================================================================
+	EXTU	$s[1],EXT3,24,$Td3[1]
+||	EXTU	$s[0],EXT1,24,$Td1[0]
+	LDW	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
+||	LDW	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
+||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Td0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[1],EXT1,24,$Td1[1]
+||	EXTU	$s[0],EXT3,24,$Td3[0]
+	LDW	*${TEB}[$Td1[1]],$Td1[1]	; Td1[s1>>8],	t2
+||	LDW	*${TEA}[$Td3[0]],$Td3[0]	; Td3[s0>>24],	t3
+||	EXTU	$s[2],EXT2,24,$Td2[2]
+||	EXTU	$s[3],EXT2,24,$Td2[3]
+	LDW	*${TEA}[$Td2[2]],$Td2[2]	; Td2[s2>>16],	t0
+||	LDW	*${TEB}[$Td2[3]],$Td2[3]	; Td2[s3>>16],	t1
+||	EXTU	$s[3],EXT1,24,$Td1[3]
+||	EXTU	$s[2],EXT3,24,$Td3[2]
+	LDW	*${TEB}[$Td1[3]],$Td1[3]	; Td1[s3>>8],	t0
+||	LDW	*${TEA}[$Td3[2]],$Td3[2]	; Td3[s2>>24],	t1
+||	EXTU	$s[0],EXT2,24,$Td2[0]
+||	EXTU	$s[1],EXT2,24,$Td2[1]
+	LDW	*${TEA}[$Td2[0]],$Td2[0]	; Td2[s0>>16],	t2
+||	LDW	*${TEB}[$Td2[1]],$Td2[1]	; Td2[s1>>16],	t3
+||	EXTU	$s[3],EXT3,24,$Td3[3]
+||	EXTU	$s[2],EXT1,24,$Td1[2]
+	LDW	*${TEB}[$Td3[3]],$Td3[3]	; Td3[s3>>24],	t2
+||	LDW	*${TEA}[$Td1[2]],$Td1[2]	; Td1[s2>>8],	t3
+||	ROTL	$Td3[1],TBL3,$Td1[0]		; t0
+||	ROTL	$Td1[0],TBL1,$Td3[1]		; t1
+||	EXTU	$s[0],EXT0,24,$Td0[0]
+||	EXTU	$s[1],EXT0,24,$Td0[1]
+	LDW	*${TEA}[$Td0[0]],$Td0[0]	; Td0[s0],	t0
+||	LDW	*${TEB}[$Td0[1]],$Td0[1]	; Td0[s1],	t1
+||	ROTL	$Td1[1],TBL1,$Td3[0]		; t2
+||	ROTL	$Td3[0],TBL3,$Td1[1]		; t3
+||	EXTU	$s[2],EXT0,24,$Td0[2]
+||	EXTU	$s[3],EXT0,24,$Td0[3]
+	LDW	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
+||	LDW	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3
+||	ROTL	$Td2[2],TBL2,$Td2[2]		; t0
+||	ROTL	$Td2[3],TBL2,$Td2[3]		; t1
+||	XOR	$K[0],$Td1[0],$s[0]
+||	XOR	$K[1],$Td3[1],$s[1]
+	ROTL	$Td1[3],TBL1,$Td3[2]		; t0
+||	ROTL	$Td3[2],TBL3,$Td1[3]		; t1
+||	XOR	$K[2],$Td3[0],$s[2]
+||	XOR	$K[3],$Td1[1],$s[3]
+||	LDW	*$KPA++[2],$K[0]		; next round key
+||	LDW	*$KPB++[2],$K[1]
+	ROTL	$Td2[0],TBL2,$Td2[0]		; t2
+||	ROTL	$Td2[1],TBL2,$Td2[1]		; t3
+||	XOR	$s[0],$Td2[2],$s[0]
+||	XOR	$s[1],$Td2[3],$s[1]
+||	LDW	*$KPA++[2],$K[2]
+||	LDW	*$KPB++[2],$K[3]
+	ROTL	$Td3[3],TBL3,$Td1[2]		; t2
+||	ROTL	$Td1[2],TBL1,$Td3[3]		; t3
+||	XOR	$s[0],$Td3[2],$s[0]
+||	XOR	$s[1],$Td1[3],$s[1]
+	XOR	$s[2],$Td2[0],$s[2]
+||	XOR	$s[3],$Td2[1],$s[3]
+||	XOR	$s[0],$Td0[0],$s[0]
+||	XOR	$s[1],$Td0[1],$s[1]
+	SPKERNEL
+||	XOR.L	$s[2],$Td1[2],$s[2]
+||	XOR.L	$s[3],$Td3[3],$s[3]
+;;====================================================================
+	ADD.D	${TEA},A0,${TEA}		; point to Td4
+||	ADD.D	${TEB},A0,${TEB}
+||	EXTU	$s[1],EXT3,24,$Td3[1]
+||	EXTU	$s[0],EXT1,24,$Td1[0]
+	LDBU	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
+||	LDBU	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
+||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
+||	XOR	$s[3],$Td0[3],$s[3]		; modulo-scheduled
+||	EXTU	$s[0],EXT0,24,$Td0[0]
+||	EXTU	$s[1],EXT0,24,$Td0[1]
+	LDBU	*${TEA}[$Td0[0]],$Td0[0]	; Td0[s0],	t0
+||	LDBU	*${TEB}[$Td0[1]],$Td0[1]	; Td0[s1],	t1
+||	EXTU	$s[2],EXT2,24,$Td2[2]
+||	EXTU	$s[3],EXT2,24,$Td2[3]
+	LDBU	*${TEA}[$Td2[2]],$Td2[2]	; Td2[s2>>16],	t0
+||	LDBU	*${TEB}[$Td2[3]],$Td2[3]	; Td2[s3>>16],	t1
+||	EXTU	$s[3],EXT1,24,$Td1[3]
+||	EXTU	$s[2],EXT3,24,$Td3[2]
+	LDBU	*${TEB}[$Td1[3]],$Td1[3]	; Td1[s3>>8],	t0
+||	LDBU	*${TEA}[$Td3[2]],$Td3[2]	; Td3[s2>>24],	t1
+||	EXTU	$s[1],EXT1,24,$Td1[1]
+||	EXTU	$s[0],EXT3,24,$Td3[0]
+	LDBU	*${TEB}[$Td1[1]],$Td1[1]	; Td1[s1>>8],	t2
+||	LDBU	*${TEA}[$Td3[0]],$Td3[0]	; Td3[s0>>24],	t3
+||	EXTU	$s[0],EXT2,24,$Td2[0]
+||	EXTU	$s[1],EXT2,24,$Td2[1]
+	LDBU	*${TEA}[$Td2[0]],$Td2[0]	; Td2[s0>>16],	t2
+||	LDBU	*${TEB}[$Td2[1]],$Td2[1]	; Td2[s1>>16],	t3
+||	EXTU	$s[3],EXT3,24,$Td3[3]
+||	EXTU	$s[2],EXT1,24,$Td1[2]
+	LDBU	*${TEB}[$Td3[3]],$Td3[3]	; Td3[s3>>24],	t2
+||	LDBU	*${TEA}[$Td1[2]],$Td1[2]	; Td1[s2>>8],	t3
+||	EXTU	$s[2],EXT0,24,$Td0[2]
+||	EXTU	$s[3],EXT0,24,$Td0[3]
+	LDBU	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
+||	LDBU	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3
+
+	.if	.BIG_ENDIAN
+	PACK2	$Td0[0],$Td1[3],$Td0[0]
+||	PACK2	$Td0[1],$Td1[0],$Td0[1]
+	PACK2	$Td2[2],$Td3[1],$Td2[2]
+||	PACK2	$Td2[3],$Td3[2],$Td2[3]
+	PACKL4	$Td0[0],$Td2[2],$Td0[0]
+||	PACKL4	$Td0[1],$Td2[3],$Td0[1]
+	XOR	$K[0],$Td0[0],$Td0[0]		; s[0]
+||	XOR	$K[1],$Td0[1],$Td0[1]		; s[1]
+
+	PACK2	$Td0[2],$Td1[1],$Td0[2]
+||	PACK2	$Td0[3],$Td1[2],$Td0[3]
+	PACK2	$Td2[0],$Td3[3],$Td2[0]
+||	PACK2	$Td2[1],$Td3[0],$Td2[1]
+||	BNOP	RA
+	PACKL4	$Td0[2],$Td2[0],$Td0[2]
+||	PACKL4	$Td0[3],$Td2[1],$Td0[3]
+	XOR	$K[2],$Td0[2],$Td0[2]		; s[2]
+||	XOR	$K[3],$Td0[3],$Td0[3]		; s[3]
+
+	MV	$Td0[0],A9
+||	MV	$Td0[1],A8
+	MV	$Td0[2],B9
+||	MV	$Td0[3],B8
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.else
+	PACK2	$Td1[3],$Td0[0],$Td1[3]
+||	PACK2	$Td1[0],$Td0[1],$Td1[0]
+	PACK2	$Td3[1],$Td2[2],$Td3[1]
+||	PACK2	$Td3[2],$Td2[3],$Td3[2]
+	PACKL4	$Td3[1],$Td1[3],$Td1[3]
+||	PACKL4	$Td3[2],$Td1[0],$Td1[0]
+	XOR	$K[0],$Td1[3],$Td1[3]		; s[0]
+||	XOR	$K[1],$Td1[0],$Td1[0]		; s[1]
+
+	PACK2	$Td1[1],$Td0[2],$Td1[1]
+||	PACK2	$Td1[2],$Td0[3],$Td1[2]
+	PACK2	$Td3[3],$Td2[0],$Td3[3]
+||	PACK2	$Td3[0],$Td2[1],$Td3[0]
+||	BNOP	RA
+	PACKL4	$Td3[3],$Td1[1],$Td1[1]
+||	PACKL4	$Td3[0],$Td1[2],$Td1[2]
+	XOR	$K[2],$Td1[1],$Td1[1]		; s[2]
+||	XOR	$K[3],$Td1[2],$Td1[2]		; s[3]
+
+	MV	$Td1[3],A8
+||	MV	$Td1[0],A9
+	MV	$Td1[1],B8
+||	MV	$Td1[2],B9
+|| [B2]	STNDW	A9:A8,*OUT++
+   [B2]	STNDW	B9:B8,*OUT++
+	.endif
+	.endasmfunc
+___
+{
+my @K=(@K,@s);			# extended key
+my @Te4=map("B$_",(16..19));
+
+my @Kx9=@Te0;			# used in AES_set_decrypt_key
+my @KxB=@Te1;
+my @KxD=@Te2;
+my @KxE=@Te3;
+
+$code.=<<___;
+	.asg	OUT,BITS
+
+	.global	_AES_set_encrypt_key
+_AES_set_encrypt_key:
+__set_encrypt_key:
+	.asmfunc
+	MV	INP,A0
+||	SHRU	BITS,5,BITS			; 128-192-256 -> 4-6-8
+||	MV	KEY,A1
+  [!A0]	B	RA
+||[!A0]	MVK	-1,RET
+||[!A0]	MVK	1,A1				; only one B RA
+  [!A1]	B	RA
+||[!A1]	MVK	-1,RET
+||[!A1]	MVK	0,A0
+||	MVK	0,B0
+||	MVK	0,A1
+   [A0]	LDNDW	*INP++,A9:A8
+|| [A0]	CMPEQ	4,BITS,B0
+|| [A0]	CMPLT	3,BITS,A1
+   [B0]	B	key128?
+|| [A1]	LDNDW	*INP++,B9:B8
+|| [A0]	CMPEQ	6,BITS,B0
+|| [A0]	CMPLT	5,BITS,A1
+   [B0]	B	key192?
+|| [A1]	LDNDW	*INP++,B17:B16
+|| [A0]	CMPEQ	8,BITS,B0
+|| [A0]	CMPLT	7,BITS,A1
+   [B0]	B	key256?
+|| [A1]	LDNDW	*INP++,B19:B18
+
+   [A0]	ADD	0,KEY,$KPA
+|| [A0]	ADD	4,KEY,$KPB
+|| [A0]	MVKL	(AES_Te4-_AES_set_encrypt_key),$TEA
+|| [A0]	ADDKPC	_AES_set_encrypt_key,B6
+   [A0]	MVKH	(AES_Te4-_AES_set_encrypt_key),$TEA
+   [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
+	NOP
+	NOP
+
+	BNOP	RA,5
+||	MVK	-2,RET				; unknown bit lenght
+||	MVK	0,B0				; redundant
+;;====================================================================
+;;====================================================================
+key128?:
+	.if	.BIG_ENDIAN
+	MV	A9,$K[0]
+||	MV	A8,$K[1]
+||	MV	B9,$Te4[2]
+||	MV	B8,$K[3]
+	.else
+	MV	A8,$K[0]
+||	MV	A9,$K[1]
+||	MV	B8,$Te4[2]
+||	MV	B9,$K[3]
+	.endif
+
+	MVK	256,A0
+||	MVK	9,B0
+
+	SPLOOPD	14
+||	MVC	B0,ILC
+||	MV	$TEA,$TEB
+||	ADD	$TEA,A0,A30			; rcon
+;;====================================================================
+	LDW	*A30++[1],A31			; rcon[i]
+||	MV	$Te4[2],$K[2]
+||	EXTU	$K[3],EXT1,24,$Te4[0]
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[3],A0
+||	EXTU	$K[3],EXT2,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT3,24,A0
+||	EXTU	$K[3],EXT0,24,$Te4[3]
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	.endif
+
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+
+	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
+	.if	.BIG_ENDIAN
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+	.else
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+	.endif
+	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
+	XOR	$Te4[0],$K[1],$K[1]		; K[1]
+	MV	$Te4[0],$K[0]
+||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
+	XOR	$Te4[2],$K[3],$K[3]		; K[3]
+	SPKERNEL
+;;====================================================================
+	BNOP	RA
+	MV	$Te4[2],$K[2]
+||	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	MVK	10,B0				; rounds
+	STW	B0,*++${KPB}[15]
+	MVK	0,RET
+;;====================================================================
+;;====================================================================
+key192?:
+	.if	.BIG_ENDIAN
+	MV	A9,$K[0]
+||	MV	A8,$K[1]
+||	MV	B9,$K[2]
+||	MV	B8,$K[3]
+	MV	B17,$Te4[2]
+||	MV	B16,$K[5]
+	.else
+	MV	A8,$K[0]
+||	MV	A9,$K[1]
+||	MV	B8,$K[2]
+||	MV	B9,$K[3]
+	MV	B16,$Te4[2]
+||	MV	B17,$K[5]
+	.endif
+
+	MVK	256,A0
+||	MVK	6,B0
+	MV	$TEA,$TEB
+||	ADD	$TEA,A0,A30			; rcon
+;;====================================================================
+loop192?:
+	LDW	*A30++[1],A31			; rcon[i]
+||	MV	$Te4[2],$K[4]
+||	EXTU	$K[5],EXT1,24,$Te4[0]
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[5],A0
+||	EXTU	$K[5],EXT2,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT3,24,A0
+||	EXTU	$K[5],EXT0,24,$Te4[3]
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	.endif
+
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	STW	$K[4],*$KPA++[2]
+||	STW	$K[5],*$KPB++[2]
+
+	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
+	.if	.BIG_ENDIAN
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+	.else
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+	.endif
+	BDEC	loop192?,B0
+||	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
+	XOR	$Te4[0],$K[1],$K[1]		; K[1]
+	MV	$Te4[0],$K[0]
+||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
+	XOR	$Te4[2],$K[3],$K[3]		; K[3]
+	MV	$Te4[2],$K[2]
+||	XOR	$K[3],$K[4],$Te4[2]		; K[4]
+	XOR	$Te4[2],$K[5],$K[5]		; K[5]
+;;====================================================================
+	BNOP	RA
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	MVK	12,B0				; rounds
+	STW	B0,*++${KPB}[7]
+	MVK	0,RET
+;;====================================================================
+;;====================================================================
+key256?:
+	.if	.BIG_ENDIAN
+	MV	A9,$K[0]
+||	MV	A8,$K[1]
+||	MV	B9,$K[2]
+||	MV	B8,$K[3]
+	MV	B17,$K[4]
+||	MV	B16,$K[5]
+||	MV	B19,$Te4[2]
+||	MV	B18,$K[7]
+	.else
+	MV	A8,$K[0]
+||	MV	A9,$K[1]
+||	MV	B8,$K[2]
+||	MV	B9,$K[3]
+	MV	B16,$K[4]
+||	MV	B17,$K[5]
+||	MV	B18,$Te4[2]
+||	MV	B19,$K[7]
+	.endif
+
+	MVK	256,A0
+||	MVK	6,B0
+	MV	$TEA,$TEB
+||	ADD	$TEA,A0,A30			; rcon
+;;====================================================================
+loop256?:
+	LDW	*A30++[1],A31			; rcon[i]
+||	MV	$Te4[2],$K[6]
+||	EXTU	$K[7],EXT1,24,$Te4[0]
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[7],A0
+||	EXTU	$K[7],EXT2,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT3,24,A0
+||	EXTU	$K[7],EXT0,24,$Te4[3]
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	.endif
+
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	STW	$K[4],*$KPA++[2]
+||	STW	$K[5],*$KPB++[2]
+	STW	$K[6],*$KPA++[2]
+||	STW	$K[7],*$KPB++[2]
+||	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
+	.if	.BIG_ENDIAN
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+||[!B0]	B	done256?
+	.else
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+||	PACK2	$Te4[3],A0,$Te4[3]
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+||[!B0]	B	done256?
+	.endif
+	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
+	XOR	$Te4[0],$K[1],$K[1]		; K[1]
+	MV	$Te4[0],$K[0]
+||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
+	XOR	$Te4[2],$K[3],$K[3]		; K[3]
+
+	MV	$Te4[2],$K[2]
+|| [B0]	EXTU	$K[3],EXT0,24,$Te4[0]
+|| [B0]	SUB	B0,1,B0
+	LDBU	*${TEB}[$Te4[0]],$Te4[0]
+||	MV	$K[3],A0
+||	EXTU	$K[3],EXT1,24,$Te4[1]
+	LDBU	*${TEB}[$Te4[1]],$Te4[1]
+||	EXTU	A0,EXT2,24,A0
+||	EXTU	$K[3],EXT3,24,$Te4[3]
+
+	.if	.BIG_ENDIAN
+	LDBU	*${TEA}[A0],$Te4[3]
+||	LDBU	*${TEB}[$Te4[3]],A0
+	NOP	3
+	PACK2	$Te4[0],$Te4[1],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+||	B	loop256?
+	PACKL4	$Te4[1],$Te4[3],$Te4[3]
+	.else
+	LDBU	*${TEA}[A0],A0
+||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
+	NOP	3
+	PACK2	$Te4[1],$Te4[0],$Te4[1]
+	PACK2	$Te4[3],A0,$Te4[3]
+||	B	loop256?
+	PACKL4	$Te4[3],$Te4[1],$Te4[3]
+	.endif
+
+	XOR	$Te4[3],$K[4],$Te4[0]		; K[4]
+	XOR	$Te4[0],$K[5],$K[5]		; K[5]
+	MV	$Te4[0],$K[4]
+||	XOR	$K[5],$K[6],$Te4[2]		; K[6]
+	XOR	$Te4[2],$K[7],$K[7]		; K[7]
+;;====================================================================
+done256?:
+	BNOP	RA
+	STW	$K[0],*$KPA++[2]
+||	STW	$K[1],*$KPB++[2]
+	STW	$K[2],*$KPA++[2]
+||	STW	$K[3],*$KPB++[2]
+	MVK	14,B0				; rounds
+	STW	B0,*--${KPB}[1]
+	MVK	0,RET
+	.endasmfunc
+
+	.global	_AES_set_decrypt_key
+_AES_set_decrypt_key:
+	.asmfunc
+	B	__set_encrypt_key		; guarantee local call
+	MV	KEY,B30				; B30 is not modified
+	MV	RA, B31				; B31 is not modified
+	ADDKPC	ret?,RA,2
+ret?:						; B0 holds rounds or zero
+  [!B0]	BNOP	B31				; return if zero
+   [B0]	SHL	B0,4,A0				; offset to last round key
+   [B0]	SHRU	B0,1,B1
+   [B0]	SUB	B1,1,B1
+   [B0]	MVK	0x0000001B,B3			; AES polynomial
+   [B0]	MVKH	0x07000000,B3
+
+	SPLOOPD	9				; flip round keys
+||	MVC	B1,ILC
+||	MV	B30,$KPA
+||	ADD	B30,A0,$KPB
+||	MVK	16,A0				; sizeof(round key)
+;;====================================================================
+	LDW	*${KPA}[0],A16
+||	LDW	*${KPB}[0],B16
+	LDW	*${KPA}[1],A17
+||	LDW	*${KPB}[1],B17
+	LDW	*${KPA}[2],A18
+||	LDW	*${KPB}[2],B18
+	LDW	*${KPA}[3],A19
+||	ADD	$KPA,A0,$KPA
+||	LDW	*${KPB}[3],B19
+||	SUB	$KPB,A0,$KPB
+	NOP
+	STW	B16,*${KPA}[-4]
+||	STW	A16,*${KPB}[4]
+	STW	B17,*${KPA}[-3]
+||	STW	A17,*${KPB}[5]
+	STW	B18,*${KPA}[-2]
+||	STW	A18,*${KPB}[6]
+	STW	B19,*${KPA}[-1]
+||	STW	A19,*${KPB}[7]
+	SPKERNEL
+;;====================================================================
+	SUB	B0,1,B0				; skip last round
+||	ADD	B30,A0,$KPA			; skip first round
+||	ADD	B30,A0,$KPB
+||	MVC	GFPGFR,B30			; save GFPGFR
+	LDW	*${KPA}[0],$K[0]
+||	LDW	*${KPB}[1],$K[1]
+||	MVC	B3,GFPGFR
+	LDW	*${KPA}[2],$K[2]
+||	LDW	*${KPB}[3],$K[3]
+	MVK	0x00000909,A24
+||	MVK	0x00000B0B,B24
+	MVKH	0x09090000,A24
+||	MVKH	0x0B0B0000,B24
+	MVC	B0,ILC
+||	SUB	B0,1,B0
+
+	GMPY4	$K[0],A24,$Kx9[0]		; �0x09
+||	GMPY4	$K[1],A24,$Kx9[1]
+||	MVK	0x00000D0D,A25
+||	MVK	0x00000E0E,B25
+	GMPY4	$K[2],A24,$Kx9[2]
+||	GMPY4	$K[3],A24,$Kx9[3]
+||	MVKH	0x0D0D0000,A25
+||	MVKH	0x0E0E0000,B25
+
+	GMPY4	$K[0],B24,$KxB[0]		; �0x0B
+||	GMPY4	$K[1],B24,$KxB[1]
+	GMPY4	$K[2],B24,$KxB[2]
+||	GMPY4	$K[3],B24,$KxB[3]
+
+	SPLOOP	11				; InvMixColumns
+;;====================================================================
+	GMPY4	$K[0],A25,$KxD[0]		; �0x0D
+||	GMPY4	$K[1],A25,$KxD[1]
+||	SWAP2	$Kx9[0],$Kx9[0]			; rotate by 16
+||	SWAP2	$Kx9[1],$Kx9[1]
+||	MV	$K[0],$s[0]			; this or DINT
+||	MV	$K[1],$s[1]
+|| [B0]	LDW	*${KPA}[4],$K[0]
+|| [B0]	LDW	*${KPB}[5],$K[1]
+	GMPY4	$K[2],A25,$KxD[2]
+||	GMPY4	$K[3],A25,$KxD[3]
+||	SWAP2	$Kx9[2],$Kx9[2]
+||	SWAP2	$Kx9[3],$Kx9[3]
+||	MV	$K[2],$s[2]
+||	MV	$K[3],$s[3]
+|| [B0]	LDW	*${KPA}[6],$K[2]
+|| [B0]	LDW	*${KPB}[7],$K[3]
+
+	GMPY4	$s[0],B25,$KxE[0]		; �0x0E
+||	GMPY4	$s[1],B25,$KxE[1]
+||	XOR	$Kx9[0],$KxB[0],$KxB[0]
+||	XOR	$Kx9[1],$KxB[1],$KxB[1]
+	GMPY4	$s[2],B25,$KxE[2]
+||	GMPY4	$s[3],B25,$KxE[3]
+||	XOR	$Kx9[2],$KxB[2],$KxB[2]
+||	XOR	$Kx9[3],$KxB[3],$KxB[3]
+
+	ROTL	$KxB[0],TBL3,$KxB[0]
+||	ROTL	$KxB[1],TBL3,$KxB[1]
+||	SWAP2	$KxD[0],$KxD[0]			; rotate by 16
+||	SWAP2	$KxD[1],$KxD[1]
+	ROTL	$KxB[2],TBL3,$KxB[2]
+||	ROTL	$KxB[3],TBL3,$KxB[3]
+||	SWAP2	$KxD[2],$KxD[2]
+||	SWAP2	$KxD[3],$KxD[3]
+
+	XOR	$KxE[0],$KxD[0],$KxE[0]
+||	XOR	$KxE[1],$KxD[1],$KxE[1]
+|| [B0]	GMPY4	$K[0],A24,$Kx9[0]		; �0x09
+|| [B0]	GMPY4	$K[1],A24,$Kx9[1]
+||	ADDAW	$KPA,4,$KPA
+	XOR	$KxE[2],$KxD[2],$KxE[2]
+||	XOR	$KxE[3],$KxD[3],$KxE[3]
+|| [B0]	GMPY4	$K[2],A24,$Kx9[2]
+|| [B0]	GMPY4	$K[3],A24,$Kx9[3]
+||	ADDAW	$KPB,4,$KPB
+
+	XOR	$KxB[0],$KxE[0],$KxE[0]
+||	XOR	$KxB[1],$KxE[1],$KxE[1]
+|| [B0]	GMPY4	$K[0],B24,$KxB[0]		; �0x0B
+|| [B0]	GMPY4	$K[1],B24,$KxB[1]
+	XOR	$KxB[2],$KxE[2],$KxE[2]
+||	XOR	$KxB[3],$KxE[3],$KxE[3]
+|| [B0]	GMPY4	$K[2],B24,$KxB[2]
+|| [B0]	GMPY4	$K[3],B24,$KxB[3]
+||	STW	$KxE[0],*${KPA}[-4]
+||	STW	$KxE[1],*${KPB}[-3]
+	STW	$KxE[2],*${KPA}[-2]
+||	STW	$KxE[3],*${KPB}[-1]
+|| [B0]	SUB	B0,1,B0
+	SPKERNEL
+;;====================================================================
+	BNOP	B31,3
+	MVC	B30,GFPGFR			; restore GFPGFR(*)
+	MVK	0,RET
+	.endasmfunc
+___
+# (*)	Even though ABI doesn't specify GFPGFR as non-volatile, there
+#	are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+	.global	_AES_ctr32_encrypt
+_AES_ctr32_encrypt:
+	.asmfunc
+	LDNDW	*${ivp}[0],A31:A30	; load counter value
+||	MV	$blocks,A2		; reassign $blocks
+||	DMV	RA,$key,B27:B26		; reassign RA and $key
+	LDNDW	*${ivp}[1],B31:B30
+||	MVK	0,B2			; don't let __encrypt load input
+||	MVK	0,A1			; and postpone writing output
+	.if	.BIG_ENDIAN
+	NOP
+	.else
+	NOP	4
+	SWAP2	B31,B31			; keep least significant 32 bits
+	SWAP4	B31,B31			; in host byte order
+	.endif
+ctr32_loop?:
+   [A2]	BNOP	__encrypt
+|| [A1]	XOR	A29,A9,A9		; input^Ek(counter)
+|| [A1]	XOR	A28,A8,A8
+|| [A2]	LDNDW	*INP++,A29:A28		; load input
+  [!A2]	BNOP	B27			; return
+|| [A1]	XOR	B29,B9,B9
+|| [A1]	XOR	B28,B8,B8
+|| [A2]	LDNDW	*INP++,B29:B28
+	.if	.BIG_ENDIAN
+   [A1]	STNDW	A9:A8,*OUT++		; save output
+|| [A2]	DMV	A31,A30,A9:A8		; pass counter value to __encrypt
+   [A1]	STNDW	B9:B8,*OUT++
+|| [A2]	DMV	B31,B30,B9:B8
+|| [A2]	ADD	B30,1,B30		; counter++
+	.else
+   [A1]	STNDW	A9:A8,*OUT++		; save output
+|| [A2]	DMV	A31,A30,A9:A8
+|| [A2]	SWAP2	B31,B0
+|| [A2]	ADD	B31,1,B31		; counter++
+   [A1]	STNDW	B9:B8,*OUT++
+|| [A2]	MV	B30,B8
+|| [A2]	SWAP4	B0,B9
+	.endif
+   [A2]	ADDKPC	ctr32_loop?,RA		; return to ctr32_loop?
+|| [A2]	MV	B26,KEY			; pass $key
+|| [A2]	SUB	A2,1,A2			; $blocks--
+||[!A1]	MVK	1,A1
+	NOP
+	NOP
+	.endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+	.sect	".const:aes_asm"
+	.align	128
+AES_Te:
+	.byte	0xc6,0x63,0x63,0xa5,	0xf8,0x7c,0x7c,0x84
+	.byte	0xee,0x77,0x77,0x99,	0xf6,0x7b,0x7b,0x8d
+	.byte	0xff,0xf2,0xf2,0x0d,	0xd6,0x6b,0x6b,0xbd
+	.byte	0xde,0x6f,0x6f,0xb1,	0x91,0xc5,0xc5,0x54
+	.byte	0x60,0x30,0x30,0x50,	0x02,0x01,0x01,0x03
+	.byte	0xce,0x67,0x67,0xa9,	0x56,0x2b,0x2b,0x7d
+	.byte	0xe7,0xfe,0xfe,0x19,	0xb5,0xd7,0xd7,0x62
+	.byte	0x4d,0xab,0xab,0xe6,	0xec,0x76,0x76,0x9a
+	.byte	0x8f,0xca,0xca,0x45,	0x1f,0x82,0x82,0x9d
+	.byte	0x89,0xc9,0xc9,0x40,	0xfa,0x7d,0x7d,0x87
+	.byte	0xef,0xfa,0xfa,0x15,	0xb2,0x59,0x59,0xeb
+	.byte	0x8e,0x47,0x47,0xc9,	0xfb,0xf0,0xf0,0x0b
+	.byte	0x41,0xad,0xad,0xec,	0xb3,0xd4,0xd4,0x67
+	.byte	0x5f,0xa2,0xa2,0xfd,	0x45,0xaf,0xaf,0xea
+	.byte	0x23,0x9c,0x9c,0xbf,	0x53,0xa4,0xa4,0xf7
+	.byte	0xe4,0x72,0x72,0x96,	0x9b,0xc0,0xc0,0x5b
+	.byte	0x75,0xb7,0xb7,0xc2,	0xe1,0xfd,0xfd,0x1c
+	.byte	0x3d,0x93,0x93,0xae,	0x4c,0x26,0x26,0x6a
+	.byte	0x6c,0x36,0x36,0x5a,	0x7e,0x3f,0x3f,0x41
+	.byte	0xf5,0xf7,0xf7,0x02,	0x83,0xcc,0xcc,0x4f
+	.byte	0x68,0x34,0x34,0x5c,	0x51,0xa5,0xa5,0xf4
+	.byte	0xd1,0xe5,0xe5,0x34,	0xf9,0xf1,0xf1,0x08
+	.byte	0xe2,0x71,0x71,0x93,	0xab,0xd8,0xd8,0x73
+	.byte	0x62,0x31,0x31,0x53,	0x2a,0x15,0x15,0x3f
+	.byte	0x08,0x04,0x04,0x0c,	0x95,0xc7,0xc7,0x52
+	.byte	0x46,0x23,0x23,0x65,	0x9d,0xc3,0xc3,0x5e
+	.byte	0x30,0x18,0x18,0x28,	0x37,0x96,0x96,0xa1
+	.byte	0x0a,0x05,0x05,0x0f,	0x2f,0x9a,0x9a,0xb5
+	.byte	0x0e,0x07,0x07,0x09,	0x24,0x12,0x12,0x36
+	.byte	0x1b,0x80,0x80,0x9b,	0xdf,0xe2,0xe2,0x3d
+	.byte	0xcd,0xeb,0xeb,0x26,	0x4e,0x27,0x27,0x69
+	.byte	0x7f,0xb2,0xb2,0xcd,	0xea,0x75,0x75,0x9f
+	.byte	0x12,0x09,0x09,0x1b,	0x1d,0x83,0x83,0x9e
+	.byte	0x58,0x2c,0x2c,0x74,	0x34,0x1a,0x1a,0x2e
+	.byte	0x36,0x1b,0x1b,0x2d,	0xdc,0x6e,0x6e,0xb2
+	.byte	0xb4,0x5a,0x5a,0xee,	0x5b,0xa0,0xa0,0xfb
+	.byte	0xa4,0x52,0x52,0xf6,	0x76,0x3b,0x3b,0x4d
+	.byte	0xb7,0xd6,0xd6,0x61,	0x7d,0xb3,0xb3,0xce
+	.byte	0x52,0x29,0x29,0x7b,	0xdd,0xe3,0xe3,0x3e
+	.byte	0x5e,0x2f,0x2f,0x71,	0x13,0x84,0x84,0x97
+	.byte	0xa6,0x53,0x53,0xf5,	0xb9,0xd1,0xd1,0x68
+	.byte	0x00,0x00,0x00,0x00,	0xc1,0xed,0xed,0x2c
+	.byte	0x40,0x20,0x20,0x60,	0xe3,0xfc,0xfc,0x1f
+	.byte	0x79,0xb1,0xb1,0xc8,	0xb6,0x5b,0x5b,0xed
+	.byte	0xd4,0x6a,0x6a,0xbe,	0x8d,0xcb,0xcb,0x46
+	.byte	0x67,0xbe,0xbe,0xd9,	0x72,0x39,0x39,0x4b
+	.byte	0x94,0x4a,0x4a,0xde,	0x98,0x4c,0x4c,0xd4
+	.byte	0xb0,0x58,0x58,0xe8,	0x85,0xcf,0xcf,0x4a
+	.byte	0xbb,0xd0,0xd0,0x6b,	0xc5,0xef,0xef,0x2a
+	.byte	0x4f,0xaa,0xaa,0xe5,	0xed,0xfb,0xfb,0x16
+	.byte	0x86,0x43,0x43,0xc5,	0x9a,0x4d,0x4d,0xd7
+	.byte	0x66,0x33,0x33,0x55,	0x11,0x85,0x85,0x94
+	.byte	0x8a,0x45,0x45,0xcf,	0xe9,0xf9,0xf9,0x10
+	.byte	0x04,0x02,0x02,0x06,	0xfe,0x7f,0x7f,0x81
+	.byte	0xa0,0x50,0x50,0xf0,	0x78,0x3c,0x3c,0x44
+	.byte	0x25,0x9f,0x9f,0xba,	0x4b,0xa8,0xa8,0xe3
+	.byte	0xa2,0x51,0x51,0xf3,	0x5d,0xa3,0xa3,0xfe
+	.byte	0x80,0x40,0x40,0xc0,	0x05,0x8f,0x8f,0x8a
+	.byte	0x3f,0x92,0x92,0xad,	0x21,0x9d,0x9d,0xbc
+	.byte	0x70,0x38,0x38,0x48,	0xf1,0xf5,0xf5,0x04
+	.byte	0x63,0xbc,0xbc,0xdf,	0x77,0xb6,0xb6,0xc1
+	.byte	0xaf,0xda,0xda,0x75,	0x42,0x21,0x21,0x63
+	.byte	0x20,0x10,0x10,0x30,	0xe5,0xff,0xff,0x1a
+	.byte	0xfd,0xf3,0xf3,0x0e,	0xbf,0xd2,0xd2,0x6d
+	.byte	0x81,0xcd,0xcd,0x4c,	0x18,0x0c,0x0c,0x14
+	.byte	0x26,0x13,0x13,0x35,	0xc3,0xec,0xec,0x2f
+	.byte	0xbe,0x5f,0x5f,0xe1,	0x35,0x97,0x97,0xa2
+	.byte	0x88,0x44,0x44,0xcc,	0x2e,0x17,0x17,0x39
+	.byte	0x93,0xc4,0xc4,0x57,	0x55,0xa7,0xa7,0xf2
+	.byte	0xfc,0x7e,0x7e,0x82,	0x7a,0x3d,0x3d,0x47
+	.byte	0xc8,0x64,0x64,0xac,	0xba,0x5d,0x5d,0xe7
+	.byte	0x32,0x19,0x19,0x2b,	0xe6,0x73,0x73,0x95
+	.byte	0xc0,0x60,0x60,0xa0,	0x19,0x81,0x81,0x98
+	.byte	0x9e,0x4f,0x4f,0xd1,	0xa3,0xdc,0xdc,0x7f
+	.byte	0x44,0x22,0x22,0x66,	0x54,0x2a,0x2a,0x7e
+	.byte	0x3b,0x90,0x90,0xab,	0x0b,0x88,0x88,0x83
+	.byte	0x8c,0x46,0x46,0xca,	0xc7,0xee,0xee,0x29
+	.byte	0x6b,0xb8,0xb8,0xd3,	0x28,0x14,0x14,0x3c
+	.byte	0xa7,0xde,0xde,0x79,	0xbc,0x5e,0x5e,0xe2
+	.byte	0x16,0x0b,0x0b,0x1d,	0xad,0xdb,0xdb,0x76
+	.byte	0xdb,0xe0,0xe0,0x3b,	0x64,0x32,0x32,0x56
+	.byte	0x74,0x3a,0x3a,0x4e,	0x14,0x0a,0x0a,0x1e
+	.byte	0x92,0x49,0x49,0xdb,	0x0c,0x06,0x06,0x0a
+	.byte	0x48,0x24,0x24,0x6c,	0xb8,0x5c,0x5c,0xe4
+	.byte	0x9f,0xc2,0xc2,0x5d,	0xbd,0xd3,0xd3,0x6e
+	.byte	0x43,0xac,0xac,0xef,	0xc4,0x62,0x62,0xa6
+	.byte	0x39,0x91,0x91,0xa8,	0x31,0x95,0x95,0xa4
+	.byte	0xd3,0xe4,0xe4,0x37,	0xf2,0x79,0x79,0x8b
+	.byte	0xd5,0xe7,0xe7,0x32,	0x8b,0xc8,0xc8,0x43
+	.byte	0x6e,0x37,0x37,0x59,	0xda,0x6d,0x6d,0xb7
+	.byte	0x01,0x8d,0x8d,0x8c,	0xb1,0xd5,0xd5,0x64
+	.byte	0x9c,0x4e,0x4e,0xd2,	0x49,0xa9,0xa9,0xe0
+	.byte	0xd8,0x6c,0x6c,0xb4,	0xac,0x56,0x56,0xfa
+	.byte	0xf3,0xf4,0xf4,0x07,	0xcf,0xea,0xea,0x25
+	.byte	0xca,0x65,0x65,0xaf,	0xf4,0x7a,0x7a,0x8e
+	.byte	0x47,0xae,0xae,0xe9,	0x10,0x08,0x08,0x18
+	.byte	0x6f,0xba,0xba,0xd5,	0xf0,0x78,0x78,0x88
+	.byte	0x4a,0x25,0x25,0x6f,	0x5c,0x2e,0x2e,0x72
+	.byte	0x38,0x1c,0x1c,0x24,	0x57,0xa6,0xa6,0xf1
+	.byte	0x73,0xb4,0xb4,0xc7,	0x97,0xc6,0xc6,0x51
+	.byte	0xcb,0xe8,0xe8,0x23,	0xa1,0xdd,0xdd,0x7c
+	.byte	0xe8,0x74,0x74,0x9c,	0x3e,0x1f,0x1f,0x21
+	.byte	0x96,0x4b,0x4b,0xdd,	0x61,0xbd,0xbd,0xdc
+	.byte	0x0d,0x8b,0x8b,0x86,	0x0f,0x8a,0x8a,0x85
+	.byte	0xe0,0x70,0x70,0x90,	0x7c,0x3e,0x3e,0x42
+	.byte	0x71,0xb5,0xb5,0xc4,	0xcc,0x66,0x66,0xaa
+	.byte	0x90,0x48,0x48,0xd8,	0x06,0x03,0x03,0x05
+	.byte	0xf7,0xf6,0xf6,0x01,	0x1c,0x0e,0x0e,0x12
+	.byte	0xc2,0x61,0x61,0xa3,	0x6a,0x35,0x35,0x5f
+	.byte	0xae,0x57,0x57,0xf9,	0x69,0xb9,0xb9,0xd0
+	.byte	0x17,0x86,0x86,0x91,	0x99,0xc1,0xc1,0x58
+	.byte	0x3a,0x1d,0x1d,0x27,	0x27,0x9e,0x9e,0xb9
+	.byte	0xd9,0xe1,0xe1,0x38,	0xeb,0xf8,0xf8,0x13
+	.byte	0x2b,0x98,0x98,0xb3,	0x22,0x11,0x11,0x33
+	.byte	0xd2,0x69,0x69,0xbb,	0xa9,0xd9,0xd9,0x70
+	.byte	0x07,0x8e,0x8e,0x89,	0x33,0x94,0x94,0xa7
+	.byte	0x2d,0x9b,0x9b,0xb6,	0x3c,0x1e,0x1e,0x22
+	.byte	0x15,0x87,0x87,0x92,	0xc9,0xe9,0xe9,0x20
+	.byte	0x87,0xce,0xce,0x49,	0xaa,0x55,0x55,0xff
+	.byte	0x50,0x28,0x28,0x78,	0xa5,0xdf,0xdf,0x7a
+	.byte	0x03,0x8c,0x8c,0x8f,	0x59,0xa1,0xa1,0xf8
+	.byte	0x09,0x89,0x89,0x80,	0x1a,0x0d,0x0d,0x17
+	.byte	0x65,0xbf,0xbf,0xda,	0xd7,0xe6,0xe6,0x31
+	.byte	0x84,0x42,0x42,0xc6,	0xd0,0x68,0x68,0xb8
+	.byte	0x82,0x41,0x41,0xc3,	0x29,0x99,0x99,0xb0
+	.byte	0x5a,0x2d,0x2d,0x77,	0x1e,0x0f,0x0f,0x11
+	.byte	0x7b,0xb0,0xb0,0xcb,	0xa8,0x54,0x54,0xfc
+	.byte	0x6d,0xbb,0xbb,0xd6,	0x2c,0x16,0x16,0x3a
+AES_Te4:
+	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+rcon:
+	.byte	0x01,0x00,0x00,0x00,	0x02,0x00,0x00,0x00
+	.byte	0x04,0x00,0x00,0x00,	0x08,0x00,0x00,0x00
+	.byte	0x10,0x00,0x00,0x00,	0x20,0x00,0x00,0x00
+	.byte	0x40,0x00,0x00,0x00,	0x80,0x00,0x00,0x00
+	.byte	0x1B,0x00,0x00,0x00,	0x36,0x00,0x00,0x00
+	.align	128
+AES_Td:
+	.byte	0x51,0xf4,0xa7,0x50,	0x7e,0x41,0x65,0x53
+	.byte	0x1a,0x17,0xa4,0xc3,	0x3a,0x27,0x5e,0x96
+	.byte	0x3b,0xab,0x6b,0xcb,	0x1f,0x9d,0x45,0xf1
+	.byte	0xac,0xfa,0x58,0xab,	0x4b,0xe3,0x03,0x93
+	.byte	0x20,0x30,0xfa,0x55,	0xad,0x76,0x6d,0xf6
+	.byte	0x88,0xcc,0x76,0x91,	0xf5,0x02,0x4c,0x25
+	.byte	0x4f,0xe5,0xd7,0xfc,	0xc5,0x2a,0xcb,0xd7
+	.byte	0x26,0x35,0x44,0x80,	0xb5,0x62,0xa3,0x8f
+	.byte	0xde,0xb1,0x5a,0x49,	0x25,0xba,0x1b,0x67
+	.byte	0x45,0xea,0x0e,0x98,	0x5d,0xfe,0xc0,0xe1
+	.byte	0xc3,0x2f,0x75,0x02,	0x81,0x4c,0xf0,0x12
+	.byte	0x8d,0x46,0x97,0xa3,	0x6b,0xd3,0xf9,0xc6
+	.byte	0x03,0x8f,0x5f,0xe7,	0x15,0x92,0x9c,0x95
+	.byte	0xbf,0x6d,0x7a,0xeb,	0x95,0x52,0x59,0xda
+	.byte	0xd4,0xbe,0x83,0x2d,	0x58,0x74,0x21,0xd3
+	.byte	0x49,0xe0,0x69,0x29,	0x8e,0xc9,0xc8,0x44
+	.byte	0x75,0xc2,0x89,0x6a,	0xf4,0x8e,0x79,0x78
+	.byte	0x99,0x58,0x3e,0x6b,	0x27,0xb9,0x71,0xdd
+	.byte	0xbe,0xe1,0x4f,0xb6,	0xf0,0x88,0xad,0x17
+	.byte	0xc9,0x20,0xac,0x66,	0x7d,0xce,0x3a,0xb4
+	.byte	0x63,0xdf,0x4a,0x18,	0xe5,0x1a,0x31,0x82
+	.byte	0x97,0x51,0x33,0x60,	0x62,0x53,0x7f,0x45
+	.byte	0xb1,0x64,0x77,0xe0,	0xbb,0x6b,0xae,0x84
+	.byte	0xfe,0x81,0xa0,0x1c,	0xf9,0x08,0x2b,0x94
+	.byte	0x70,0x48,0x68,0x58,	0x8f,0x45,0xfd,0x19
+	.byte	0x94,0xde,0x6c,0x87,	0x52,0x7b,0xf8,0xb7
+	.byte	0xab,0x73,0xd3,0x23,	0x72,0x4b,0x02,0xe2
+	.byte	0xe3,0x1f,0x8f,0x57,	0x66,0x55,0xab,0x2a
+	.byte	0xb2,0xeb,0x28,0x07,	0x2f,0xb5,0xc2,0x03
+	.byte	0x86,0xc5,0x7b,0x9a,	0xd3,0x37,0x08,0xa5
+	.byte	0x30,0x28,0x87,0xf2,	0x23,0xbf,0xa5,0xb2
+	.byte	0x02,0x03,0x6a,0xba,	0xed,0x16,0x82,0x5c
+	.byte	0x8a,0xcf,0x1c,0x2b,	0xa7,0x79,0xb4,0x92
+	.byte	0xf3,0x07,0xf2,0xf0,	0x4e,0x69,0xe2,0xa1
+	.byte	0x65,0xda,0xf4,0xcd,	0x06,0x05,0xbe,0xd5
+	.byte	0xd1,0x34,0x62,0x1f,	0xc4,0xa6,0xfe,0x8a
+	.byte	0x34,0x2e,0x53,0x9d,	0xa2,0xf3,0x55,0xa0
+	.byte	0x05,0x8a,0xe1,0x32,	0xa4,0xf6,0xeb,0x75
+	.byte	0x0b,0x83,0xec,0x39,	0x40,0x60,0xef,0xaa
+	.byte	0x5e,0x71,0x9f,0x06,	0xbd,0x6e,0x10,0x51
+	.byte	0x3e,0x21,0x8a,0xf9,	0x96,0xdd,0x06,0x3d
+	.byte	0xdd,0x3e,0x05,0xae,	0x4d,0xe6,0xbd,0x46
+	.byte	0x91,0x54,0x8d,0xb5,	0x71,0xc4,0x5d,0x05
+	.byte	0x04,0x06,0xd4,0x6f,	0x60,0x50,0x15,0xff
+	.byte	0x19,0x98,0xfb,0x24,	0xd6,0xbd,0xe9,0x97
+	.byte	0x89,0x40,0x43,0xcc,	0x67,0xd9,0x9e,0x77
+	.byte	0xb0,0xe8,0x42,0xbd,	0x07,0x89,0x8b,0x88
+	.byte	0xe7,0x19,0x5b,0x38,	0x79,0xc8,0xee,0xdb
+	.byte	0xa1,0x7c,0x0a,0x47,	0x7c,0x42,0x0f,0xe9
+	.byte	0xf8,0x84,0x1e,0xc9,	0x00,0x00,0x00,0x00
+	.byte	0x09,0x80,0x86,0x83,	0x32,0x2b,0xed,0x48
+	.byte	0x1e,0x11,0x70,0xac,	0x6c,0x5a,0x72,0x4e
+	.byte	0xfd,0x0e,0xff,0xfb,	0x0f,0x85,0x38,0x56
+	.byte	0x3d,0xae,0xd5,0x1e,	0x36,0x2d,0x39,0x27
+	.byte	0x0a,0x0f,0xd9,0x64,	0x68,0x5c,0xa6,0x21
+	.byte	0x9b,0x5b,0x54,0xd1,	0x24,0x36,0x2e,0x3a
+	.byte	0x0c,0x0a,0x67,0xb1,	0x93,0x57,0xe7,0x0f
+	.byte	0xb4,0xee,0x96,0xd2,	0x1b,0x9b,0x91,0x9e
+	.byte	0x80,0xc0,0xc5,0x4f,	0x61,0xdc,0x20,0xa2
+	.byte	0x5a,0x77,0x4b,0x69,	0x1c,0x12,0x1a,0x16
+	.byte	0xe2,0x93,0xba,0x0a,	0xc0,0xa0,0x2a,0xe5
+	.byte	0x3c,0x22,0xe0,0x43,	0x12,0x1b,0x17,0x1d
+	.byte	0x0e,0x09,0x0d,0x0b,	0xf2,0x8b,0xc7,0xad
+	.byte	0x2d,0xb6,0xa8,0xb9,	0x14,0x1e,0xa9,0xc8
+	.byte	0x57,0xf1,0x19,0x85,	0xaf,0x75,0x07,0x4c
+	.byte	0xee,0x99,0xdd,0xbb,	0xa3,0x7f,0x60,0xfd
+	.byte	0xf7,0x01,0x26,0x9f,	0x5c,0x72,0xf5,0xbc
+	.byte	0x44,0x66,0x3b,0xc5,	0x5b,0xfb,0x7e,0x34
+	.byte	0x8b,0x43,0x29,0x76,	0xcb,0x23,0xc6,0xdc
+	.byte	0xb6,0xed,0xfc,0x68,	0xb8,0xe4,0xf1,0x63
+	.byte	0xd7,0x31,0xdc,0xca,	0x42,0x63,0x85,0x10
+	.byte	0x13,0x97,0x22,0x40,	0x84,0xc6,0x11,0x20
+	.byte	0x85,0x4a,0x24,0x7d,	0xd2,0xbb,0x3d,0xf8
+	.byte	0xae,0xf9,0x32,0x11,	0xc7,0x29,0xa1,0x6d
+	.byte	0x1d,0x9e,0x2f,0x4b,	0xdc,0xb2,0x30,0xf3
+	.byte	0x0d,0x86,0x52,0xec,	0x77,0xc1,0xe3,0xd0
+	.byte	0x2b,0xb3,0x16,0x6c,	0xa9,0x70,0xb9,0x99
+	.byte	0x11,0x94,0x48,0xfa,	0x47,0xe9,0x64,0x22
+	.byte	0xa8,0xfc,0x8c,0xc4,	0xa0,0xf0,0x3f,0x1a
+	.byte	0x56,0x7d,0x2c,0xd8,	0x22,0x33,0x90,0xef
+	.byte	0x87,0x49,0x4e,0xc7,	0xd9,0x38,0xd1,0xc1
+	.byte	0x8c,0xca,0xa2,0xfe,	0x98,0xd4,0x0b,0x36
+	.byte	0xa6,0xf5,0x81,0xcf,	0xa5,0x7a,0xde,0x28
+	.byte	0xda,0xb7,0x8e,0x26,	0x3f,0xad,0xbf,0xa4
+	.byte	0x2c,0x3a,0x9d,0xe4,	0x50,0x78,0x92,0x0d
+	.byte	0x6a,0x5f,0xcc,0x9b,	0x54,0x7e,0x46,0x62
+	.byte	0xf6,0x8d,0x13,0xc2,	0x90,0xd8,0xb8,0xe8
+	.byte	0x2e,0x39,0xf7,0x5e,	0x82,0xc3,0xaf,0xf5
+	.byte	0x9f,0x5d,0x80,0xbe,	0x69,0xd0,0x93,0x7c
+	.byte	0x6f,0xd5,0x2d,0xa9,	0xcf,0x25,0x12,0xb3
+	.byte	0xc8,0xac,0x99,0x3b,	0x10,0x18,0x7d,0xa7
+	.byte	0xe8,0x9c,0x63,0x6e,	0xdb,0x3b,0xbb,0x7b
+	.byte	0xcd,0x26,0x78,0x09,	0x6e,0x59,0x18,0xf4
+	.byte	0xec,0x9a,0xb7,0x01,	0x83,0x4f,0x9a,0xa8
+	.byte	0xe6,0x95,0x6e,0x65,	0xaa,0xff,0xe6,0x7e
+	.byte	0x21,0xbc,0xcf,0x08,	0xef,0x15,0xe8,0xe6
+	.byte	0xba,0xe7,0x9b,0xd9,	0x4a,0x6f,0x36,0xce
+	.byte	0xea,0x9f,0x09,0xd4,	0x29,0xb0,0x7c,0xd6
+	.byte	0x31,0xa4,0xb2,0xaf,	0x2a,0x3f,0x23,0x31
+	.byte	0xc6,0xa5,0x94,0x30,	0x35,0xa2,0x66,0xc0
+	.byte	0x74,0x4e,0xbc,0x37,	0xfc,0x82,0xca,0xa6
+	.byte	0xe0,0x90,0xd0,0xb0,	0x33,0xa7,0xd8,0x15
+	.byte	0xf1,0x04,0x98,0x4a,	0x41,0xec,0xda,0xf7
+	.byte	0x7f,0xcd,0x50,0x0e,	0x17,0x91,0xf6,0x2f
+	.byte	0x76,0x4d,0xd6,0x8d,	0x43,0xef,0xb0,0x4d
+	.byte	0xcc,0xaa,0x4d,0x54,	0xe4,0x96,0x04,0xdf
+	.byte	0x9e,0xd1,0xb5,0xe3,	0x4c,0x6a,0x88,0x1b
+	.byte	0xc1,0x2c,0x1f,0xb8,	0x46,0x65,0x51,0x7f
+	.byte	0x9d,0x5e,0xea,0x04,	0x01,0x8c,0x35,0x5d
+	.byte	0xfa,0x87,0x74,0x73,	0xfb,0x0b,0x41,0x2e
+	.byte	0xb3,0x67,0x1d,0x5a,	0x92,0xdb,0xd2,0x52
+	.byte	0xe9,0x10,0x56,0x33,	0x6d,0xd6,0x47,0x13
+	.byte	0x9a,0xd7,0x61,0x8c,	0x37,0xa1,0x0c,0x7a
+	.byte	0x59,0xf8,0x14,0x8e,	0xeb,0x13,0x3c,0x89
+	.byte	0xce,0xa9,0x27,0xee,	0xb7,0x61,0xc9,0x35
+	.byte	0xe1,0x1c,0xe5,0xed,	0x7a,0x47,0xb1,0x3c
+	.byte	0x9c,0xd2,0xdf,0x59,	0x55,0xf2,0x73,0x3f
+	.byte	0x18,0x14,0xce,0x79,	0x73,0xc7,0x37,0xbf
+	.byte	0x53,0xf7,0xcd,0xea,	0x5f,0xfd,0xaa,0x5b
+	.byte	0xdf,0x3d,0x6f,0x14,	0x78,0x44,0xdb,0x86
+	.byte	0xca,0xaf,0xf3,0x81,	0xb9,0x68,0xc4,0x3e
+	.byte	0x38,0x24,0x34,0x2c,	0xc2,0xa3,0x40,0x5f
+	.byte	0x16,0x1d,0xc3,0x72,	0xbc,0xe2,0x25,0x0c
+	.byte	0x28,0x3c,0x49,0x8b,	0xff,0x0d,0x95,0x41
+	.byte	0x39,0xa8,0x01,0x71,	0x08,0x0c,0xb3,0xde
+	.byte	0xd8,0xb4,0xe4,0x9c,	0x64,0x56,0xc1,0x90
+	.byte	0x7b,0xcb,0x84,0x61,	0xd5,0x32,0xb6,0x70
+	.byte	0x48,0x6c,0x5c,0x74,	0xd0,0xb8,0x57,0x42
+AES_Td4:
+	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+	.cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
index 2ce6deffc8..76cf130e91 100644
--- a/crypto/aes/asm/aes-mips.pl
+++ b/crypto/aes/asm/aes-mips.pl
@@ -47,7 +47,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
 
 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -70,7 +70,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
 #
 ######################################################################
 
-$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;
 
 for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
 open STDOUT,">$output";
@@ -89,7 +89,7 @@ $code.=<<___;
 # include <openssl/fipssyms.h>
 #endif
 
-#if !defined(__vxworks) || defined(__pic__)
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
 .option	pic2
 #endif
 .set	noat
diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl
index 7c52cbe5f9..58a98232d1 100644
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -548,7 +548,7 @@ Lenc_loop:
 	xor	$s2,$t2,$acc14
 	xor	$s3,$t3,$acc15
 	addi	$key,$key,16
-	bdnz-	Lenc_loop
+	bdnz	Lenc_loop
 
 	addi	$Tbl2,$Tbl0,2048
 	nop
@@ -982,7 +982,7 @@ Ldec_loop:
 	xor	$s2,$t2,$acc14
 	xor	$s3,$t3,$acc15
 	addi	$key,$key,16
-	bdnz-	Ldec_loop
+	bdnz	Ldec_loop
 
 	addi	$Tbl2,$Tbl0,2048
 	nop
diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
new file mode 100755
index 0000000000..7ef189d249
--- /dev/null
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -0,0 +1,3726 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#		CBC en-/decrypt	CTR	XTS
+# POWER8[le]	3.96/0.72	0.74	1.1
+# POWER8[be]	3.75/0.65	0.66	1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T	=8;
+	$LRSAVE	=2*$SIZE_T;
+	$STU	="stdu";
+	$POP	="ld";
+	$PUSH	="std";
+	$UCMP	="cmpld";
+	$SHL	="sldi";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T	=4;
+	$LRSAVE	=$SIZE_T;
+	$STU	="stwu";
+	$POP	="lwz";
+	$PUSH	="stw";
+	$UCMP	="cmplw";
+	$SHL	="slwi";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{	# Key setup procedures						#
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine	"any"
+
+.text
+
+.align	7
+rcon:
+.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
+.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
+.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
+.long	0,0,0,0						?asis
+Lconsts:
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$ptr	 #vvvvv "distance between . and rcon
+	addi	$ptr,$ptr,-0x48
+	mtlr	r0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl	.${prefix}_set_encrypt_key
+.align	5
+.${prefix}_set_encrypt_key:
+Lset_encrypt_key:
+	mflr		r11
+	$PUSH		r11,$LRSAVE($sp)
+
+	li		$ptr,-1
+	${UCMP}i	$inp,0
+	beq-		Lenc_key_abort		# if ($inp==0) return -1;
+	${UCMP}i	$out,0
+	beq-		Lenc_key_abort		# if ($out==0) return -1;
+	li		$ptr,-2
+	cmpwi		$bits,128
+	blt-		Lenc_key_abort
+	cmpwi		$bits,256
+	bgt-		Lenc_key_abort
+	andi.		r0,$bits,0x3f
+	bne-		Lenc_key_abort
+
+	lis		r0,0xfff0
+	mfspr		$vrsave,256
+	mtspr		256,r0
+
+	bl		Lconsts
+	mtlr		r11
+
+	neg		r9,$inp
+	lvx		$in0,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	lvsr		$key,0,r9		# borrow $key
+	li		r8,0x20
+	cmpwi		$bits,192
+	lvx		$in1,0,$inp
+	le?vspltisb	$mask,0x0f		# borrow $mask
+	lvx		$rcon,0,$ptr
+	le?vxor		$key,$key,$mask		# adjust for byte swap
+	lvx		$mask,r8,$ptr
+	addi		$ptr,$ptr,0x10
+	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
+	li		$cnt,8
+	vxor		$zero,$zero,$zero
+	mtctr		$cnt
+
+	?lvsr		$outperm,0,$out
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$zero,$outmask,$outperm
+
+	blt		Loop128
+	addi		$inp,$inp,8
+	beq		L192
+	addi		$inp,$inp,8
+	b		L256
+
+.align	4
+Loop128:
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in0,$in0,$key
+	bdnz		Loop128
+
+	lvx		$rcon,0,$ptr		# last two round keys
+
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in0,$in0,$key
+
+	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vxor		$in0,$in0,$key
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+
+	addi		$inp,$out,15		# 15 is not typo
+	addi		$out,$out,0x50
+
+	li		$rounds,10
+	b		Ldone
+
+.align	4
+L192:
+	lvx		$tmp,0,$inp
+	li		$cnt,4
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
+	vspltisb	$key,8			# borrow $key
+	mtctr		$cnt
+	vsububm		$mask,$mask,$key	# adjust the mask
+
+Loop192:
+	vperm		$key,$in1,$in1,$mask	# roate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	vcipherlast	$key,$key,$rcon
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+
+	 vsldoi		$stage,$zero,$in1,8
+	vspltw		$tmp,$in0,3
+	vxor		$tmp,$tmp,$in1
+	vsldoi		$in1,$zero,$in1,12	# >>32
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in1,$in1,$tmp
+	vxor		$in0,$in0,$key
+	vxor		$in1,$in1,$key
+	 vsldoi		$stage,$stage,$in0,8
+
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$stage,$stage,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	 vsldoi		$stage,$in0,$in1,8
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	 vperm		$outtail,$stage,$stage,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vspltw		$tmp,$in0,3
+	vxor		$tmp,$tmp,$in1
+	vsldoi		$in1,$zero,$in1,12	# >>32
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in1,$in1,$tmp
+	vxor		$in0,$in0,$key
+	vxor		$in1,$in1,$key
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$inp,$out,15		# 15 is not typo
+	 addi		$out,$out,16
+	bdnz		Loop192
+
+	li		$rounds,12
+	addi		$out,$out,0x20
+	b		Ldone
+
+.align	4
+L256:
+	lvx		$tmp,0,$inp
+	li		$cnt,7
+	li		$rounds,14
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
+	mtctr		$cnt
+
+Loop256:
+	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
+	vsldoi		$tmp,$zero,$in0,12	# >>32
+	 vperm		$outtail,$in1,$in1,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	vcipherlast	$key,$key,$rcon
+	 stvx		$stage,0,$out
+	 addi		$out,$out,16
+
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in0,$in0,$tmp
+	 vadduwm	$rcon,$rcon,$rcon
+	vxor		$in0,$in0,$key
+	 vperm		$outtail,$in0,$in0,$outperm	# rotate
+	 vsel		$stage,$outhead,$outtail,$outmask
+	 vmr		$outhead,$outtail
+	 stvx		$stage,0,$out
+	 addi		$inp,$out,15		# 15 is not typo
+	 addi		$out,$out,16
+	bdz		Ldone
+
+	vspltw		$key,$in0,3		# just splat
+	vsldoi		$tmp,$zero,$in1,12	# >>32
+	vsbox		$key,$key
+
+	vxor		$in1,$in1,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in1,$in1,$tmp
+	vsldoi		$tmp,$zero,$tmp,12	# >>32
+	vxor		$in1,$in1,$tmp
+
+	vxor		$in1,$in1,$key
+	b		Loop256
+
+.align	4
+Ldone:
+	lvx		$in1,0,$inp		# redundant in aligned case
+	vsel		$in1,$outhead,$in1,$outmask
+	stvx		$in1,0,$inp
+	li		$ptr,0
+	mtspr		256,$vrsave
+	stw		$rounds,0($out)
+
+Lenc_key_abort:
+	mr		r3,$ptr
+	blr
+	.long		0
+	.byte		0,12,0x14,1,0,0,3,0
+	.long		0
+.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl	.${prefix}_set_decrypt_key
+.align	5
+.${prefix}_set_decrypt_key:
+	$STU		$sp,-$FRAME($sp)
+	mflr		r10
+	$PUSH		r10,$FRAME+$LRSAVE($sp)
+	bl		Lset_encrypt_key
+	mtlr		r10
+
+	cmpwi		r3,0
+	bne-		Ldec_key_abort
+
+	slwi		$cnt,$rounds,4
+	subi		$inp,$out,240		# first round key
+	srwi		$rounds,$rounds,1
+	add		$out,$inp,$cnt		# last round key
+	mtctr		$rounds
+
+Ldeckey:
+	lwz		r0, 0($inp)
+	lwz		r6, 4($inp)
+	lwz		r7, 8($inp)
+	lwz		r8, 12($inp)
+	addi		$inp,$inp,16
+	lwz		r9, 0($out)
+	lwz		r10,4($out)
+	lwz		r11,8($out)
+	lwz		r12,12($out)
+	stw		r0, 0($out)
+	stw		r6, 4($out)
+	stw		r7, 8($out)
+	stw		r8, 12($out)
+	subi		$out,$out,16
+	stw		r9, -16($inp)
+	stw		r10,-12($inp)
+	stw		r11,-8($inp)
+	stw		r12,-4($inp)
+	bdnz		Ldeckey
+
+	xor		r3,r3,r3		# return value
+Ldec_key_abort:
+	addi		$sp,$sp,$FRAME
+	blr
+	.long		0
+	.byte		0,12,4,1,0x80,0,3,0
+	.long		0
+.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{	# Single block en- and decrypt procedures			#
+sub gen_block () {
+my $dir = shift;
+my $n   = $dir eq "de" ? "n" : "";
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl	.${prefix}_${dir}crypt
+.align	5
+.${prefix}_${dir}crypt:
+	lwz		$rounds,240($key)
+	lis		r0,0xfc00
+	mfspr		$vrsave,256
+	li		$idx,15			# 15 is not typo
+	mtspr		256,r0
+
+	lvx		v0,0,$inp
+	neg		r11,$out
+	lvx		v1,$idx,$inp
+	lvsl		v2,0,$inp		# inpperm
+	le?vspltisb	v4,0x0f
+	?lvsl		v3,0,r11		# outperm
+	le?vxor		v2,v2,v4
+	li		$idx,16
+	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
+	lvx		v1,0,$key
+	?lvsl		v5,0,$key		# keyperm
+	srwi		$rounds,$rounds,1
+	lvx		v2,$idx,$key
+	addi		$idx,$idx,16
+	subi		$rounds,$rounds,1
+	?vperm		v1,v1,v2,v5		# align round key
+
+	vxor		v0,v0,v1
+	lvx		v1,$idx,$key
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Loop_${dir}c:
+	?vperm		v2,v2,v1,v5
+	v${n}cipher	v0,v0,v2
+	lvx		v2,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		v1,v1,v2,v5
+	v${n}cipher	v0,v0,v1
+	lvx		v1,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_${dir}c
+
+	?vperm		v2,v2,v1,v5
+	v${n}cipher	v0,v0,v2
+	lvx		v2,$idx,$key
+	?vperm		v1,v1,v2,v5
+	v${n}cipherlast	v0,v0,v1
+
+	vspltisb	v2,-1
+	vxor		v1,v1,v1
+	li		$idx,15			# 15 is not typo
+	?vperm		v2,v1,v2,v3		# outmask
+	le?vxor		v3,v3,v4
+	lvx		v1,0,$out		# outhead
+	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
+	vsel		v1,v1,v0,v2
+	lvx		v4,$idx,$out
+	stvx		v1,0,$out
+	vsel		v0,v0,v4,v2
+	stvx		v0,$idx,$out
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,3,0
+	.long		0
+.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+#########################################################################
+{{{	# CBC en- and decrypt procedures				#
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
+						map("v$_",(4..10));
+$code.=<<___;
+.globl	.${prefix}_cbc_encrypt
+.align	5
+.${prefix}_cbc_encrypt:
+	${UCMP}i	$len,16
+	bltlr-
+
+	cmpwi		$enc,0			# test direction
+	lis		r0,0xffe0
+	mfspr		$vrsave,256
+	mtspr		256,r0
+
+	li		$idx,15
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	le?vspltisb	$tmp,0x0f
+
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$ivec,$ivec,$inptail,$inpperm
+
+	neg		r11,$inp
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
+	lwz		$rounds,240($key)
+
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+
+	srwi		$rounds,$rounds,1
+	li		$idx,16
+	subi		$rounds,$rounds,1
+	beq		Lcbc_dec
+
+Lcbc_enc:
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	mtctr		$rounds
+	subi		$len,$len,16		# len-=16
+
+	lvx		$rndkey0,0,$key
+	 vperm		$inout,$inout,$inptail,$inpperm
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	vxor		$inout,$inout,$ivec
+
+Loop_cbc_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_cbc_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$ivec,$inout,$rndkey0
+	${UCMP}i	$len,16
+
+	vperm		$tmp,$ivec,$ivec,$outperm
+	vsel		$inout,$outhead,$tmp,$outmask
+	vmr		$outhead,$tmp
+	stvx		$inout,0,$out
+	addi		$out,$out,16
+	bge		Lcbc_enc
+
+	b		Lcbc_done
+
+.align	4
+Lcbc_dec:
+	${UCMP}i	$len,128
+	bge		_aesp8_cbc_decrypt8x
+	vmr		$tmp,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	mtctr		$rounds
+	subi		$len,$len,16		# len-=16
+
+	lvx		$rndkey0,0,$key
+	 vperm		$tmp,$tmp,$inptail,$inpperm
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$tmp,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+
+Loop_cbc_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_cbc_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipherlast	$inout,$inout,$rndkey0
+	${UCMP}i	$len,16
+
+	vxor		$inout,$inout,$ivec
+	vmr		$ivec,$tmp
+	vperm		$tmp,$inout,$inout,$outperm
+	vsel		$inout,$outhead,$tmp,$outmask
+	vmr		$outhead,$tmp
+	stvx		$inout,0,$out
+	addi		$out,$out,16
+	bge		Lcbc_dec
+
+Lcbc_done:
+	addi		$out,$out,-1
+	lvx		$inout,0,$out		# redundant in aligned case
+	vsel		$inout,$outhead,$inout,$outmask
+	stvx		$inout,0,$out
+
+	neg		$enc,$ivp		# write [unaligned] iv
+	li		$idx,15			# 15 is not typo
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	vspltisb	$outmask,-1
+	le?vspltisb	$tmp,0x0f
+	?lvsl		$outperm,0,$enc
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+	lvx		$outhead,0,$ivp
+	vperm		$ivec,$ivec,$ivec,$outperm
+	vsel		$inout,$outhead,$ivec,$outmask
+	lvx		$inptail,$idx,$ivp
+	stvx		$inout,0,$ivp
+	vsel		$inout,$ivec,$inptail,$outmask
+	stvx		$inout,$idx,$ivp
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,6,0
+	.long		0
+___
+#########################################################################
+{{	# Optimized CBC decrypt procedure				#
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+    $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
+
+$code.=<<___;
+.align	5
+_aesp8_cbc_decrypt8x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+	subi		$len,$len,128		# bias
+
+	lvx		$rndkey0,$x00,$key	# load key schedule
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	lvx		v31,$x00,$key
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_cbc_dec_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_cbc_dec_key
+
+	lvx		v26,$x10,$key
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$out0,$x70,$key		# borrow $out0
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$out0,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	#lvx		$inptail,0,$inp		# "caller" already did this
+	#addi		$inp,$inp,15		# 15 is not typo
+	subi		$inp,$inp,15		# undo "caller"
+
+	 le?li		$idx,8
+	lvx_u		$in0,$x00,$inp		# load first 8 "words"
+	 le?lvsl	$inpperm,0,$idx
+	 le?vspltisb	$tmp,0x0f
+	lvx_u		$in1,$x10,$inp
+	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
+	lvx_u		$in2,$x20,$inp
+	 le?vperm	$in0,$in0,$in0,$inpperm
+	lvx_u		$in3,$x30,$inp
+	 le?vperm	$in1,$in1,$in1,$inpperm
+	lvx_u		$in4,$x40,$inp
+	 le?vperm	$in2,$in2,$in2,$inpperm
+	vxor		$out0,$in0,$rndkey0
+	lvx_u		$in5,$x50,$inp
+	 le?vperm	$in3,$in3,$in3,$inpperm
+	vxor		$out1,$in1,$rndkey0
+	lvx_u		$in6,$x60,$inp
+	 le?vperm	$in4,$in4,$in4,$inpperm
+	vxor		$out2,$in2,$rndkey0
+	lvx_u		$in7,$x70,$inp
+	addi		$inp,$inp,0x80
+	 le?vperm	$in5,$in5,$in5,$inpperm
+	vxor		$out3,$in3,$rndkey0
+	 le?vperm	$in6,$in6,$in6,$inpperm
+	vxor		$out4,$in4,$rndkey0
+	 le?vperm	$in7,$in7,$in7,$inpperm
+	vxor		$out5,$in5,$rndkey0
+	vxor		$out6,$in6,$rndkey0
+	vxor		$out7,$in7,$rndkey0
+
+	mtctr		$rounds
+	b		Loop_cbc_dec8x
+.align	5
+Loop_cbc_dec8x:
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_cbc_dec8x
+
+	subic		$len,$len,128		# $len-=128
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+
+	subfe.		r0,r0,r0		# borrow?-1:0
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+
+	and		r0,r0,$len
+	vncipher	$out0,$out0,v26
+	vncipher	$out1,$out1,v26
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	vncipher	$out4,$out4,v26
+	vncipher	$out5,$out5,v26
+	vncipher	$out6,$out6,v26
+	vncipher	$out7,$out7,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in7 are loaded
+						# with last "words"
+	vncipher	$out0,$out0,v27
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	vncipher	$out4,$out4,v27
+	vncipher	$out5,$out5,v27
+	vncipher	$out6,$out6,v27
+	vncipher	$out7,$out7,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	vncipher	$out1,$out1,v28
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	vncipher	$out4,$out4,v28
+	vncipher	$out5,$out5,v28
+	vncipher	$out6,$out6,v28
+	vncipher	$out7,$out7,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vncipher	$out0,$out0,v29
+	vncipher	$out1,$out1,v29
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	vncipher	$out4,$out4,v29
+	vncipher	$out5,$out5,v29
+	vncipher	$out6,$out6,v29
+	vncipher	$out7,$out7,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+
+	vncipher	$out0,$out0,v30
+	 vxor		$ivec,$ivec,v31		# xor with last round key
+	vncipher	$out1,$out1,v30
+	 vxor		$in0,$in0,v31
+	vncipher	$out2,$out2,v30
+	 vxor		$in1,$in1,v31
+	vncipher	$out3,$out3,v30
+	 vxor		$in2,$in2,v31
+	vncipher	$out4,$out4,v30
+	 vxor		$in3,$in3,v31
+	vncipher	$out5,$out5,v30
+	 vxor		$in4,$in4,v31
+	vncipher	$out6,$out6,v30
+	 vxor		$in5,$in5,v31
+	vncipher	$out7,$out7,v30
+	 vxor		$in6,$in6,v31
+
+	vncipherlast	$out0,$out0,$ivec
+	vncipherlast	$out1,$out1,$in0
+	 lvx_u		$in0,$x00,$inp		# load next input block
+	vncipherlast	$out2,$out2,$in1
+	 lvx_u		$in1,$x10,$inp
+	vncipherlast	$out3,$out3,$in2
+	 le?vperm	$in0,$in0,$in0,$inpperm
+	 lvx_u		$in2,$x20,$inp
+	vncipherlast	$out4,$out4,$in3
+	 le?vperm	$in1,$in1,$in1,$inpperm
+	 lvx_u		$in3,$x30,$inp
+	vncipherlast	$out5,$out5,$in4
+	 le?vperm	$in2,$in2,$in2,$inpperm
+	 lvx_u		$in4,$x40,$inp
+	vncipherlast	$out6,$out6,$in5
+	 le?vperm	$in3,$in3,$in3,$inpperm
+	 lvx_u		$in5,$x50,$inp
+	vncipherlast	$out7,$out7,$in6
+	 le?vperm	$in4,$in4,$in4,$inpperm
+	 lvx_u		$in6,$x60,$inp
+	vmr		$ivec,$in7
+	 le?vperm	$in5,$in5,$in5,$inpperm
+	 lvx_u		$in7,$x70,$inp
+	 addi		$inp,$inp,0x80
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	 le?vperm	$in6,$in6,$in6,$inpperm
+	 vxor		$out0,$in0,$rndkey0
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	 le?vperm	$in7,$in7,$in7,$inpperm
+	 vxor		$out1,$in1,$rndkey0
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	 vxor		$out2,$in2,$rndkey0
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	 vxor		$out3,$in3,$rndkey0
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	 vxor		$out4,$in4,$rndkey0
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	 vxor		$out5,$in5,$rndkey0
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x60,$out
+	 vxor		$out6,$in6,$rndkey0
+	stvx_u		$out7,$x70,$out
+	addi		$out,$out,0x80
+	 vxor		$out7,$in7,$rndkey0
+
+	mtctr		$rounds
+	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
+
+	addic.		$len,$len,128
+	beq		Lcbc_dec8x_done
+	nop
+	nop
+
+Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_cbc_dec8x_tail
+
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	vncipher	$out6,$out6,v24
+	vncipher	$out7,$out7,v24
+
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	vncipher	$out6,$out6,v25
+	vncipher	$out7,$out7,v25
+
+	vncipher	$out1,$out1,v26
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	vncipher	$out4,$out4,v26
+	vncipher	$out5,$out5,v26
+	vncipher	$out6,$out6,v26
+	vncipher	$out7,$out7,v26
+
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	vncipher	$out4,$out4,v27
+	vncipher	$out5,$out5,v27
+	vncipher	$out6,$out6,v27
+	vncipher	$out7,$out7,v27
+
+	vncipher	$out1,$out1,v28
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	vncipher	$out4,$out4,v28
+	vncipher	$out5,$out5,v28
+	vncipher	$out6,$out6,v28
+	vncipher	$out7,$out7,v28
+
+	vncipher	$out1,$out1,v29
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	vncipher	$out4,$out4,v29
+	vncipher	$out5,$out5,v29
+	vncipher	$out6,$out6,v29
+	vncipher	$out7,$out7,v29
+
+	vncipher	$out1,$out1,v30
+	 vxor		$ivec,$ivec,v31		# last round key
+	vncipher	$out2,$out2,v30
+	 vxor		$in1,$in1,v31
+	vncipher	$out3,$out3,v30
+	 vxor		$in2,$in2,v31
+	vncipher	$out4,$out4,v30
+	 vxor		$in3,$in3,v31
+	vncipher	$out5,$out5,v30
+	 vxor		$in4,$in4,v31
+	vncipher	$out6,$out6,v30
+	 vxor		$in5,$in5,v31
+	vncipher	$out7,$out7,v30
+	 vxor		$in6,$in6,v31
+
+	cmplwi		$len,32			# switch($len)
+	blt		Lcbc_dec8x_one
+	nop
+	beq		Lcbc_dec8x_two
+	cmplwi		$len,64
+	blt		Lcbc_dec8x_three
+	nop
+	beq		Lcbc_dec8x_four
+	cmplwi		$len,96
+	blt		Lcbc_dec8x_five
+	nop
+	beq		Lcbc_dec8x_six
+
+Lcbc_dec8x_seven:
+	vncipherlast	$out1,$out1,$ivec
+	vncipherlast	$out2,$out2,$in1
+	vncipherlast	$out3,$out3,$in2
+	vncipherlast	$out4,$out4,$in3
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out1,$out1,$out1,$inpperm
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x00,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x10,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x20,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x30,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x40,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x50,$out
+	stvx_u		$out7,$x60,$out
+	addi		$out,$out,0x70
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_six:
+	vncipherlast	$out2,$out2,$ivec
+	vncipherlast	$out3,$out3,$in2
+	vncipherlast	$out4,$out4,$in3
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out2,$out2,$out2,$inpperm
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x00,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x10,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x20,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x30,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x40,$out
+	stvx_u		$out7,$x50,$out
+	addi		$out,$out,0x60
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_five:
+	vncipherlast	$out3,$out3,$ivec
+	vncipherlast	$out4,$out4,$in3
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out3,$out3,$out3,$inpperm
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x00,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x10,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x20,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x30,$out
+	stvx_u		$out7,$x40,$out
+	addi		$out,$out,0x50
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_four:
+	vncipherlast	$out4,$out4,$ivec
+	vncipherlast	$out5,$out5,$in4
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out4,$out4,$out4,$inpperm
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x00,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x10,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x20,$out
+	stvx_u		$out7,$x30,$out
+	addi		$out,$out,0x40
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_three:
+	vncipherlast	$out5,$out5,$ivec
+	vncipherlast	$out6,$out6,$in5
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out5,$out5,$out5,$inpperm
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x00,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x10,$out
+	stvx_u		$out7,$x20,$out
+	addi		$out,$out,0x30
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_two:
+	vncipherlast	$out6,$out6,$ivec
+	vncipherlast	$out7,$out7,$in6
+	vmr		$ivec,$in7
+
+	le?vperm	$out6,$out6,$out6,$inpperm
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x00,$out
+	stvx_u		$out7,$x10,$out
+	addi		$out,$out,0x20
+	b		Lcbc_dec8x_done
+
+.align	5
+Lcbc_dec8x_one:
+	vncipherlast	$out7,$out7,$ivec
+	vmr		$ivec,$in7
+
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out7,0,$out
+	addi		$out,$out,0x10
+
+Lcbc_dec8x_done:
+	le?vperm	$ivec,$ivec,$ivec,$inpperm
+	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
+
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$inpperm,r10,$sp	# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}}	}}}
+
+#########################################################################
+{{{	# CTR procedure[s]						#
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+						map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl	.${prefix}_ctr32_encrypt_blocks
+.align	5
+.${prefix}_ctr32_encrypt_blocks:
+	${UCMP}i	$len,1
+	bltlr-
+
+	lis		r0,0xfff0
+	mfspr		$vrsave,256
+	mtspr		256,r0
+
+	li		$idx,15
+	vxor		$rndkey0,$rndkey0,$rndkey0
+	le?vspltisb	$tmp,0x0f
+
+	lvx		$ivec,0,$ivp		# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	 vspltisb	$one,1
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$ivec,$ivec,$inptail,$inpperm
+	 vsldoi		$one,$rndkey0,$one,1
+
+	neg		r11,$inp
+	?lvsl		$keyperm,0,$key		# prepare for unaligned key
+	lwz		$rounds,240($key)
+
+	lvsr		$inpperm,0,r11		# prepare for unaligned load
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,15		# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	srwi		$rounds,$rounds,1
+	li		$idx,16
+	subi		$rounds,$rounds,1
+
+	${UCMP}i	$len,8
+	bge		_aesp8_ctr32_encrypt8x
+
+	?lvsr		$outperm,0,$out		# prepare for unaligned store
+	vspltisb	$outmask,-1
+	lvx		$outhead,0,$out
+	?vperm		$outmask,$rndkey0,$outmask,$outperm
+	le?vxor		$outperm,$outperm,$tmp
+
+	lvx		$rndkey0,0,$key
+	mtctr		$rounds
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$ivec,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	b		Loop_ctr32_enc
+
+.align	5
+Loop_ctr32_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key
+	addi		$idx,$idx,16
+	bdnz		Loop_ctr32_enc
+
+	vadduwm		$ivec,$ivec,$one
+	 vmr		$dat,$inptail
+	 lvx		$inptail,0,$inp
+	 addi		$inp,$inp,16
+	 subic.		$len,$len,1		# blocks--
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key
+	 vperm		$dat,$dat,$inptail,$inpperm
+	 li		$idx,16
+	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
+	 lvx		$rndkey0,0,$key
+	vxor		$dat,$dat,$rndkey1	# last round key
+	vcipherlast	$inout,$inout,$dat
+
+	 lvx		$rndkey1,$idx,$key
+	 addi		$idx,$idx,16
+	vperm		$inout,$inout,$inout,$outperm
+	vsel		$dat,$outhead,$inout,$outmask
+	 mtctr		$rounds
+	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vmr		$outhead,$inout
+	 vxor		$inout,$ivec,$rndkey0
+	 lvx		$rndkey0,$idx,$key
+	 addi		$idx,$idx,16
+	stvx		$dat,0,$out
+	addi		$out,$out,16
+	bne		Loop_ctr32_enc
+
+	addi		$out,$out,-1
+	lvx		$inout,0,$out		# redundant in aligned case
+	vsel		$inout,$outhead,$inout,$outmask
+	stvx		$inout,0,$out
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,6,0
+	.long		0
+___
+#########################################################################
+{{	# Optimized CTR procedure					#
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+    $x00=0 if ($flavour =~ /osx/);
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align	5
+_aesp8_ctr32_encrypt8x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key	# load key schedule
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	lvx		v31,$x00,$key
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_ctr32_enc_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key
+	addi		$key,$key,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_ctr32_enc_key
+
+	lvx		v26,$x10,$key
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$out0,$x70,$key		# borrow $out0
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$out0,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	vadduwm		$two,$one,$one
+	subi		$inp,$inp,15		# undo "caller"
+	$SHL		$len,$len,4
+
+	vadduwm		$out1,$ivec,$one	# counter values ...
+	vadduwm		$out2,$ivec,$two
+	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
+	 le?li		$idx,8
+	vadduwm		$out3,$out1,$two
+	vxor		$out1,$out1,$rndkey0
+	 le?lvsl	$inpperm,0,$idx
+	vadduwm		$out4,$out2,$two
+	vxor		$out2,$out2,$rndkey0
+	 le?vspltisb	$tmp,0x0f
+	vadduwm		$out5,$out3,$two
+	vxor		$out3,$out3,$rndkey0
+	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
+	vadduwm		$out6,$out4,$two
+	vxor		$out4,$out4,$rndkey0
+	vadduwm		$out7,$out5,$two
+	vxor		$out5,$out5,$rndkey0
+	vadduwm		$ivec,$out6,$two	# next counter value
+	vxor		$out6,$out6,$rndkey0
+	vxor		$out7,$out7,$rndkey0
+
+	mtctr		$rounds
+	b		Loop_ctr32_enc8x
+.align	5
+Loop_ctr32_enc8x:
+	vcipher 	$out0,$out0,v24
+	vcipher 	$out1,$out1,v24
+	vcipher 	$out2,$out2,v24
+	vcipher 	$out3,$out3,v24
+	vcipher 	$out4,$out4,v24
+	vcipher 	$out5,$out5,v24
+	vcipher 	$out6,$out6,v24
+	vcipher 	$out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher 	$out0,$out0,v25
+	vcipher 	$out1,$out1,v25
+	vcipher 	$out2,$out2,v25
+	vcipher 	$out3,$out3,v25
+	vcipher 	$out4,$out4,v25
+	vcipher 	$out5,$out5,v25
+	vcipher 	$out6,$out6,v25
+	vcipher 	$out7,$out7,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_ctr32_enc8x
+
+	subic		r11,$len,256		# $len-256, borrow $key_
+	vcipher 	$out0,$out0,v24
+	vcipher 	$out1,$out1,v24
+	vcipher 	$out2,$out2,v24
+	vcipher 	$out3,$out3,v24
+	vcipher 	$out4,$out4,v24
+	vcipher 	$out5,$out5,v24
+	vcipher 	$out6,$out6,v24
+	vcipher 	$out7,$out7,v24
+
+	subfe		r0,r0,r0		# borrow?-1:0
+	vcipher 	$out0,$out0,v25
+	vcipher 	$out1,$out1,v25
+	vcipher 	$out2,$out2,v25
+	vcipher 	$out3,$out3,v25
+	vcipher 	$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+	vcipher		$out6,$out6,v25
+	vcipher		$out7,$out7,v25
+
+	and		r0,r0,r11
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v26
+	vcipher		$out1,$out1,v26
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	vcipher		$out4,$out4,v26
+	vcipher		$out5,$out5,v26
+	vcipher		$out6,$out6,v26
+	vcipher		$out7,$out7,v26
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	subic		$len,$len,129		# $len-=129
+	vcipher		$out0,$out0,v27
+	addi		$len,$len,1		# $len-=128 really
+	vcipher		$out1,$out1,v27
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	vcipher		$out4,$out4,v27
+	vcipher		$out5,$out5,v27
+	vcipher		$out6,$out6,v27
+	vcipher		$out7,$out7,v27
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+
+	vcipher		$out0,$out0,v28
+	 lvx_u		$in0,$x00,$inp		# load input
+	vcipher		$out1,$out1,v28
+	 lvx_u		$in1,$x10,$inp
+	vcipher		$out2,$out2,v28
+	 lvx_u		$in2,$x20,$inp
+	vcipher		$out3,$out3,v28
+	 lvx_u		$in3,$x30,$inp
+	vcipher		$out4,$out4,v28
+	 lvx_u		$in4,$x40,$inp
+	vcipher		$out5,$out5,v28
+	 lvx_u		$in5,$x50,$inp
+	vcipher		$out6,$out6,v28
+	 lvx_u		$in6,$x60,$inp
+	vcipher		$out7,$out7,v28
+	 lvx_u		$in7,$x70,$inp
+	 addi		$inp,$inp,0x80
+
+	vcipher		$out0,$out0,v29
+	 le?vperm	$in0,$in0,$in0,$inpperm
+	vcipher		$out1,$out1,v29
+	 le?vperm	$in1,$in1,$in1,$inpperm
+	vcipher		$out2,$out2,v29
+	 le?vperm	$in2,$in2,$in2,$inpperm
+	vcipher		$out3,$out3,v29
+	 le?vperm	$in3,$in3,$in3,$inpperm
+	vcipher		$out4,$out4,v29
+	 le?vperm	$in4,$in4,$in4,$inpperm
+	vcipher		$out5,$out5,v29
+	 le?vperm	$in5,$in5,$in5,$inpperm
+	vcipher		$out6,$out6,v29
+	 le?vperm	$in6,$in6,$in6,$inpperm
+	vcipher		$out7,$out7,v29
+	 le?vperm	$in7,$in7,$in7,$inpperm
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in7 are loaded
+						# with last "words"
+	subfe.		r0,r0,r0		# borrow?-1:0
+	vcipher		$out0,$out0,v30
+	 vxor		$in0,$in0,v31		# xor with last round key
+	vcipher		$out1,$out1,v30
+	 vxor		$in1,$in1,v31
+	vcipher		$out2,$out2,v30
+	 vxor		$in2,$in2,v31
+	vcipher		$out3,$out3,v30
+	 vxor		$in3,$in3,v31
+	vcipher		$out4,$out4,v30
+	 vxor		$in4,$in4,v31
+	vcipher		$out5,$out5,v30
+	 vxor		$in5,$in5,v31
+	vcipher		$out6,$out6,v30
+	 vxor		$in6,$in6,v31
+	vcipher		$out7,$out7,v30
+	 vxor		$in7,$in7,v31
+
+	bne		Lctr32_enc8x_break	# did $len-129 borrow?
+
+	vcipherlast	$in0,$out0,$in0
+	vcipherlast	$in1,$out1,$in1
+	 vadduwm	$out1,$ivec,$one	# counter values ...
+	vcipherlast	$in2,$out2,$in2
+	 vadduwm	$out2,$ivec,$two
+	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
+	vcipherlast	$in3,$out3,$in3
+	 vadduwm	$out3,$out1,$two
+	 vxor		$out1,$out1,$rndkey0
+	vcipherlast	$in4,$out4,$in4
+	 vadduwm	$out4,$out2,$two
+	 vxor		$out2,$out2,$rndkey0
+	vcipherlast	$in5,$out5,$in5
+	 vadduwm	$out5,$out3,$two
+	 vxor		$out3,$out3,$rndkey0
+	vcipherlast	$in6,$out6,$in6
+	 vadduwm	$out6,$out4,$two
+	 vxor		$out4,$out4,$rndkey0
+	vcipherlast	$in7,$out7,$in7
+	 vadduwm	$out7,$out5,$two
+	 vxor		$out5,$out5,$rndkey0
+	le?vperm	$in0,$in0,$in0,$inpperm
+	 vadduwm	$ivec,$out6,$two	# next counter value
+	 vxor		$out6,$out6,$rndkey0
+	le?vperm	$in1,$in1,$in1,$inpperm
+	 vxor		$out7,$out7,$rndkey0
+	mtctr		$rounds
+
+	 vcipher	$out0,$out0,v24
+	stvx_u		$in0,$x00,$out
+	le?vperm	$in2,$in2,$in2,$inpperm
+	 vcipher	$out1,$out1,v24
+	stvx_u		$in1,$x10,$out
+	le?vperm	$in3,$in3,$in3,$inpperm
+	 vcipher	$out2,$out2,v24
+	stvx_u		$in2,$x20,$out
+	le?vperm	$in4,$in4,$in4,$inpperm
+	 vcipher	$out3,$out3,v24
+	stvx_u		$in3,$x30,$out
+	le?vperm	$in5,$in5,$in5,$inpperm
+	 vcipher	$out4,$out4,v24
+	stvx_u		$in4,$x40,$out
+	le?vperm	$in6,$in6,$in6,$inpperm
+	 vcipher	$out5,$out5,v24
+	stvx_u		$in5,$x50,$out
+	le?vperm	$in7,$in7,$in7,$inpperm
+	 vcipher	$out6,$out6,v24
+	stvx_u		$in6,$x60,$out
+	 vcipher	$out7,$out7,v24
+	stvx_u		$in7,$x70,$out
+	addi		$out,$out,0x80
+
+	b		Loop_ctr32_enc8x_middle
+
+.align	5
+Lctr32_enc8x_break:
+	cmpwi		$len,-0x60
+	blt		Lctr32_enc8x_one
+	nop
+	beq		Lctr32_enc8x_two
+	cmpwi		$len,-0x40
+	blt		Lctr32_enc8x_three
+	nop
+	beq		Lctr32_enc8x_four
+	cmpwi		$len,-0x20
+	blt		Lctr32_enc8x_five
+	nop
+	beq		Lctr32_enc8x_six
+	cmpwi		$len,0x00
+	blt		Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+	vcipherlast	$out0,$out0,$in0
+	vcipherlast	$out1,$out1,$in1
+	vcipherlast	$out2,$out2,$in2
+	vcipherlast	$out3,$out3,$in3
+	vcipherlast	$out4,$out4,$in4
+	vcipherlast	$out5,$out5,$in5
+	vcipherlast	$out6,$out6,$in6
+	vcipherlast	$out7,$out7,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	le?vperm	$out7,$out7,$out7,$inpperm
+	stvx_u		$out6,$x60,$out
+	stvx_u		$out7,$x70,$out
+	addi		$out,$out,0x80
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_seven:
+	vcipherlast	$out0,$out0,$in1
+	vcipherlast	$out1,$out1,$in2
+	vcipherlast	$out2,$out2,$in3
+	vcipherlast	$out3,$out3,$in4
+	vcipherlast	$out4,$out4,$in5
+	vcipherlast	$out5,$out5,$in6
+	vcipherlast	$out6,$out6,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	le?vperm	$out6,$out6,$out6,$inpperm
+	stvx_u		$out5,$x50,$out
+	stvx_u		$out6,$x60,$out
+	addi		$out,$out,0x70
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_six:
+	vcipherlast	$out0,$out0,$in2
+	vcipherlast	$out1,$out1,$in3
+	vcipherlast	$out2,$out2,$in4
+	vcipherlast	$out3,$out3,$in5
+	vcipherlast	$out4,$out4,$in6
+	vcipherlast	$out5,$out5,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	le?vperm	$out5,$out5,$out5,$inpperm
+	stvx_u		$out4,$x40,$out
+	stvx_u		$out5,$x50,$out
+	addi		$out,$out,0x60
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_five:
+	vcipherlast	$out0,$out0,$in3
+	vcipherlast	$out1,$out1,$in4
+	vcipherlast	$out2,$out2,$in5
+	vcipherlast	$out3,$out3,$in6
+	vcipherlast	$out4,$out4,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$inpperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_four:
+	vcipherlast	$out0,$out0,$in4
+	vcipherlast	$out1,$out1,$in5
+	vcipherlast	$out2,$out2,$in6
+	vcipherlast	$out3,$out3,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$inpperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	b		Lctr32_enc8x_done
+
+.align	5
+Lctr32_enc8x_three:
+	vcipherlast	$out0,$out0,$in5
+	vcipherlast	$out1,$out1,$in6
+	vcipherlast	$out2,$out2,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	le?vperm	$out2,$out2,$out2,$inpperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	b		Lcbc_dec8x_done
+
+.align	5
+Lctr32_enc8x_two:
+	vcipherlast	$out0,$out0,$in6
+	vcipherlast	$out1,$out1,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	le?vperm	$out1,$out1,$out1,$inpperm
+	stvx_u		$out0,$x00,$out
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	b		Lcbc_dec8x_done
+
+.align	5
+Lctr32_enc8x_one:
+	vcipherlast	$out0,$out0,$in7
+
+	le?vperm	$out0,$out0,$out0,$inpperm
+	stvx_u		$out0,0,$out
+	addi		$out,$out,0x10
+
+Lctr32_enc8x_done:
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$inpperm,r10,$sp	# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+	stvx		$inpperm,r10,$sp
+	addi		r10,r10,32
+	stvx		$inpperm,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}}	}}}
+
+#########################################################################
+{{{	# XTS procedures						#
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
+my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
+my $taillen = $key2;
+
+   ($inp,$idx) = ($idx,$inp);				# reassign
+
+$code.=<<___;
+.globl	.${prefix}_xts_encrypt
+.align	5
+.${prefix}_xts_encrypt:
+	mr		$inp,r3				# reassign
+	li		r3,-1
+	${UCMP}i	$len,16
+	bltlr-
+
+	lis		r0,0xfff0
+	mfspr		r12,256				# save vrsave
+	li		r11,0
+	mtspr		256,r0
+
+	vspltisb	$seven,0x07			# 0x070707..07
+	le?lvsl		$leperm,r11,r11
+	le?vspltisb	$tmp,0x0f
+	le?vxor		$leperm,$leperm,$seven
+
+	li		$idx,15
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$tweak,$tweak,$inptail,$inpperm
+
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
+	lwz		$rounds,240($key2)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	neg		r11,$inp
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
+	lvx		$inout,0,$inp
+	addi		$inp,$inp,15			# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	lvx		$rndkey0,0,$key2
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Ltweak_xts_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	bdnz		Ltweak_xts_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$tweak,$tweak,$rndkey0
+
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
+	lwz		$rounds,240($key1)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
+	vspltisb	$tmp,1				# 0x010101..01
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
+
+	${UCMP}i	$len,96
+	bge		_aesp8_xts_encrypt6x
+
+	andi.		$taillen,$len,15
+	subic		r0,$len,32
+	subi		$taillen,$taillen,16
+	subfe		r0,r0,r0
+	and		r0,r0,$taillen
+	add		$inp,$inp,r0
+
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	mtctr		$rounds
+	b		Loop_xts_enc
+
+.align	5
+Loop_xts_enc:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_enc
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak
+	vcipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+	addi		$out,$out,16
+
+	subic.		$len,$len,16
+	beq		Lxts_enc_done
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+
+	subic		r0,$len,32
+	subfe		r0,r0,r0
+	and		r0,r0,$taillen
+	add		$inp,$inp,r0
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$output,$output,$rndkey0	# just in case $len<16
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	mtctr		$rounds
+	${UCMP}i	$len,16
+	bge		Loop_xts_enc
+
+	vxor		$output,$output,$tweak
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
+	vspltisb	$tmp,-1
+	vperm		$inptail,$inptail,$tmp,$inpperm
+	vsel		$inout,$inout,$output,$inptail
+
+	subi		r11,$out,17
+	subi		$out,$out,16
+	mtctr		$len
+	li		$len,16
+Loop_xts_enc_steal:
+	lbzu		r0,1(r11)
+	stb		r0,16(r11)
+	bdnz		Loop_xts_enc_steal
+
+	mtctr		$rounds
+	b		Loop_xts_enc			# one more time...
+
+Lxts_enc_done:
+	mtspr		256,r12				# restore vrsave
+	li		r3,0
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
+
+.globl	.${prefix}_xts_decrypt
+.align	5
+.${prefix}_xts_decrypt:
+	mr		$inp,r3				# reassign
+	li		r3,-1
+	${UCMP}i	$len,16
+	bltlr-
+
+	lis		r0,0xfff8
+	mfspr		r12,256				# save vrsave
+	li		r11,0
+	mtspr		256,r0
+
+	andi.		r0,$len,15
+	neg		r0,r0
+	andi.		r0,r0,16
+	sub		$len,$len,r0
+
+	vspltisb	$seven,0x07			# 0x070707..07
+	le?lvsl		$leperm,r11,r11
+	le?vspltisb	$tmp,0x0f
+	le?vxor		$leperm,$leperm,$seven
+
+	li		$idx,15
+	lvx		$tweak,0,$ivp			# load [unaligned] iv
+	lvsl		$inpperm,0,$ivp
+	lvx		$inptail,$idx,$ivp
+	le?vxor		$inpperm,$inpperm,$tmp
+	vperm		$tweak,$tweak,$inptail,$inpperm
+
+	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
+	lwz		$rounds,240($key2)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	neg		r11,$inp
+	lvsr		$inpperm,0,r11			# prepare for unaligned load
+	lvx		$inout,0,$inp
+	addi		$inp,$inp,15			# 15 is not typo
+	le?vxor		$inpperm,$inpperm,$tmp
+
+	lvx		$rndkey0,0,$key2
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+Ltweak_xts_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipher		$tweak,$tweak,$rndkey0
+	lvx		$rndkey0,$idx,$key2
+	addi		$idx,$idx,16
+	bdnz		Ltweak_xts_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vcipher		$tweak,$tweak,$rndkey1
+	lvx		$rndkey1,$idx,$key2
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vcipherlast	$tweak,$tweak,$rndkey0
+
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+
+	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
+	lwz		$rounds,240($key1)
+	srwi		$rounds,$rounds,1
+	subi		$rounds,$rounds,1
+	li		$idx,16
+
+	vslb		$eighty7,$seven,$seven		# 0x808080..80
+	vor		$eighty7,$eighty7,$seven	# 0x878787..87
+	vspltisb	$tmp,1				# 0x010101..01
+	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
+
+	${UCMP}i	$len,96
+	bge		_aesp8_xts_decrypt6x
+
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	mtctr		$rounds
+
+	${UCMP}i	$len,16
+	blt		Ltail_xts_dec
+	be?b		Loop_xts_dec
+
+.align	5
+Loop_xts_dec:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_dec
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak
+	vncipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+	addi		$out,$out,16
+
+	subic.		$len,$len,16
+	beq		Lxts_dec_done
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak,$tweak,$tmp
+
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$inout,$inout,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	mtctr		$rounds
+	${UCMP}i	$len,16
+	bge		Loop_xts_dec
+
+Ltail_xts_dec:
+	vsrab		$tmp,$tweak,$seven		# next tweak value
+	vaddubm		$tweak1,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	vxor		$tweak1,$tweak1,$tmp
+
+	subi		$inp,$inp,16
+	add		$inp,$inp,$len
+
+	vxor		$inout,$inout,$tweak		# :-(
+	vxor		$inout,$inout,$tweak1		# :-)
+
+Loop_xts_dec_short:
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vncipher	$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+	bdnz		Loop_xts_dec_short
+
+	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
+	vncipher	$inout,$inout,$rndkey1
+	lvx		$rndkey1,$idx,$key1
+	li		$idx,16
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+	vxor		$rndkey0,$rndkey0,$tweak1
+	vncipherlast	$output,$inout,$rndkey0
+
+	le?vperm	$tmp,$output,$output,$leperm
+	be?nop
+	le?stvx_u	$tmp,0,$out
+	be?stvx_u	$output,0,$out
+
+	vmr		$inout,$inptail
+	lvx		$inptail,0,$inp
+	#addi		$inp,$inp,16
+	lvx		$rndkey0,0,$key1
+	lvx		$rndkey1,$idx,$key1
+	addi		$idx,$idx,16
+	vperm		$inout,$inout,$inptail,$inpperm
+	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
+
+	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
+	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
+	vspltisb	$tmp,-1
+	vperm		$inptail,$inptail,$tmp,$inpperm
+	vsel		$inout,$inout,$output,$inptail
+
+	vxor		$rndkey0,$rndkey0,$tweak
+	vxor		$inout,$inout,$rndkey0
+	lvx		$rndkey0,$idx,$key1
+	addi		$idx,$idx,16
+
+	subi		r11,$out,1
+	mtctr		$len
+	li		$len,16
+Loop_xts_dec_steal:
+	lbzu		r0,1(r11)
+	stb		r0,16(r11)
+	bdnz		Loop_xts_dec_steal
+
+	mtctr		$rounds
+	b		Loop_xts_dec			# one more time...
+
+Lxts_dec_done:
+	mtspr		256,r12				# restore vrsave
+	li		r3,0
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,6,6,0
+	.long		0
+.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
+___
+#########################################################################
+{{	# Optimized XTS procedures					#
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+    $x00=0 if ($flavour =~ /osx/);
+my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
+my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
+			# v26-v31 last 6 round keys
+my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
+my $taillen=$x70;
+
+$code.=<<___;
+.align	5
+_aesp8_xts_encrypt6x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		r0
+	li		r7,`$FRAME+8*16+15`
+	li		r8,`$FRAME+8*16+31`
+	$PUSH		r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	stvx		v20,r7,$sp		# ABI says so
+	addi		r7,r7,32
+	stvx		v21,r8,$sp
+	addi		r8,r8,32
+	stvx		v22,r7,$sp
+	addi		r7,r7,32
+	stvx		v23,r8,$sp
+	addi		r8,r8,32
+	stvx		v24,r7,$sp
+	addi		r7,r7,32
+	stvx		v25,r8,$sp
+	addi		r8,r8,32
+	stvx		v26,r7,$sp
+	addi		r7,r7,32
+	stvx		v27,r8,$sp
+	addi		r8,r8,32
+	stvx		v28,r7,$sp
+	addi		r7,r7,32
+	stvx		v29,r8,$sp
+	addi		r8,r8,32
+	stvx		v30,r7,$sp
+	stvx		v31,r8,$sp
+	mr		r7,r0
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key1	# load key schedule
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	lvx		v31,$x00,$key1
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_xts_enc_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key1
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_xts_enc_key
+
+	lvx		v26,$x10,$key1
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key1
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key1
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key1
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key1
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key1
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$twk5,$x70,$key1	# borrow $twk5
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$twk5,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	 vperm		$in0,$inout,$inptail,$inpperm
+	 subi		$inp,$inp,31		# undo "caller"
+	vxor		$twk0,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out0,$in0,$twk0
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in1,$x10,$inp
+	vxor		$twk1,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in1,$in1,$in1,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out1,$in1,$twk1
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in2,$x20,$inp
+	 andi.		$taillen,$len,15
+	vxor		$twk2,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in2,$in2,$in2,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out2,$in2,$twk2
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in3,$x30,$inp
+	 sub		$len,$len,$taillen
+	vxor		$twk3,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in3,$in3,$in3,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out3,$in3,$twk3
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in4,$x40,$inp
+	 subi		$len,$len,0x60
+	vxor		$twk4,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in4,$in4,$in4,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out4,$in4,$twk4
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	vxor		$twk5,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in5,$in5,$in5,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out5,$in5,$twk5
+	vxor		$tweak,$tweak,$tmp
+
+	vxor		v31,v31,$rndkey0
+	mtctr		$rounds
+	b		Loop_xts_enc6x
+
+.align	5
+Loop_xts_enc6x:
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	vcipher		$out5,$out5,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_enc6x
+
+	subic		$len,$len,96		# $len-=96
+	 vxor		$in0,$twk0,v31		# xor with last round key
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk0,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vcipher		$out4,$out4,v24
+	vcipher		$out5,$out5,v24
+
+	subfe.		r0,r0,r0		# borrow?-1:0
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	 vxor		$tweak,$tweak,$tmp
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	 vxor		$in1,$twk1,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk1,$tweak,$rndkey0
+	vcipher		$out4,$out4,v25
+	vcipher		$out5,$out5,v25
+
+	and		r0,r0,$len
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vcipher		$out0,$out0,v26
+	vcipher		$out1,$out1,v26
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	 vxor		$tweak,$tweak,$tmp
+	vcipher		$out4,$out4,v26
+	vcipher		$out5,$out5,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in5 are loaded
+						# with last "words"
+	 vxor		$in2,$twk2,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk2,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vcipher		$out0,$out0,v27
+	vcipher		$out1,$out1,v27
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out4,$out4,v27
+	vcipher		$out5,$out5,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	 vxor		$tweak,$tweak,$tmp
+	vcipher		$out0,$out0,v28
+	vcipher		$out1,$out1,v28
+	 vxor		$in3,$twk3,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk3,$tweak,$rndkey0
+	vcipher		$out2,$out2,v28
+	vcipher		$out3,$out3,v28
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vcipher		$out4,$out4,v28
+	vcipher		$out5,$out5,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vand		$tmp,$tmp,$eighty7
+
+	vcipher		$out0,$out0,v29
+	vcipher		$out1,$out1,v29
+	 vxor		$tweak,$tweak,$tmp
+	vcipher		$out2,$out2,v29
+	vcipher		$out3,$out3,v29
+	 vxor		$in4,$twk4,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk4,$tweak,$rndkey0
+	vcipher		$out4,$out4,v29
+	vcipher		$out5,$out5,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+
+	vcipher		$out0,$out0,v30
+	vcipher		$out1,$out1,v30
+	 vand		$tmp,$tmp,$eighty7
+	vcipher		$out2,$out2,v30
+	vcipher		$out3,$out3,v30
+	 vxor		$tweak,$tweak,$tmp
+	vcipher		$out4,$out4,v30
+	vcipher		$out5,$out5,v30
+	 vxor		$in5,$twk5,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk5,$tweak,$rndkey0
+
+	vcipherlast	$out0,$out0,$in0
+	 lvx_u		$in0,$x00,$inp		# load next input block
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vcipherlast	$out1,$out1,$in1
+	 lvx_u		$in1,$x10,$inp
+	vcipherlast	$out2,$out2,$in2
+	 le?vperm	$in0,$in0,$in0,$leperm
+	 lvx_u		$in2,$x20,$inp
+	 vand		$tmp,$tmp,$eighty7
+	vcipherlast	$out3,$out3,$in3
+	 le?vperm	$in1,$in1,$in1,$leperm
+	 lvx_u		$in3,$x30,$inp
+	vcipherlast	$out4,$out4,$in4
+	 le?vperm	$in2,$in2,$in2,$leperm
+	 lvx_u		$in4,$x40,$inp
+	 vxor		$tweak,$tweak,$tmp
+	vcipherlast	$tmp,$out5,$in5		# last block might be needed
+						# in stealing mode
+	 le?vperm	$in3,$in3,$in3,$leperm
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	 le?vperm	$in4,$in4,$in4,$leperm
+	 le?vperm	$in5,$in5,$in5,$leperm
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	 vxor		$out0,$in0,$twk0
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	 vxor		$out1,$in1,$twk1
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	 vxor		$out2,$in2,$twk2
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	 vxor		$out3,$in3,$twk3
+	le?vperm	$out5,$tmp,$tmp,$leperm
+	stvx_u		$out4,$x40,$out
+	 vxor		$out4,$in4,$twk4
+	le?stvx_u	$out5,$x50,$out
+	be?stvx_u	$tmp, $x50,$out
+	 vxor		$out5,$in5,$twk5
+	addi		$out,$out,0x60
+
+	mtctr		$rounds
+	beq		Loop_xts_enc6x		# did $len-=96 borrow?
+
+	addic.		$len,$len,0x60
+	beq		Lxts_enc6x_zero
+	cmpwi		$len,0x20
+	blt		Lxts_enc6x_one
+	nop
+	beq		Lxts_enc6x_two
+	cmpwi		$len,0x40
+	blt		Lxts_enc6x_three
+	nop
+	beq		Lxts_enc6x_four
+
+Lxts_enc6x_five:
+	vxor		$out0,$in1,$twk0
+	vxor		$out1,$in2,$twk1
+	vxor		$out2,$in3,$twk2
+	vxor		$out3,$in4,$twk3
+	vxor		$out4,$in5,$twk4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk5		# unused tweak
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	vxor		$tmp,$out4,$twk5	# last block prep for stealing
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_four:
+	vxor		$out0,$in2,$twk0
+	vxor		$out1,$in3,$twk1
+	vxor		$out2,$in4,$twk2
+	vxor		$out3,$in5,$twk3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk4		# unused tweak
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	vxor		$tmp,$out3,$twk4	# last block prep for stealing
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_three:
+	vxor		$out0,$in3,$twk0
+	vxor		$out1,$in4,$twk1
+	vxor		$out2,$in5,$twk2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk3		# unused tweak
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$tmp,$out2,$twk3	# last block prep for stealing
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_two:
+	vxor		$out0,$in4,$twk0
+	vxor		$out1,$in5,$twk1
+	vxor		$out2,$out2,$out2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_enc5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk2		# unused tweak
+	vxor		$tmp,$out1,$twk2	# last block prep for stealing
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_one:
+	vxor		$out0,$in5,$twk0
+	nop
+Loop_xts_enc1x:
+	vcipher		$out0,$out0,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_enc1x
+
+	add		$inp,$inp,$taillen
+	cmpwi		$taillen,0
+	vcipher		$out0,$out0,v24
+
+	subi		$inp,$inp,16
+	vcipher		$out0,$out0,v25
+
+	lvsr		$inpperm,0,$taillen
+	vcipher		$out0,$out0,v26
+
+	lvx_u		$in0,0,$inp
+	vcipher		$out0,$out0,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vcipher		$out0,$out0,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$twk0,$twk0,v31
+
+	le?vperm	$in0,$in0,$in0,$leperm
+	vcipher		$out0,$out0,v30
+
+	vperm		$in0,$in0,$in0,$inpperm
+	vcipherlast	$out0,$out0,$twk0
+
+	vmr		$twk0,$twk1		# unused tweak
+	vxor		$tmp,$out0,$twk1	# last block prep for stealing
+	le?vperm	$out0,$out0,$out0,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	addi		$out,$out,0x10
+	bne		Lxts_enc6x_steal
+	b		Lxts_enc6x_done
+
+.align	4
+Lxts_enc6x_zero:
+	cmpwi		$taillen,0
+	beq		Lxts_enc6x_done
+
+	add		$inp,$inp,$taillen
+	subi		$inp,$inp,16
+	lvx_u		$in0,0,$inp
+	lvsr		$inpperm,0,$taillen	# $in5 is no more
+	le?vperm	$in0,$in0,$in0,$leperm
+	vperm		$in0,$in0,$in0,$inpperm
+	vxor		$tmp,$tmp,$twk0
+Lxts_enc6x_steal:
+	vxor		$in0,$in0,$twk0
+	vxor		$out0,$out0,$out0
+	vspltisb	$out1,-1
+	vperm		$out0,$out0,$out1,$inpperm
+	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
+
+	subi		r3,$out,17
+	subi		$out,$out,16
+	mtctr		$taillen
+Loop_xts_enc6x_steal:
+	lbzu		r0,1(r3)
+	stb		r0,16(r3)
+	bdnz		Loop_xts_enc6x_steal
+
+	li		$taillen,0
+	mtctr		$rounds
+	b		Loop_xts_enc1x		# one more time...
+
+.align	4
+Lxts_enc6x_done:
+	mtlr		r7
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$seven,r10,$sp		# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,1,0x80,6,6,0
+	.long		0
+
+.align	5
+_aesp8_xts_enc5x:
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		_aesp8_xts_enc5x
+
+	add		$inp,$inp,$taillen
+	cmpwi		$taillen,0
+	vcipher		$out0,$out0,v24
+	vcipher		$out1,$out1,v24
+	vcipher		$out2,$out2,v24
+	vcipher		$out3,$out3,v24
+	vcipher		$out4,$out4,v24
+
+	subi		$inp,$inp,16
+	vcipher		$out0,$out0,v25
+	vcipher		$out1,$out1,v25
+	vcipher		$out2,$out2,v25
+	vcipher		$out3,$out3,v25
+	vcipher		$out4,$out4,v25
+	 vxor		$twk0,$twk0,v31
+
+	vcipher		$out0,$out0,v26
+	lvsr		$inpperm,r0,$taillen	# $in5 is no more
+	vcipher		$out1,$out1,v26
+	vcipher		$out2,$out2,v26
+	vcipher		$out3,$out3,v26
+	vcipher		$out4,$out4,v26
+	 vxor		$in1,$twk1,v31
+
+	vcipher		$out0,$out0,v27
+	lvx_u		$in0,0,$inp
+	vcipher		$out1,$out1,v27
+	vcipher		$out2,$out2,v27
+	vcipher		$out3,$out3,v27
+	vcipher		$out4,$out4,v27
+	 vxor		$in2,$twk2,v31
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vcipher		$out0,$out0,v28
+	vcipher		$out1,$out1,v28
+	vcipher		$out2,$out2,v28
+	vcipher		$out3,$out3,v28
+	vcipher		$out4,$out4,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vxor		$in3,$twk3,v31
+
+	vcipher		$out0,$out0,v29
+	le?vperm	$in0,$in0,$in0,$leperm
+	vcipher		$out1,$out1,v29
+	vcipher		$out2,$out2,v29
+	vcipher		$out3,$out3,v29
+	vcipher		$out4,$out4,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$in4,$twk4,v31
+
+	vcipher		$out0,$out0,v30
+	vperm		$in0,$in0,$in0,$inpperm
+	vcipher		$out1,$out1,v30
+	vcipher		$out2,$out2,v30
+	vcipher		$out3,$out3,v30
+	vcipher		$out4,$out4,v30
+
+	vcipherlast	$out0,$out0,$twk0
+	vcipherlast	$out1,$out1,$in1
+	vcipherlast	$out2,$out2,$in2
+	vcipherlast	$out3,$out3,$in3
+	vcipherlast	$out4,$out4,$in4
+	blr
+        .long   	0
+        .byte   	0,12,0x14,0,0,0,0,0
+
+.align	5
+_aesp8_xts_decrypt6x:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		r0
+	li		r7,`$FRAME+8*16+15`
+	li		r8,`$FRAME+8*16+31`
+	$PUSH		r0,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	stvx		v20,r7,$sp		# ABI says so
+	addi		r7,r7,32
+	stvx		v21,r8,$sp
+	addi		r8,r8,32
+	stvx		v22,r7,$sp
+	addi		r7,r7,32
+	stvx		v23,r8,$sp
+	addi		r8,r8,32
+	stvx		v24,r7,$sp
+	addi		r7,r7,32
+	stvx		v25,r8,$sp
+	addi		r8,r8,32
+	stvx		v26,r7,$sp
+	addi		r7,r7,32
+	stvx		v27,r8,$sp
+	addi		r8,r8,32
+	stvx		v28,r7,$sp
+	addi		r7,r7,32
+	stvx		v29,r8,$sp
+	addi		r8,r8,32
+	stvx		v30,r7,$sp
+	stvx		v31,r8,$sp
+	mr		r7,r0
+	li		r0,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	mtspr		256,r0
+
+	subi		$rounds,$rounds,3	# -4 in total
+
+	lvx		$rndkey0,$x00,$key1	# load key schedule
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	lvx		v31,$x00,$key1
+	?vperm		$rndkey0,$rndkey0,v30,$keyperm
+	addi		$key_,$sp,$FRAME+15
+	mtctr		$rounds
+
+Load_xts_dec_key:
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v30,$x10,$key1
+	addi		$key1,$key1,0x20
+	stvx		v24,$x00,$key_		# off-load round[1]
+	?vperm		v25,v31,v30,$keyperm
+	lvx		v31,$x00,$key1
+	stvx		v25,$x10,$key_		# off-load round[2]
+	addi		$key_,$key_,0x20
+	bdnz		Load_xts_dec_key
+
+	lvx		v26,$x10,$key1
+	?vperm		v24,v30,v31,$keyperm
+	lvx		v27,$x20,$key1
+	stvx		v24,$x00,$key_		# off-load round[3]
+	?vperm		v25,v31,v26,$keyperm
+	lvx		v28,$x30,$key1
+	stvx		v25,$x10,$key_		# off-load round[4]
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	?vperm		v26,v26,v27,$keyperm
+	lvx		v29,$x40,$key1
+	?vperm		v27,v27,v28,$keyperm
+	lvx		v30,$x50,$key1
+	?vperm		v28,v28,v29,$keyperm
+	lvx		v31,$x60,$key1
+	?vperm		v29,v29,v30,$keyperm
+	lvx		$twk5,$x70,$key1	# borrow $twk5
+	?vperm		v30,v30,v31,$keyperm
+	lvx		v24,$x00,$key_		# pre-load round[1]
+	?vperm		v31,v31,$twk5,$keyperm
+	lvx		v25,$x10,$key_		# pre-load round[2]
+
+	 vperm		$in0,$inout,$inptail,$inpperm
+	 subi		$inp,$inp,31		# undo "caller"
+	vxor		$twk0,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out0,$in0,$twk0
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in1,$x10,$inp
+	vxor		$twk1,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in1,$in1,$in1,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out1,$in1,$twk1
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in2,$x20,$inp
+	 andi.		$taillen,$len,15
+	vxor		$twk2,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in2,$in2,$in2,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out2,$in2,$twk2
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in3,$x30,$inp
+	 sub		$len,$len,$taillen
+	vxor		$twk3,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in3,$in3,$in3,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out3,$in3,$twk3
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in4,$x40,$inp
+	 subi		$len,$len,0x60
+	vxor		$twk4,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in4,$in4,$in4,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out4,$in4,$twk4
+	vxor		$tweak,$tweak,$tmp
+
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	vxor		$twk5,$tweak,$rndkey0
+	vsrab		$tmp,$tweak,$seven	# next tweak value
+	vaddubm		$tweak,$tweak,$tweak
+	vsldoi		$tmp,$tmp,$tmp,15
+	 le?vperm	$in5,$in5,$in5,$leperm
+	vand		$tmp,$tmp,$eighty7
+	 vxor		$out5,$in5,$twk5
+	vxor		$tweak,$tweak,$tmp
+
+	vxor		v31,v31,$rndkey0
+	mtctr		$rounds
+	b		Loop_xts_dec6x
+
+.align	5
+Loop_xts_dec6x:
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_dec6x
+
+	subic		$len,$len,96		# $len-=96
+	 vxor		$in0,$twk0,v31		# xor with last round key
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk0,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vncipher	$out4,$out4,v24
+	vncipher	$out5,$out5,v24
+
+	subfe.		r0,r0,r0		# borrow?-1:0
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	 vxor		$tweak,$tweak,$tmp
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	 vxor		$in1,$twk1,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk1,$tweak,$rndkey0
+	vncipher	$out4,$out4,v25
+	vncipher	$out5,$out5,v25
+
+	and		r0,r0,$len
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vncipher	$out0,$out0,v26
+	vncipher	$out1,$out1,v26
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	 vxor		$tweak,$tweak,$tmp
+	vncipher	$out4,$out4,v26
+	vncipher	$out5,$out5,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such
+						# way that at exit from the
+						# loop inX-in5 are loaded
+						# with last "words"
+	 vxor		$in2,$twk2,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk2,$tweak,$rndkey0
+	 vaddubm	$tweak,$tweak,$tweak
+	vncipher	$out0,$out0,v27
+	vncipher	$out1,$out1,v27
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out4,$out4,v27
+	vncipher	$out5,$out5,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	 vxor		$tweak,$tweak,$tmp
+	vncipher	$out0,$out0,v28
+	vncipher	$out1,$out1,v28
+	 vxor		$in3,$twk3,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk3,$tweak,$rndkey0
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vncipher	$out4,$out4,v28
+	vncipher	$out5,$out5,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vand		$tmp,$tmp,$eighty7
+
+	vncipher	$out0,$out0,v29
+	vncipher	$out1,$out1,v29
+	 vxor		$tweak,$tweak,$tmp
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	 vxor		$in4,$twk4,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk4,$tweak,$rndkey0
+	vncipher	$out4,$out4,v29
+	vncipher	$out5,$out5,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+
+	vncipher	$out0,$out0,v30
+	vncipher	$out1,$out1,v30
+	 vand		$tmp,$tmp,$eighty7
+	vncipher	$out2,$out2,v30
+	vncipher	$out3,$out3,v30
+	 vxor		$tweak,$tweak,$tmp
+	vncipher	$out4,$out4,v30
+	vncipher	$out5,$out5,v30
+	 vxor		$in5,$twk5,v31
+	 vsrab		$tmp,$tweak,$seven	# next tweak value
+	 vxor		$twk5,$tweak,$rndkey0
+
+	vncipherlast	$out0,$out0,$in0
+	 lvx_u		$in0,$x00,$inp		# load next input block
+	 vaddubm	$tweak,$tweak,$tweak
+	 vsldoi		$tmp,$tmp,$tmp,15
+	vncipherlast	$out1,$out1,$in1
+	 lvx_u		$in1,$x10,$inp
+	vncipherlast	$out2,$out2,$in2
+	 le?vperm	$in0,$in0,$in0,$leperm
+	 lvx_u		$in2,$x20,$inp
+	 vand		$tmp,$tmp,$eighty7
+	vncipherlast	$out3,$out3,$in3
+	 le?vperm	$in1,$in1,$in1,$leperm
+	 lvx_u		$in3,$x30,$inp
+	vncipherlast	$out4,$out4,$in4
+	 le?vperm	$in2,$in2,$in2,$leperm
+	 lvx_u		$in4,$x40,$inp
+	 vxor		$tweak,$tweak,$tmp
+	vncipherlast	$out5,$out5,$in5
+	 le?vperm	$in3,$in3,$in3,$leperm
+	 lvx_u		$in5,$x50,$inp
+	 addi		$inp,$inp,0x60
+	 le?vperm	$in4,$in4,$in4,$leperm
+	 le?vperm	$in5,$in5,$in5,$leperm
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	 vxor		$out0,$in0,$twk0
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	 vxor		$out1,$in1,$twk1
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	 vxor		$out2,$in2,$twk2
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	 vxor		$out3,$in3,$twk3
+	le?vperm	$out5,$out5,$out5,$leperm
+	stvx_u		$out4,$x40,$out
+	 vxor		$out4,$in4,$twk4
+	stvx_u		$out5,$x50,$out
+	 vxor		$out5,$in5,$twk5
+	addi		$out,$out,0x60
+
+	mtctr		$rounds
+	beq		Loop_xts_dec6x		# did $len-=96 borrow?
+
+	addic.		$len,$len,0x60
+	beq		Lxts_dec6x_zero
+	cmpwi		$len,0x20
+	blt		Lxts_dec6x_one
+	nop
+	beq		Lxts_dec6x_two
+	cmpwi		$len,0x40
+	blt		Lxts_dec6x_three
+	nop
+	beq		Lxts_dec6x_four
+
+Lxts_dec6x_five:
+	vxor		$out0,$in1,$twk0
+	vxor		$out1,$in2,$twk1
+	vxor		$out2,$in3,$twk2
+	vxor		$out3,$in4,$twk3
+	vxor		$out4,$in5,$twk4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk5		# unused tweak
+	vxor		$twk1,$tweak,$rndkey0
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk1
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	le?vperm	$out4,$out4,$out4,$leperm
+	stvx_u		$out3,$x30,$out
+	stvx_u		$out4,$x40,$out
+	addi		$out,$out,0x50
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_four:
+	vxor		$out0,$in2,$twk0
+	vxor		$out1,$in3,$twk1
+	vxor		$out2,$in4,$twk2
+	vxor		$out3,$in5,$twk3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk4		# unused tweak
+	vmr		$twk1,$twk5
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk5
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	le?vperm	$out3,$out3,$out3,$leperm
+	stvx_u		$out2,$x20,$out
+	stvx_u		$out3,$x30,$out
+	addi		$out,$out,0x40
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_three:
+	vxor		$out0,$in3,$twk0
+	vxor		$out1,$in4,$twk1
+	vxor		$out2,$in5,$twk2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk3		# unused tweak
+	vmr		$twk1,$twk4
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk4
+	le?vperm	$out2,$out2,$out2,$leperm
+	stvx_u		$out1,$x10,$out
+	stvx_u		$out2,$x20,$out
+	addi		$out,$out,0x30
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_two:
+	vxor		$out0,$in4,$twk0
+	vxor		$out1,$in5,$twk1
+	vxor		$out2,$out2,$out2
+	vxor		$out3,$out3,$out3
+	vxor		$out4,$out4,$out4
+
+	bl		_aesp8_xts_dec5x
+
+	le?vperm	$out0,$out0,$out0,$leperm
+	vmr		$twk0,$twk2		# unused tweak
+	vmr		$twk1,$twk3
+	le?vperm	$out1,$out1,$out1,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	vxor		$out0,$in0,$twk3
+	stvx_u		$out1,$x10,$out
+	addi		$out,$out,0x20
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_one:
+	vxor		$out0,$in5,$twk0
+	nop
+Loop_xts_dec1x:
+	vncipher	$out0,$out0,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Loop_xts_dec1x
+
+	subi		r0,$taillen,1
+	vncipher	$out0,$out0,v24
+
+	andi.		r0,r0,16
+	cmpwi		$taillen,0
+	vncipher	$out0,$out0,v25
+
+	sub		$inp,$inp,r0
+	vncipher	$out0,$out0,v26
+
+	lvx_u		$in0,0,$inp
+	vncipher	$out0,$out0,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vncipher	$out0,$out0,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$twk0,$twk0,v31
+
+	le?vperm	$in0,$in0,$in0,$leperm
+	vncipher	$out0,$out0,v30
+
+	mtctr		$rounds
+	vncipherlast	$out0,$out0,$twk0
+
+	vmr		$twk0,$twk1		# unused tweak
+	vmr		$twk1,$twk2
+	le?vperm	$out0,$out0,$out0,$leperm
+	stvx_u		$out0,$x00,$out		# store output
+	addi		$out,$out,0x10
+	vxor		$out0,$in0,$twk2
+	bne		Lxts_dec6x_steal
+	b		Lxts_dec6x_done
+
+.align	4
+Lxts_dec6x_zero:
+	cmpwi		$taillen,0
+	beq		Lxts_dec6x_done
+
+	lvx_u		$in0,0,$inp
+	le?vperm	$in0,$in0,$in0,$leperm
+	vxor		$out0,$in0,$twk1
+Lxts_dec6x_steal:
+	vncipher	$out0,$out0,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		Lxts_dec6x_steal
+
+	add		$inp,$inp,$taillen
+	vncipher	$out0,$out0,v24
+
+	cmpwi		$taillen,0
+	vncipher	$out0,$out0,v25
+
+	lvx_u		$in0,0,$inp
+	vncipher	$out0,$out0,v26
+
+	lvsr		$inpperm,0,$taillen	# $in5 is no more
+	vncipher	$out0,$out0,v27
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+
+	vncipher	$out0,$out0,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$twk1,$twk1,v31
+
+	le?vperm	$in0,$in0,$in0,$leperm
+	vncipher	$out0,$out0,v30
+
+	vperm		$in0,$in0,$in0,$inpperm
+	vncipherlast	$tmp,$out0,$twk1
+
+	le?vperm	$out0,$tmp,$tmp,$leperm
+	le?stvx_u	$out0,0,$out
+	be?stvx_u	$tmp,0,$out
+
+	vxor		$out0,$out0,$out0
+	vspltisb	$out1,-1
+	vperm		$out0,$out0,$out1,$inpperm
+	vsel		$out0,$in0,$tmp,$out0
+	vxor		$out0,$out0,$twk0
+
+	subi		r3,$out,1
+	mtctr		$taillen
+Loop_xts_dec6x_steal:
+	lbzu		r0,1(r3)
+	stb		r0,16(r3)
+	bdnz		Loop_xts_dec6x_steal
+
+	li		$taillen,0
+	mtctr		$rounds
+	b		Loop_xts_dec1x		# one more time...
+
+.align	4
+Lxts_dec6x_done:
+	mtlr		r7
+	li		r10,`$FRAME+15`
+	li		r11,`$FRAME+31`
+	stvx		$seven,r10,$sp		# wipe copies of round keys
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+	stvx		$seven,r10,$sp
+	addi		r10,r10,32
+	stvx		$seven,r11,$sp
+	addi		r11,r11,32
+
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,0x04,1,0x80,6,6,0
+	.long		0
+
+.align	5
+_aesp8_xts_dec5x:
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+	lvx		v24,$x20,$key_		# round[3]
+	addi		$key_,$key_,0x20
+
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	lvx		v25,$x10,$key_		# round[4]
+	bdnz		_aesp8_xts_dec5x
+
+	subi		r0,$taillen,1
+	vncipher	$out0,$out0,v24
+	vncipher	$out1,$out1,v24
+	vncipher	$out2,$out2,v24
+	vncipher	$out3,$out3,v24
+	vncipher	$out4,$out4,v24
+
+	andi.		r0,r0,16
+	cmpwi		$taillen,0
+	vncipher	$out0,$out0,v25
+	vncipher	$out1,$out1,v25
+	vncipher	$out2,$out2,v25
+	vncipher	$out3,$out3,v25
+	vncipher	$out4,$out4,v25
+	 vxor		$twk0,$twk0,v31
+
+	sub		$inp,$inp,r0
+	vncipher	$out0,$out0,v26
+	vncipher	$out1,$out1,v26
+	vncipher	$out2,$out2,v26
+	vncipher	$out3,$out3,v26
+	vncipher	$out4,$out4,v26
+	 vxor		$in1,$twk1,v31
+
+	vncipher	$out0,$out0,v27
+	lvx_u		$in0,0,$inp
+	vncipher	$out1,$out1,v27
+	vncipher	$out2,$out2,v27
+	vncipher	$out3,$out3,v27
+	vncipher	$out4,$out4,v27
+	 vxor		$in2,$twk2,v31
+
+	addi		$key_,$sp,$FRAME+15	# rewind $key_
+	vncipher	$out0,$out0,v28
+	vncipher	$out1,$out1,v28
+	vncipher	$out2,$out2,v28
+	vncipher	$out3,$out3,v28
+	vncipher	$out4,$out4,v28
+	lvx		v24,$x00,$key_		# re-pre-load round[1]
+	 vxor		$in3,$twk3,v31
+
+	vncipher	$out0,$out0,v29
+	le?vperm	$in0,$in0,$in0,$leperm
+	vncipher	$out1,$out1,v29
+	vncipher	$out2,$out2,v29
+	vncipher	$out3,$out3,v29
+	vncipher	$out4,$out4,v29
+	lvx		v25,$x10,$key_		# re-pre-load round[2]
+	 vxor		$in4,$twk4,v31
+
+	vncipher	$out0,$out0,v30
+	vncipher	$out1,$out1,v30
+	vncipher	$out2,$out2,v30
+	vncipher	$out3,$out3,v30
+	vncipher	$out4,$out4,v30
+
+	vncipherlast	$out0,$out0,$twk0
+	vncipherlast	$out1,$out1,$in1
+	vncipherlast	$out2,$out2,$in2
+	vncipherlast	$out3,$out3,$in3
+	vncipherlast	$out4,$out4,$in4
+	mtctr		$rounds
+	blr
+        .long   	0
+        .byte   	0,12,0x14,0,0,0,0,0
+___
+}}	}}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+	# constants table endian-specific conversion
+	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+	    my $conv=$3;
+	    my @bytes=();
+
+	    # convert to endian-agnostic format
+	    if ($1 eq "long") {
+	      foreach (split(/,\s*/,$2)) {
+		my $l = /^0/?oct:int;
+		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+	      }
+	    } else {
+		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+	    }
+
+	    # little-endian conversion
+	    if ($flavour =~ /le$/o) {
+		SWITCH: for($conv)  {
+		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
+		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; 
+		}
+	    }
+
+	    #emit
+	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+	    next;
+	}
+	$consts=0 if (m/Lconsts:/o);	# end of table
+
+	# instructions prefixed with '?' are endian-specific and need
+	# to be adjusted accordingly...
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o	or
+	    s/\?lvsr/lvsl/o	or
+	    s/\?lvsl/lvsr/o	or
+	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+	} else {			# big-endian
+	    s/le\?/#le#/o	or
+	    s/be\?//o		or
+	    s/\?([a-z]+)/$1/o;
+	}
+
+        print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
new file mode 100644
index 0000000000..104f417c85
--- /dev/null
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -0,0 +1,968 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for ARMv8 AES instructions. The
+# module is endian-agnostic in sense that it supports both big- and
+# little-endian cases. As does it support both 32- and 64-bit modes
+# of operation. Latter is achieved by limiting amount of utilized
+# registers to 16, which implies additional NEON load and integer
+# instructions. This has no effect on mighty Apple A7, where results
+# are literally equal to the theoretical estimates based on AES
+# instruction latencies and issue rates. On Cortex-A53, an in-order
+# execution core, this costs up to 10-15%, which is partially
+# compensated by implementing dedicated code path for 128-bit
+# CBC encrypt case. On Cortex-A57 parallelizable mode performance
+# seems to be limited by sheer amount of NEON instructions...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+#		CBC enc		CBC dec		CTR
+# Apple A7	2.39		1.20		1.20
+# Cortex-A53	2.45		1.87		1.94
+# Cortex-A57	3.64		1.34		1.32
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
+# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
+# maintain both 32- and 64-bit codes within single module and
+# transliterate common code to either flavour with regex vodoo.
+#
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align	5
+.Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.globl	${prefix}_set_encrypt_key
+.type	${prefix}_set_encrypt_key,%function
+.align	5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+___
+$code.=<<___;
+	mov	$ptr,#-1
+	cmp	$inp,#0
+	b.eq	.Lenc_key_abort
+	cmp	$out,#0
+	b.eq	.Lenc_key_abort
+	mov	$ptr,#-2
+	cmp	$bits,#128
+	b.lt	.Lenc_key_abort
+	cmp	$bits,#256
+	b.gt	.Lenc_key_abort
+	tst	$bits,#0x3f
+	b.ne	.Lenc_key_abort
+
+	adr	$ptr,.Lrcon
+	cmp	$bits,#192
+
+	veor	$zero,$zero,$zero
+	vld1.8	{$in0},[$inp],#16
+	mov	$bits,#8		// reuse $bits
+	vld1.32	{$rcon,$mask},[$ptr],#32
+
+	b.lt	.Loop128
+	b.eq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	vtbl.8	$key,{$in0},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in0},[$out],#16
+	aese	$key,$zero
+	subs	$bits,$bits,#1
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in0,$in0,$key
+	b.ne	.Loop128
+
+	vld1.32	{$rcon},[$ptr]
+
+	vtbl.8	$key,{$in0},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in0},[$out],#16
+	aese	$key,$zero
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in0,$in0,$key
+
+	vtbl.8	$key,{$in0},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in0},[$out],#16
+	aese	$key,$zero
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	veor	$in0,$in0,$key
+	vst1.32	{$in0},[$out]
+	add	$out,$out,#0x50
+
+	mov	$rounds,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	vld1.8	{$in1},[$inp],#8
+	vmov.i8	$key,#8			// borrow $key
+	vst1.32	{$in0},[$out],#16
+	vsub.i8	$mask,$mask,$key	// adjust the mask
+
+.Loop192:
+	vtbl.8	$key,{$in1},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in1},[$out],#8
+	aese	$key,$zero
+	subs	$bits,$bits,#1
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+
+	vdup.32	$tmp,${in0}[3]
+	veor	$tmp,$tmp,$in1
+	 veor	$key,$key,$rcon
+	vext.8	$in1,$zero,$in1,#12
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in1,$in1,$tmp
+	veor	$in0,$in0,$key
+	veor	$in1,$in1,$key
+	vst1.32	{$in0},[$out],#16
+	b.ne	.Loop192
+
+	mov	$rounds,#12
+	add	$out,$out,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	vld1.8	{$in1},[$inp]
+	mov	$bits,#7
+	mov	$rounds,#14
+	vst1.32	{$in0},[$out],#16
+
+.Loop256:
+	vtbl.8	$key,{$in1},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in1},[$out],#16
+	aese	$key,$zero
+	subs	$bits,$bits,#1
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in0,$in0,$key
+	vst1.32	{$in0},[$out],#16
+	b.eq	.Ldone
+
+	vdup.32	$key,${in0}[3]		// just splat
+	vext.8	$tmp,$zero,$in1,#12
+	aese	$key,$zero
+
+	veor	$in1,$in1,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in1,$in1,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in1,$in1,$tmp
+
+	veor	$in1,$in1,$key
+	b	.Loop256
+
+.Ldone:
+	str	$rounds,[$out]
+	mov	$ptr,#0
+
+.Lenc_key_abort:
+	mov	x0,$ptr			// return value
+	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
+	ret
+.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl	${prefix}_set_decrypt_key
+.type	${prefix}_set_decrypt_key,%function
+.align	5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+___
+$code.=<<___	if ($flavour !~ /64/);
+	stmdb	sp!,{r4,lr}
+___
+$code.=<<___;
+	bl	.Lenc_key
+
+	cmp	x0,#0
+	b.ne	.Ldec_key_abort
+
+	sub	$out,$out,#240		// restore original $out
+	mov	x4,#-16
+	add	$inp,$out,x12,lsl#4	// end of key schedule
+
+	vld1.32	{v0.16b},[$out]
+	vld1.32	{v1.16b},[$inp]
+	vst1.32	{v0.16b},[$inp],x4
+	vst1.32	{v1.16b},[$out],#16
+
+.Loop_imc:
+	vld1.32	{v0.16b},[$out]
+	vld1.32	{v1.16b},[$inp]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	vst1.32	{v0.16b},[$inp],x4
+	vst1.32	{v1.16b},[$out],#16
+	cmp	$inp,$out
+	b.hi	.Loop_imc
+
+	vld1.32	{v0.16b},[$out]
+	aesimc	v0.16b,v0.16b
+	vst1.32	{v0.16b},[$inp]
+
+	eor	x0,x0,x0		// return value
+.Ldec_key_abort:
+___
+$code.=<<___	if ($flavour !~ /64/);
+	ldmia	sp!,{r4,pc}
+___
+$code.=<<___	if ($flavour =~ /64/);
+	ldp	x29,x30,[sp],#16
+	ret
+___
+$code.=<<___;
+.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
+
+$code.=<<___;
+.globl	${prefix}_${dir}crypt
+.type	${prefix}_${dir}crypt,%function
+.align	5
+${prefix}_${dir}crypt:
+	ldr	$rounds,[$key,#240]
+	vld1.32	{$rndkey0},[$key],#16
+	vld1.8	{$inout},[$inp]
+	sub	$rounds,$rounds,#2
+	vld1.32	{$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+	aes$e	$inout,$rndkey0
+	vld1.32	{$rndkey0},[$key],#16
+	aes$mc	$inout,$inout
+	subs	$rounds,$rounds,#2
+	aes$e	$inout,$rndkey1
+	vld1.32	{$rndkey1},[$key],#16
+	aes$mc	$inout,$inout
+	b.gt	.Loop_${dir}c
+
+	aes$e	$inout,$rndkey0
+	vld1.32	{$rndkey0},[$key]
+	aes$mc	$inout,$inout
+	aes$e	$inout,$rndkey1
+	veor	$inout,$inout,$rndkey0
+
+	vst1.8	{$inout},[$out]
+	ret
+.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q8-q15	preloaded key schedule
+
+$code.=<<___;
+.globl	${prefix}_cbc_encrypt
+.type	${prefix}_cbc_encrypt,%function
+.align	5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+___
+$code.=<<___	if ($flavour !~ /64/);
+	mov	ip,sp
+	stmdb	sp!,{r4-r8,lr}
+	vstmdb	sp!,{d8-d15}            @ ABI specification says so
+	ldmia	ip,{r4-r5}		@ load remaining args
+___
+$code.=<<___;
+	subs	$len,$len,#16
+	mov	$step,#16
+	b.lo	.Lcbc_abort
+	cclr	$step,eq
+
+	cmp	$enc,#0			// en- or decrypting?
+	ldr	$rounds,[$key,#240]
+	and	$len,$len,#-16
+	vld1.8	{$ivec},[$ivp]
+	vld1.8	{$dat},[$inp],$step
+
+	vld1.32	{q8-q9},[$key]		// load key schedule...
+	sub	$rounds,$rounds,#6
+	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
+	sub	$rounds,$rounds,#2
+	vld1.32	{q10-q11},[$key_],#32
+	vld1.32	{q12-q13},[$key_],#32
+	vld1.32	{q14-q15},[$key_],#32
+	vld1.32	{$rndlast},[$key_]
+
+	add	$key_,$key,#32
+	mov	$cnt,$rounds
+	b.eq	.Lcbc_dec
+
+	cmp	$rounds,#2
+	veor	$dat,$dat,$ivec
+	veor	$rndzero_n_last,q8,$rndlast
+	b.eq	.Lcbc_enc128
+
+.Loop_cbc_enc:
+	aese	$dat,q8
+	vld1.32	{q8},[$key_],#16
+	aesmc	$dat,$dat
+	subs	$cnt,$cnt,#2
+	aese	$dat,q9
+	vld1.32	{q9},[$key_],#16
+	aesmc	$dat,$dat
+	b.gt	.Loop_cbc_enc
+
+	aese	$dat,q8
+	aesmc	$dat,$dat
+	 subs	$len,$len,#16
+	aese	$dat,q9
+	aesmc	$dat,$dat
+	 cclr	$step,eq
+	aese	$dat,q10
+	aesmc	$dat,$dat
+	 add	$key_,$key,#16
+	aese	$dat,q11
+	aesmc	$dat,$dat
+	 vld1.8	{q8},[$inp],$step
+	aese	$dat,q12
+	aesmc	$dat,$dat
+	 veor	q8,q8,$rndzero_n_last
+	aese	$dat,q13
+	aesmc	$dat,$dat
+	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
+	aese	$dat,q14
+	aesmc	$dat,$dat
+	aese	$dat,q15
+
+	 mov	$cnt,$rounds
+	veor	$ivec,$dat,$rndlast
+	vst1.8	{$ivec},[$out],#16
+	b.hs	.Loop_cbc_enc
+
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	vld1.32	{$in0-$in1},[$key_]
+	aese	$dat,q8
+	aesmc	$dat,$dat
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+	aese	$dat,q8
+	aesmc	$dat,$dat
+	 vst1.8	{$ivec},[$out],#16
+.Lenter_cbc_enc128:
+	aese	$dat,q9
+	aesmc	$dat,$dat
+	 subs	$len,$len,#16
+	aese	$dat,$in0
+	aesmc	$dat,$dat
+	 cclr	$step,eq
+	aese	$dat,$in1
+	aesmc	$dat,$dat
+	aese	$dat,q10
+	aesmc	$dat,$dat
+	aese	$dat,q11
+	aesmc	$dat,$dat
+	 vld1.8	{q8},[$inp],$step
+	aese	$dat,q12
+	aesmc	$dat,$dat
+	aese	$dat,q13
+	aesmc	$dat,$dat
+	aese	$dat,q14
+	aesmc	$dat,$dat
+	 veor	q8,q8,$rndzero_n_last
+	aese	$dat,q15
+	veor	$ivec,$dat,$rndlast
+	b.hs	.Loop_cbc_enc128
+
+	vst1.8	{$ivec},[$out],#16
+	b	.Lcbc_done
+___
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+$code.=<<___;
+.align	5
+.Lcbc_dec:
+	vld1.8	{$dat2},[$inp],#16
+	subs	$len,$len,#32		// bias
+	add	$cnt,$rounds,#2
+	vorr	$in1,$dat,$dat
+	vorr	$dat1,$dat,$dat
+	vorr	$in2,$dat2,$dat2
+	b.lo	.Lcbc_dec_tail
+
+	vorr	$dat1,$dat2,$dat2
+	vld1.8	{$dat2},[$inp],#16
+	vorr	$in0,$dat,$dat
+	vorr	$in1,$dat1,$dat1
+	vorr	$in2,$dat2,$dat2
+
+.Loop3x_cbc_dec:
+	aesd	$dat0,q8
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	vld1.32	{q8},[$key_],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	subs	$cnt,$cnt,#2
+	aesd	$dat0,q9
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	vld1.32	{q9},[$key_],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	b.gt	.Loop3x_cbc_dec
+
+	aesd	$dat0,q8
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	 veor	$tmp0,$ivec,$rndlast
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 veor	$tmp1,$in0,$rndlast
+	aesd	$dat0,q9
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	 veor	$tmp2,$in1,$rndlast
+	 subs	$len,$len,#0x30
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 vorr	$ivec,$in2,$in2
+	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
+	aesd	$dat0,q12
+	aesd	$dat1,q12
+	aesd	$dat2,q12
+	 add	$inp,$inp,x6		// $inp is adjusted in such way that
+					// at exit from the loop $dat1-$dat2
+					// are loaded with last "words"
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 mov	$key_,$key
+	aesd	$dat0,q13
+	aesd	$dat1,q13
+	aesd	$dat2,q13
+	 vld1.8	{$in0},[$inp],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 vld1.8	{$in1},[$inp],#16
+	aesd	$dat0,q14
+	aesd	$dat1,q14
+	aesd	$dat2,q14
+	 vld1.8	{$in2},[$inp],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
+	aesd	$dat0,q15
+	aesd	$dat1,q15
+	aesd	$dat2,q15
+
+	 add	$cnt,$rounds,#2
+	veor	$tmp0,$tmp0,$dat0
+	veor	$tmp1,$tmp1,$dat1
+	veor	$dat2,$dat2,$tmp2
+	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
+	 vorr	$dat0,$in0,$in0
+	vst1.8	{$tmp0},[$out],#16
+	 vorr	$dat1,$in1,$in1
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$dat2},[$out],#16
+	 vorr	$dat2,$in2,$in2
+	b.hs	.Loop3x_cbc_dec
+
+	cmn	$len,#0x30
+	b.eq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	vld1.32	{q8},[$key_],#16
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	subs	$cnt,$cnt,#2
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	vld1.32	{q9},[$key_],#16
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	b.gt	.Lcbc_dec_tail
+
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	aesd	$dat1,q12
+	aesd	$dat2,q12
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 cmn	$len,#0x20
+	aesd	$dat1,q13
+	aesd	$dat2,q13
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 veor	$tmp1,$ivec,$rndlast
+	aesd	$dat1,q14
+	aesd	$dat2,q14
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 veor	$tmp2,$in1,$rndlast
+	aesd	$dat1,q15
+	aesd	$dat2,q15
+	b.eq	.Lcbc_dec_one
+	veor	$tmp1,$tmp1,$dat1
+	veor	$tmp2,$tmp2,$dat2
+	 vorr	$ivec,$in2,$in2
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$tmp2},[$out],#16
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	veor	$tmp1,$tmp1,$dat2
+	 vorr	$ivec,$in2,$in2
+	vst1.8	{$tmp1},[$out],#16
+
+.Lcbc_done:
+	vst1.8	{$ivec},[$ivp]
+.Lcbc_abort:
+___
+}
+$code.=<<___	if ($flavour !~ /64/);
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r8,pc}
+___
+$code.=<<___	if ($flavour =~ /64/);
+	ldr	x29,[sp],#16
+	ret
+___
+$code.=<<___;
+.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_)=("w5","w6","x7");
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my $step="x12";		# aliases with $tctr2
+
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15	preloaded key schedule
+
+$code.=<<___;
+.globl	${prefix}_ctr32_encrypt_blocks
+.type	${prefix}_ctr32_encrypt_blocks,%function
+.align	5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+___
+$code.=<<___	if ($flavour !~ /64/);
+	mov		ip,sp
+	stmdb		sp!,{r4-r10,lr}
+	vstmdb		sp!,{d8-d15}            @ ABI specification says so
+	ldr		r4, [ip]		@ load remaining arg
+___
+$code.=<<___;
+	ldr		$rounds,[$key,#240]
+
+	ldr		$ctr, [$ivp, #12]
+	vld1.32		{$dat0},[$ivp]
+
+	vld1.32		{q8-q9},[$key]		// load key schedule...
+	sub		$rounds,$rounds,#4
+	mov		$step,#16
+	cmp		$len,#2
+	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
+	sub		$rounds,$rounds,#2
+	vld1.32		{q12-q13},[$key_],#32
+	vld1.32		{q14-q15},[$key_],#32
+	vld1.32		{$rndlast},[$key_]
+	add		$key_,$key,#32
+	mov		$cnt,$rounds
+	cclr		$step,lo
+#ifndef __ARMEB__
+	rev		$ctr, $ctr
+#endif
+	vorr		$dat1,$dat0,$dat0
+	add		$tctr1, $ctr, #1
+	vorr		$dat2,$dat0,$dat0
+	add		$ctr, $ctr, #2
+	vorr		$ivec,$dat0,$dat0
+	rev		$tctr1, $tctr1
+	vmov.32		${dat1}[3],$tctr1
+	b.ls		.Lctr32_tail
+	rev		$tctr2, $ctr
+	sub		$len,$len,#3		// bias
+	vmov.32		${dat2}[3],$tctr2
+	b		.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+	aese		$dat0,q8
+	aese		$dat1,q8
+	aese		$dat2,q8
+	vld1.32		{q8},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aesmc		$dat2,$dat2
+	subs		$cnt,$cnt,#2
+	aese		$dat0,q9
+	aese		$dat1,q9
+	aese		$dat2,q9
+	vld1.32		{q9},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aesmc		$dat2,$dat2
+	b.gt		.Loop3x_ctr32
+
+	aese		$dat0,q8
+	aese		$dat1,q8
+	aese		$dat2,q8
+	 mov		$key_,$key
+	aesmc		$tmp0,$dat0
+	 vld1.8		{$in0},[$inp],#16
+	aesmc		$tmp1,$dat1
+	aesmc		$dat2,$dat2
+	 vorr		$dat0,$ivec,$ivec
+	aese		$tmp0,q9
+	 vld1.8		{$in1},[$inp],#16
+	aese		$tmp1,q9
+	aese		$dat2,q9
+	 vorr		$dat1,$ivec,$ivec
+	aesmc		$tmp0,$tmp0
+	 vld1.8		{$in2},[$inp],#16
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$dat2
+	 vorr		$dat2,$ivec,$ivec
+	 add		$tctr0,$ctr,#1
+	aese		$tmp0,q12
+	aese		$tmp1,q12
+	aese		$tmp2,q12
+	 veor		$in0,$in0,$rndlast
+	 add		$tctr1,$ctr,#2
+	aesmc		$tmp0,$tmp0
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$tmp2
+	 veor		$in1,$in1,$rndlast
+	 add		$ctr,$ctr,#3
+	aese		$tmp0,q13
+	aese		$tmp1,q13
+	aese		$tmp2,q13
+	 veor		$in2,$in2,$rndlast
+	 rev		$tctr0,$tctr0
+	aesmc		$tmp0,$tmp0
+	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$tmp2
+	 vmov.32	${dat0}[3], $tctr0
+	 rev		$tctr1,$tctr1
+	aese		$tmp0,q14
+	aese		$tmp1,q14
+	aese		$tmp2,q14
+	 vmov.32	${dat1}[3], $tctr1
+	 rev		$tctr2,$ctr
+	aesmc		$tmp0,$tmp0
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$tmp2
+	 vmov.32	${dat2}[3], $tctr2
+	 subs		$len,$len,#3
+	aese		$tmp0,q15
+	aese		$tmp1,q15
+	aese		$tmp2,q15
+
+	 mov		$cnt,$rounds
+	veor		$in0,$in0,$tmp0
+	veor		$in1,$in1,$tmp1
+	veor		$in2,$in2,$tmp2
+	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
+	vst1.8		{$in0},[$out],#16
+	vst1.8		{$in1},[$out],#16
+	vst1.8		{$in2},[$out],#16
+	b.hs		.Loop3x_ctr32
+
+	adds		$len,$len,#3
+	b.eq		.Lctr32_done
+	cmp		$len,#1
+	mov		$step,#16
+	cclr		$step,eq
+
+.Lctr32_tail:
+	aese		$dat0,q8
+	aese		$dat1,q8
+	vld1.32		{q8},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	subs		$cnt,$cnt,#2
+	aese		$dat0,q9
+	aese		$dat1,q9
+	vld1.32		{q9},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	b.gt		.Lctr32_tail
+
+	aese		$dat0,q8
+	aese		$dat1,q8
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aese		$dat0,q9
+	aese		$dat1,q9
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	 vld1.8		{$in0},[$inp],$step
+	aese		$dat0,q12
+	aese		$dat1,q12
+	 vld1.8		{$in1},[$inp]
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aese		$dat0,q13
+	aese		$dat1,q13
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aese		$dat0,q14
+	aese		$dat1,q14
+	 veor		$in0,$in0,$rndlast
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	 veor		$in1,$in1,$rndlast
+	aese		$dat0,q15
+	aese		$dat1,q15
+
+	cmp		$len,#1
+	veor		$in0,$in0,$dat0
+	veor		$in1,$in1,$dat1
+	vst1.8		{$in0},[$out],#16
+	b.eq		.Lctr32_done
+	vst1.8		{$in1},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___	if ($flavour !~ /64/);
+	vldmia		sp!,{d8-d15}
+	ldmia		sp!,{r4-r10,pc}
+___
+$code.=<<___	if ($flavour =~ /64/);
+	ldr		x29,[sp],#16
+	ret
+___
+$code.=<<___;
+.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
+########################################
+if ($flavour =~ /64/) {			######## 64-bit code
+    my %opcode = (
+	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
+	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
+
+    local *unaes = sub {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5),
+			$mnemonic,$arg;
+    };
+
+    foreach(split("\n",$code)) {
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;			# old->new style commentary
+
+	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
+	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
+	s/vext\.8/ext/o		or
+	s/vrev32\.8/rev32/o	or
+	s/vtst\.8/cmtst/o	or
+	s/vshr/ushr/o		or
+	s/^(\s+)v/$1/o		or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	# fix up remainig legacy suffixes
+	s/\.[ui]?8//o;
+	m/\],#8/o and s/\.16b/\.8b/go;
+	s/\.[ui]?32//o and s/\.16b/\.4s/go;
+	s/\.[ui]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    my %opcode = (
+	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
+	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
+
+    local *unaes = sub {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<1) |(($2&8)<<2);
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    };
+
+    sub unvtbl {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
+		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;	
+    }
+
+    sub unvdup32 {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;	
+    }
+
+    sub unvmov32 {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;	
+    }
+
+    foreach(split("\n",$code)) {
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+	s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remainig new-style suffixes
+	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
+	s/\],#[0-9]+/]!/o;
+
+	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
+	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
+	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
+	s/^(\s+)b\./$1b/o				or
+	s/^(\s+)mov\./$1mov/o				or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+	print $_,"\n";
+    }
+}
+
+close STDOUT;
diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
new file mode 100644
index 0000000000..bfec664198
--- /dev/null
+++ b/crypto/arm64cpuid.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+.arch	armv8-a+crypto
+
+.align	5
+.globl	_armv7_neon_probe
+.type	_armv7_neon_probe,%function
+_armv7_neon_probe:
+	orr	v15.16b, v15.16b, v15.16b
+	ret
+.size	_armv7_neon_probe,.-_armv7_neon_probe
+
+.globl	_armv7_tick
+.type	_armv7_tick,%function
+_armv7_tick:
+#ifdef	__APPLE__
+	mrs	x0, CNTPCT_EL0
+#else
+	mrs	x0, CNTVCT_EL0
+#endif
+	ret
+.size	_armv7_tick,.-_armv7_tick
+
+.globl	_armv8_aes_probe
+.type	_armv8_aes_probe,%function
+_armv8_aes_probe:
+	aese	v0.16b, v0.16b
+	ret
+.size	_armv8_aes_probe,.-_armv8_aes_probe
+
+.globl	_armv8_sha1_probe
+.type	_armv8_sha1_probe,%function
+_armv8_sha1_probe:
+	sha1h	s0, s0
+	ret
+.size	_armv8_sha1_probe,.-_armv8_sha1_probe
+
+.globl	_armv8_sha256_probe
+.type	_armv8_sha256_probe,%function
+_armv8_sha256_probe:
+	sha256su0	v0.4s, v0.4s
+	ret
+.size	_armv8_sha256_probe,.-_armv8_sha256_probe
+.globl	_armv8_pmull_probe
+.type	_armv8_pmull_probe,%function
+_armv8_pmull_probe:
+	pmull	v0.1q, v0.1d, v0.1d
+	ret
+.size	_armv8_pmull_probe,.-_armv8_pmull_probe
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index a50c366976..7a377758eb 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -10,13 +10,22 @@
 #   define __ARMEL__
 #  endif
 # elif defined(__GNUC__)
+#  if  defined(__aarch64__)
+#   define __ARM_ARCH__ 8
+#   if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+#    define __ARMEB__
+#   else
+#    define __ARMEL__
+#   endif
   /*
    * Why doesn't gcc define __ARM_ARCH__? Instead it defines
    * bunch of below macros. See all_architectires[] table in
    * gcc/config/arm/arm.c. On a side note it defines
    * __ARMEL__/__ARMEB__ for little-/big-endian.
    */
-#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
+#  elif defined(__ARM_ARCH_8A__)
+#    define __ARM_ARCH__ 8
+#  elif	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
 	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
 	defined(__ARM_ARCH_7EM__)
 #   define __ARM_ARCH__ 7
@@ -42,10 +51,14 @@
 
 #if !__ASSEMBLER__
 extern unsigned int OPENSSL_armcap_P;
+#endif
                                      
 #define ARMV7_NEON      (1<<0)
 #define ARMV7_TICK      (1<<1)
-#endif
+#define ARMV8_AES       (1<<2)
+#define ARMV8_SHA1      (1<<3)
+#define ARMV8_SHA256    (1<<4)
+#define ARMV8_PMULL     (1<<5)
 
 #endif
 #endif
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 8dbd741087..2579389ffd 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -20,6 +20,10 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
  */
 void _armv7_neon_probe(void);
 unsigned int _armv7_tick(void);
+void _armv8_aes_probe(void);
+void _armv8_sha1_probe(void);
+void _armv8_sha256_probe(void);
+void _armv8_pmull_probe(void);
 
 unsigned int OPENSSL_rdtsc(void)
 	{
@@ -30,7 +34,7 @@ unsigned int OPENSSL_rdtsc(void)
 	}
 
 #if defined(__GNUC__) && __GNUC__>=2
-void OPENSSL_cpuid_setup(void) __attribute__((constructor))
+void OPENSSL_cpuid_setup(void) __attribute__((constructor));
 #endif
 void OPENSSL_cpuid_setup(void)
 	{
@@ -68,6 +72,28 @@ void OPENSSL_cpuid_setup(void)
 		{
 		_armv7_neon_probe();
 		OPENSSL_armcap_P |= ARMV7_NEON;
+#ifdef __aarch64__
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			_armv8_pmull_probe();
+			OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES;
+			}
+		else if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			_armv8_aes_probe();
+			OPENSSL_armcap_P |= ARMV8_AES;
+			}
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			_armv8_sha1_probe();
+			OPENSSL_armcap_P |= ARMV8_SHA1;
+			}
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			_armv8_sha256_probe();
+			OPENSSL_armcap_P |= ARMV8_SHA256;
+			}
+#endif
 		}
 	if (sigsetjmp(ill_jmp,1) == 0)
 		{
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index c9102ca2a5..2d618deaa4 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -44,7 +44,7 @@ OPENSSL_atomic_add:
 	bne	.Lspin
 
 	ldr	r2,[r4]
-	add	r2,r5
+	add	r2,r2,r5
 	str	r2,[r4]
 	str	r0,[r6]		@ release spinlock
 	ldmia	sp!,{r4-r6,lr}
@@ -59,26 +59,26 @@ OPENSSL_atomic_add:
 OPENSSL_cleanse:
 	eor	ip,ip,ip
 	cmp	r1,#7
-	subhs	r1,#4
+	subhs	r1,r1,#4
 	bhs	.Lot
 	cmp	r1,#0
 	beq	.Lcleanse_done
 .Little:
 	strb	ip,[r0],#1
-	subs	r1,#1
+	subs	r1,r1,#1
 	bhi	.Little
 	b	.Lcleanse_done
 
 .Lot:	tst	r0,#3
 	beq	.Laligned
 	strb	ip,[r0],#1
-	sub	r1,#1
+	sub	r1,r1,#1
 	b	.Lot
 .Laligned:
 	str	ip,[r0],#4
-	subs	r1,#4
+	subs	r1,r1,#4
 	bhs	.Laligned
-	adds	r1,#4
+	adds	r1,r1,#4
 	bne	.Little
 .Lcleanse_done:
 	tst	lr,#1
diff --git a/crypto/armv4cpuid_ios.S b/crypto/armv4cpuid_ios.S
new file mode 100644
index 0000000000..cce9a7902b
--- /dev/null
+++ b/crypto/armv4cpuid_ios.S
@@ -0,0 +1,210 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.align	5
+.globl	_OPENSSL_atomic_add
+
+_OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+Ladd:	ldrex	r2,[r0]
+	add	r3,r2,r1
+	strex	r2,r3,[r0]
+	cmp	r2,#0
+	bne	Ladd
+	mov	r0,r3
+	bx	lr
+#else
+	stmdb	sp!,{r4,r5,r6,lr}
+	ldr	r2,Lspinlock
+	adr	r3,Lspinlock
+	mov	r4,r0
+	mov	r5,r1
+	add	r6,r3,r2	@ &spinlock
+	b	.+8
+Lspin:	bl	sched_yield
+	mov	r0,#-1
+	swp	r0,r0,[r6]
+	cmp	r0,#0
+	bne	Lspin
+
+	ldr	r2,[r4]
+	add	r2,r2,r5
+	str	r2,[r4]
+	str	r0,[r6]		@ release spinlock
+	ldmia	sp!,{r4,r5,r6,lr}
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+.globl	_OPENSSL_cleanse
+
+_OPENSSL_cleanse:
+	eor	ip,ip,ip
+	cmp	r1,#7
+	subhs	r1,r1,#4
+	bhs	Lot
+	cmp	r1,#0
+	beq	Lcleanse_done
+Little:
+	strb	ip,[r0],#1
+	subs	r1,r1,#1
+	bhi	Little
+	b	Lcleanse_done
+
+Lot:	tst	r0,#3
+	beq	Laligned
+	strb	ip,[r0],#1
+	sub	r1,r1,#1
+	b	Lot
+Laligned:
+	str	ip,[r0],#4
+	subs	r1,r1,#4
+	bhs	Laligned
+	adds	r1,r1,#4
+	bne	Little
+Lcleanse_done:
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+
+
+.align	5
+.globl	__armv7_neon_probe
+
+__armv7_neon_probe:
+	vorr	q0,q0,q0
+	bx	lr
+
+
+.globl	__armv7_tick
+
+__armv7_tick:
+#ifdef	__APPLE__
+	mrrc	p15,0,r0,r1,c14		@ CNTPCT
+#else
+	mrrc	p15,1,r0,r1,c14		@ CNTVCT
+#endif
+	bx	lr
+
+
+.globl	__armv8_aes_probe
+
+__armv8_aes_probe:
+.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
+	bx	lr
+
+
+.globl	__armv8_sha1_probe
+
+__armv8_sha1_probe:
+.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
+	bx	lr
+
+
+.globl	__armv8_sha256_probe
+
+__armv8_sha256_probe:
+.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
+	bx	lr
+
+.globl	__armv8_pmull_probe
+
+__armv8_pmull_probe:
+.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
+	bx	lr
+
+.globl	_OPENSSL_wipe_cpu
+
+_OPENSSL_wipe_cpu:
+	ldr	r0,LOPENSSL_armcap
+	adr	r1,LOPENSSL_armcap
+	ldr	r0,[r1,r0]
+#ifdef	__APPLE__
+	ldr	r0,[r0]
+#endif
+	eor	r2,r2,r2
+	eor	r3,r3,r3
+	eor	ip,ip,ip
+	tst	r0,#1
+	beq	Lwipe_done
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+	veor	q3, q3, q3
+	veor	q8, q8, q8
+	veor	q9, q9, q9
+	veor	q10, q10, q10
+	veor	q11, q11, q11
+	veor	q12, q12, q12
+	veor	q13, q13, q13
+	veor	q14, q14, q14
+	veor	q15, q15, q15
+Lwipe_done:
+	mov	r0,sp
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+.globl	_OPENSSL_instrument_bus
+
+_OPENSSL_instrument_bus:
+	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+.globl	_OPENSSL_instrument_bus2
+
+_OPENSSL_instrument_bus2:
+	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+.align	5
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.
+#if __ARM_ARCH__>=6
+.align	5
+#else
+Lspinlock:
+.word	atomic_add_spinlock-Lspinlock
+.align	5
+
+.data
+.align	2
+atomic_add_spinlock:
+.word
+#endif
+
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 9928dae872..737659f0db 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -21,8 +21,20 @@
 # runs in even less cycles, ~30, improvement is measurable only on
 # longer keys. One has to optimize code elsewhere to get NEON glow...
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
@@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
 #if __ARM_ARCH__>=7
 	ldr	r12,.LOPENSSL_armcap
 .Lpic:	ldr	r12,[pc,r12]
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
 	tst	r12,#1
 	beq	.Lialu
 
 	veor	$A1,$A1
+#ifdef	__APPLE__
+	vmov	$B1,r3,r3		@ two copies of b1
+#else
 	vmov.32	$B1,r3,r3		@ two copies of b1
+#endif
 	vmov.32	${A1}[0],r1		@ a1
 
 	veor	$A0,$A0
@@ -218,38 +237,38 @@ $code.=<<___;
 	mov	$b,r3			@ $b=b1
 	ldr	r3,[sp,#32]		@ load b0
 	mov	$mask,#7<<2
-	sub	sp,#32			@ allocate tab[8]
+	sub	sp,sp,#32		@ allocate tab[8]
 
 	bl	mul_1x1_ialu		@ a1�b1
 	str	$lo,[$ret,#8]
 	str	$hi,[$ret,#12]
 
-	eor	$b,r3			@ flip b0 and b1
-	 eor	$a,r2			@ flip a0 and a1
-	eor	r3,$b
-	 eor	r2,$a
-	eor	$b,r3
-	 eor	$a,r2
+	eor	$b,$b,r3		@ flip b0 and b1
+	 eor	$a,$a,r2		@ flip a0 and a1
+	eor	r3,r3,$b
+	 eor	r2,r2,$a
+	eor	$b,$b,r3
+	 eor	$a,$a,r2
 	bl	mul_1x1_ialu		@ a0�b0
 	str	$lo,[$ret]
 	str	$hi,[$ret,#4]
 
-	eor	$a,r2
-	eor	$b,r3
+	eor	$a,$a,r2
+	eor	$b,$b,r3
 	bl	mul_1x1_ialu		@ (a1+a0)�(b1+b0)
 ___
 @r=map("r$_",(6..9));
 $code.=<<___;
 	ldmia	$ret,{@r[0]-@r[3]}
-	eor	$lo,$hi
-	eor	$hi,@r[1]
-	eor	$lo,@r[0]
-	eor	$hi,@r[2]
-	eor	$lo,@r[3]
-	eor	$hi,@r[3]
+	eor	$lo,$lo,$hi
+	eor	$hi,$hi,@r[1]
+	eor	$lo,$lo,@r[0]
+	eor	$hi,$hi,@r[2]
+	eor	$lo,$lo,@r[3]
+	eor	$hi,$hi,@r[3]
 	str	$hi,[$ret,#8]
-	eor	$lo,$hi
-	add	sp,#32			@ destroy tab[8]
+	eor	$lo,$lo,$hi
+	add	sp,sp,#32		@ destroy tab[8]
 	str	$lo,[$ret,#4]
 
 #if __ARM_ARCH__>=5
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index f78a8b5f0f..aa00f38c2f 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -23,8 +23,20 @@
 # than 1/2KB. Windows CE port would be trivial, as it's exclusively
 # about decorations, ABI and instruction syntax are identical.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $num="r0";	# starts as num argument, but holds &tp[num-1]
 $ap="r1";
diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm
new file mode 100644
index 0000000000..161547c3b0
--- /dev/null
+++ b/crypto/bn/asm/bn-c64xplus.asm
@@ -0,0 +1,333 @@
+;;====================================================================
+;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+;; project.
+;;
+;; Rights for redistribution and usage in source and binary forms are
+;; granted according to the OpenSSL license. Warranty of any kind is
+;; disclaimed.
+;;====================================================================
+;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
+;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
+;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
+;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
+;;====================================================================
+	.text
+
+	.asg	B3,RA
+	.asg	A4,ARG0
+	.asg	B4,ARG1
+	.asg	A6,ARG2
+	.asg	B6,ARG3
+	.asg	A8,ARG4
+	.asg	B8,ARG5
+	.asg	A4,RET
+	.asg	A15,FP
+	.asg	B14,DP
+	.asg	B15,SP
+
+	.global	_bn_mul_add_words
+_bn_mul_add_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A19		; high part of accumulator
+|| [B0]	MV	ARG0,A2
+|| [B0]	MV	ARG3,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	3
+	LDW	*ARG0++,A7	; rp[i]
+	MPY32U	B7,A3,A17:A16
+	NOP	3		; [2,0] in epilogue
+	ADDU	A16,A7,A21:A20
+	ADDU	A19,A21:A20,A19:A18
+||	MV.S	A17,A23
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*A2++	; rp[i]
+||	ADD	A19,A23,A19
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+
+	.global	_bn_mul_words
+_bn_mul_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A19		; high part of accumulator
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,A7	; ap[i]
+	NOP	4
+	MPY32U	A7,ARG3,A17:A16
+	NOP	4		; [2,0] in epiloque
+	ADDU	A19,A16,A19:A18
+||	MV.S	A17,A21
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*ARG0++	; rp[i]
+||	ADD.L	A19,A21,A19
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+
+	.global	_bn_sqr_words
+_bn_sqr_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	MV	ARG0,B2
+|| [B0]	ADD	4,ARG0,ARG0
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	MPY32U	B7,B7,B1:B0
+	NOP	3		; [2,0] in epilogue
+	STW	B0,*B2++(8)	; rp[2*i]
+	MV	B1,A1
+	SPKERNEL 2,0		; fully overlap BNOP RA,5
+||	STW	A1,*ARG0++(8)	; rp[2*i+1]
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_bn_add_words
+_bn_add_words:
+	.asmfunc
+	MV	ARG3,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A1		; carry flag
+|| [B0]	MV	ARG0,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	ADDU	A7,B7,A9:A8
+	ADDU	A1,A9:A8,A1:A0
+	SPKERNEL 0,0		; fully overlap BNOP RA,5
+||	STW	A0,*A3++	; write result
+||	MV	A1,RET		; keep carry flag in RET
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_bn_sub_words
+_bn_sub_words:
+	.asmfunc
+	MV	ARG3,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A2		; borrow flag
+|| [B0]	MV	ARG0,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	SUBU	B7,A7,A1:A0
+  [A2]	SUB	A1:A0,1,A1:A0
+	SPKERNEL 0,1		; leave slot for "return borrow flag"
+||	STW	A0,*A3++	; write result
+||	AND	1,A1,A2		; pass on borrow flag
+;;====================================================================
+	BNOP	RA,4
+	AND	1,A1,RET	; return borrow flag
+	.endasmfunc
+
+	.global	_bn_div_words
+	.global	__divull
+_bn_div_words:
+	.asmfunc
+	CALLP	__divull,A3	; jump to rts64plus.lib
+||	MV	ARG0,A5
+||	MV	ARG1,ARG0
+||	MV	ARG2,ARG1
+||	ZERO	B5
+	.endasmfunc
+
+;;====================================================================
+;; Not really Comba algorithm, just straightforward NxM... Dedicated
+;; fully unrolled real Comba implementations are asymptotically 2x
+;; faster, but naturally larger undertaking. Purpose of this exercise
+;; was rather to learn to master nested SPLOOPs...
+;;====================================================================
+	.global	_bn_sqr_comba8
+	.global	_bn_mul_comba8
+_bn_sqr_comba8:
+	MV	ARG1,ARG2
+_bn_mul_comba8:
+	.asmfunc
+	MVK	8,B0		; N, RILC
+||	MVK	8,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; N-2, initial ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+sploopNxM?:			; for best performance arrange M<=N
+   [A0]	SPLOOPD	2		; 2*n+10
+||	MVC	B1,ILC
+||	ADDAW	B4,B0,B5
+||	ZERO	B7
+||	LDW	*A5++,A9	; pre-fetch ap[1]
+||	ZERO	A1
+||	SUB	A0,1,A0
+;;====================================================================
+;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
+;; This is because of Advisory 15 from TI publication SPRZ247I.
+	LDW	*ARG2++,A7	; bp[i]
+	NOP	3
+   [A1]	LDW	*B5++,B7	; rp[i]
+	MPY32U	A7,B6,B17:B16
+	NOP	3
+	ADDU	B16,B7,B21:B20
+	ADDU	B19,B21:B20,B19:B18
+||	MV.S	B17,B23
+	SPKERNEL
+||	STW	B18,*B4++	; rp[i]
+||	ADD.S	B19,B23,B19
+;;====================================================================
+outer?:				; m*2*(n+1)+10
+	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
+	SPMASKR
+||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
+	MVD	A9,B6		; move through .M unit(*)
+   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
+	SUBAW	B5,B2,B5	; rewind rp to rp[1]
+	MVK	1,A1
+   [A0]	BNOP.S1	outer?,4
+|| [A0]	SUB.L	A0,1,A0
+	STW	B19,*B4--[B2]	; rewind rp tp rp[1]
+||	ZERO.S	B19		; high part of accumulator
+;; end of outer?
+	BNOP	RA,5		; return
+	.endasmfunc
+;; (*)	It should be noted that B6 is used as input to MPY32U in
+;;	chronologically next cycle in *preceding* SPLOOP iteration.
+;;	Normally such arrangement would require DINT, but at this
+;;	point SPLOOP is draining and interrupts are disabled
+;;	implicitly.
+
+	.global	_bn_sqr_comba4
+	.global	_bn_mul_comba4
+_bn_sqr_comba4:
+	MV	ARG1,ARG2
+_bn_mul_comba4:
+	.asmfunc
+	.if	0
+	BNOP	sploopNxM?,3
+	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
+	;; because of read-after-write penalties, it's rather
+	;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
+	MVK	4,B0		; N, RILC
+||	MVK	4,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; first ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+	.else
+	;; This alternative is exercise in fully unrolled Comba
+	;; algorithm implementation that operates at n*(n+1)+12, or
+	;; as little as 32 cycles...
+	LDW	*ARG1[0],B16	; a[0]
+||	LDW	*ARG2[0],A16	; b[0]
+	LDW	*ARG1[1],B17	; a[1]
+||	LDW	*ARG2[1],A17	; b[1]
+	LDW	*ARG1[2],B18	; a[2]
+||	LDW	*ARG2[2],A18	; b[2]
+	LDW	*ARG1[3],B19	; a[3]
+||	LDW	*ARG2[3],A19	; b[3]
+	NOP
+	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
+	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
+	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
+	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
+	STW	A0,*ARG0[0]
+||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
+	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
+||	ADDU	A22,A1,A1:A0
+	MV	A23,B0
+||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B0,B1:B0
+||	STW	A0,*ARG0[1]
+||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
+||	ADDU	A26,A1,A9:A8
+	ADDU	A27,B1,B9:B8
+||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
+||	ADDU	A28,A9:A8,A9:A8
+	ADDU	A29,B9:B8,B9:B8
+||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
+||	ADDU	A30,A9:A8,A9:A8
+	ADDU	A31,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[2]
+||	ADDU	A20,A9,A1:A0
+	ADDU	A21,B9,B1:B0
+||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
+||	ADDU	A22,A1:A0,A1:A0
+	ADDU	A23,B1:B0,B1:B0
+||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B1:B0,B1:B0
+||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
+||	ADDU	A26,A1:A0,A1:A0
+	ADDU	A27,B1:B0,B1:B0
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[3]
+||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
+||	ADDU	A20,A1,A9:A8
+	ADDU	A21,B1,B9:B8
+||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
+||	ADDU	A22,A9:A8,A9:A8
+	ADDU	A23,B9:B8,B9:B8
+||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
+||	ADDU	A24,A9:A8,A9:A8
+	ADDU	A25,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[4]
+||	ADDU	A26,A9,A1:A0
+	ADDU	A27,B9,B1:B0
+||	ADDU	A28,A1:A0,A1:A0
+	ADDU	A29,B1:B0,B1:B0
+||	BNOP	RA
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[5]
+||	ADDU	A30,A1,A9:A8
+	ADD	A31,B1,B8
+	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
+	ADD	B8,A9,A9
+||	STW	A8,*ARG0[6]
+	STW	A9,*ARG0[7]
+	.endif
+	.endasmfunc
diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl
new file mode 100644
index 0000000000..cef83942c9
--- /dev/null
+++ b/crypto/bn/asm/c64xplus-gf2m.pl
@@ -0,0 +1,146 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# February 2012
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... The subroutine runs in 37 cycles, which is
+# 4.5x faster than compiler-generated code. Though comparison is
+# totally unfair, because this module utilizes Galois Field Multiply
+# instruction.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
+
+($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
+($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
+($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
+($A,$B)=($Alo,$B_1);
+$xFF="B1";
+
+sub mul_1x1_upper {
+my ($A,$B)=@_;
+$code.=<<___;
+	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	AND	$B,$xFF,$B_0
+||	SHRU	$B,24,$B_3
+	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
+||	EXTU	$A,16,16,$Alo
+
+	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
+||	XORMPY	$Ahi,$B_2,$Ahix2
+||	EXTU	$B,16,24,$B_1
+	XORMPY	$Alo,$B_0,$Alox0
+||	XORMPY	$Ahi,$B_0,$Ahix0
+	XORMPY	$Alo,$B_3,$Alox3
+||	XORMPY	$Ahi,$B_3,$Ahix3
+	XORMPY	$Alo,$B_1,$Alox1
+||	XORMPY	$Ahi,$B_1,$Ahix1
+___
+}
+sub mul_1x1_merged {
+my ($OUTlo,$OUThi,$A,$B)=@_;
+$code.=<<___;
+	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	 AND	$B,$xFF,$B_0
+||	 SHRU	$B,24,$B_3
+	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
+||	 EXTU	$A,16,16,$Alo
+
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+||	 XORMPY	$Alo,$B_2,$Alox2
+	 XORMPY	$Ahi,$B_2,$Ahix2
+||	 EXTU	$B,16,24,$B_1
+||	 XORMPY	$Alo,$B_0,A1		; $Alox0
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	 XORMPY	$Ahi,$B_0,$Ahix0
+||	 XORMPY	$Alo,$B_3,$Alox3
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	 XORMPY	$Ahi,$B_3,$Ahix3
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+||	 XORMPY	$Alo,$B_1,$Alox1
+||	 XORMPY	$Ahi,$B_1,$Ahix1
+||	 MV	A1,$Alox0
+___
+}
+sub mul_1x1_lower {
+my ($OUTlo,$OUThi)=@_;
+$code.=<<___;
+	;NOP
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+	NOP
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+___
+}
+$code.=<<___;
+	.text
+
+	.global	_bn_GF2m_mul_2x2
+_bn_GF2m_mul_2x2:
+	.asmfunc
+	MVK	0xFF,$xFF
+___
+	&mul_1x1_upper($a0,$b0);		# a0�b0
+$code.=<<___;
+||	MV	$b1,$B
+	MV	$a1,$A
+___
+	&mul_1x1_merged("A28","B28",$A,$B);	# a0�b0/a1�b1
+$code.=<<___;
+||	XOR	$b0,$b1,$B
+	XOR	$a0,$a1,$A
+___
+	&mul_1x1_merged("A31","B31",$A,$B);	# a1�b1/(a0+a1)�(b0+b1)
+$code.=<<___;
+	XOR	A28,A31,A29
+||	XOR	B28,B31,B29			; a0�b0+a1�b1
+___
+	&mul_1x1_lower("A30","B30");		# (a0+a1)�(b0+b1)
+$code.=<<___;
+||	BNOP	B3
+	XOR	A29,A30,A30
+||	XOR	B29,B30,B30			; (a0+a1)�(b0+b1)-a0�b0-a1�b1
+	XOR	B28,A30,A30
+||	STW	A28,*${rp}[0]
+	XOR	B30,A31,A31
+||	STW	A30,*${rp}[1]
+	STW	A31,*${rp}[2]
+	STW	B31,*${rp}[3]
+	.endasmfunc
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
index b944a12b8e..a33cdf4111 100644
--- a/crypto/bn/asm/mips-mont.pl
+++ b/crypto/bn/asm/mips-mont.pl
@@ -46,7 +46,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
 
 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -133,7 +133,7 @@ $code.=<<___;
 	bnez	$at,1f
 	li	$t0,0
 	slt	$at,$num,17	# on in-order CPU
-	bnezl	$at,bn_mul_mont_internal
+	bnez	$at,bn_mul_mont_internal
 	nop
 1:	jr	$ra
 	li	$a0,0
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
index acfd35968e..acafde5e56 100644
--- a/crypto/bn/asm/mips.pl
+++ b/crypto/bn/asm/mips.pl
@@ -48,7 +48,7 @@
 # has to content with 40-85% improvement depending on benchmark and
 # key length, more for longer keys.
 
-$flavour = shift;
+$flavour = shift || "o32";
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
@@ -140,10 +140,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$ta0,$a2,$minus4
-	$LD	$t0,0($a1)
 	beqz	$ta0,.L_bn_mul_add_words_tail
 
 .L_bn_mul_add_words_loop:
+	$LD	$t0,0($a1)
 	$MULTU	$t0,$a3
 	$LD	$t1,0($a0)
 	$LD	$t2,$BNSZ($a1)
@@ -200,10 +200,9 @@ $code.=<<___;
 	$ADDU	$v0,$ta2
 	sltu	$at,$ta3,$at
 	$ST	$ta3,-$BNSZ($a0)
-	$ADDU	$v0,$at
 	.set	noreorder
-	bgtzl	$ta0,.L_bn_mul_add_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$ta0,.L_bn_mul_add_words_loop
+	$ADDU	$v0,$at
 
 	beqz	$a2,.L_bn_mul_add_words_return
 	nop
@@ -267,7 +266,7 @@ ___
 $code.=<<___;
 	jr	$ra
 	move	$a0,$v0
-.end	bn_mul_add_words
+.end	bn_mul_add_words_internal
 
 .align	5
 .globl	bn_mul_words
@@ -300,10 +299,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$ta0,$a2,$minus4
-	$LD	$t0,0($a1)
 	beqz	$ta0,.L_bn_mul_words_tail
 
 .L_bn_mul_words_loop:
+	$LD	$t0,0($a1)
 	$MULTU	$t0,$a3
 	$LD	$t2,$BNSZ($a1)
 	$LD	$ta0,2*$BNSZ($a1)
@@ -341,10 +340,9 @@ $code.=<<___;
 	$ADDU	$v0,$at
 	sltu	$ta3,$v0,$at
 	$ST	$v0,-$BNSZ($a0)
-	$ADDU	$v0,$ta3,$ta2
 	.set	noreorder
-	bgtzl	$ta0,.L_bn_mul_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$ta0,.L_bn_mul_words_loop
+	$ADDU	$v0,$ta3,$ta2
 
 	beqz	$a2,.L_bn_mul_words_return
 	nop
@@ -429,10 +427,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$ta0,$a2,$minus4
-	$LD	$t0,0($a1)
 	beqz	$ta0,.L_bn_sqr_words_tail
 
 .L_bn_sqr_words_loop:
+	$LD	$t0,0($a1)
 	$MULTU	$t0,$t0
 	$LD	$t2,$BNSZ($a1)
 	$LD	$ta0,2*$BNSZ($a1)
@@ -463,11 +461,10 @@ $code.=<<___;
 	mflo	$ta3
 	mfhi	$ta2
 	$ST	$ta3,-2*$BNSZ($a0)
-	$ST	$ta2,-$BNSZ($a0)
 
 	.set	noreorder
-	bgtzl	$ta0,.L_bn_sqr_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$ta0,.L_bn_sqr_words_loop
+	$ST	$ta2,-$BNSZ($a0)
 
 	beqz	$a2,.L_bn_sqr_words_return
 	nop
@@ -547,10 +544,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$at,$a3,$minus4
-	$LD	$t0,0($a1)
 	beqz	$at,.L_bn_add_words_tail
 
 .L_bn_add_words_loop:
+	$LD	$t0,0($a1)
 	$LD	$ta0,0($a2)
 	subu	$a3,4
 	$LD	$t1,$BNSZ($a1)
@@ -589,11 +586,10 @@ $code.=<<___;
 	$ADDU	$t3,$ta3,$v0
 	sltu	$v0,$t3,$ta3
 	$ST	$t3,-$BNSZ($a0)
-	$ADDU	$v0,$t9
 	
 	.set	noreorder
-	bgtzl	$at,.L_bn_add_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$at,.L_bn_add_words_loop
+	$ADDU	$v0,$t9
 
 	beqz	$a3,.L_bn_add_words_return
 	nop
@@ -679,10 +675,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$at,$a3,$minus4
-	$LD	$t0,0($a1)
 	beqz	$at,.L_bn_sub_words_tail
 
 .L_bn_sub_words_loop:
+	$LD	$t0,0($a1)
 	$LD	$ta0,0($a2)
 	subu	$a3,4
 	$LD	$t1,$BNSZ($a1)
@@ -722,11 +718,10 @@ $code.=<<___;
 	$SUBU	$t3,$ta3,$v0
 	sgtu	$v0,$t3,$ta3
 	$ST	$t3,-$BNSZ($a0)
-	$ADDU	$v0,$t9
 
 	.set	noreorder
-	bgtzl	$at,.L_bn_sub_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$at,.L_bn_sub_words_loop
+	$ADDU	$v0,$t9
 
 	beqz	$a3,.L_bn_sub_words_return
 	nop
@@ -778,7 +773,7 @@ ___
 $code.=<<___;
 	jr	$ra
 	move	$a0,$v0
-.end	bn_sub_words
+.end	bn_sub_words_internal
 
 .align 5
 .globl	bn_div_3_words
@@ -819,7 +814,7 @@ ___
 $code.=<<___;
 	.set	reorder
 	move	$ta3,$ra
-	bal	bn_div_words
+	bal	bn_div_words_internal
 	move	$ra,$ta3
 	$MULTU	$ta2,$v0
 	$LD	$t2,-2*$BNSZ($a3)
@@ -840,8 +835,9 @@ $code.=<<___;
 	sltu	$ta0,$a1,$a2
 	or	$t8,$ta0
 	.set	noreorder
-	beqzl	$at,.L_bn_div_3_words_inner_loop
+	beqz	$at,.L_bn_div_3_words_inner_loop
 	$SUBU	$v0,1
+	$ADDU	$v0,1
 	.set	reorder
 .L_bn_div_3_words_inner_loop_done:
 	.set	noreorder
@@ -902,7 +898,8 @@ $code.=<<___;
 	and	$t2,$a0
 	$SRL	$at,$a1,$t1
 	.set	noreorder
-	bnezl	$t2,.+8
+	beqz	$t2,.+12
+	nop
 	break	6		# signal overflow
 	.set	reorder
 	$SLL	$a0,$t9
@@ -917,7 +914,8 @@ $code.=<<___;
 	$SRL	$DH,$a2,4*$BNSZ	# bits
 	sgeu	$at,$a0,$a2
 	.set	noreorder
-	bnezl	$at,.+8
+	beqz	$at,.+12
+	nop
 	$SUBU	$a0,$a2
 	.set	reorder
 
@@ -1874,6 +1872,41 @@ ___
 
 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
 
+sub add_c2 () {
+my ($hi,$lo,$c0,$c1,$c2,
+    $warm,      # !$warm denotes first call with specific sequence of
+                # $c_[XYZ] when there is no Z-carry to accumulate yet;
+    $an,$bn     # these two are arguments for multiplication which
+                # result is used in *next* step [which is why it's
+                # commented as "forward multiplication" below];
+    )=@_;
+$code.=<<___;
+	mflo	$lo
+	mfhi	$hi
+	$ADDU	$c0,$lo
+	sltu	$at,$c0,$lo
+	 $MULTU	$an,$bn			# forward multiplication
+	$ADDU	$c0,$lo
+	$ADDU	$at,$hi
+	sltu	$lo,$c0,$lo
+	$ADDU	$c1,$at
+	$ADDU	$hi,$lo
+___
+$code.=<<___	if (!$warm);
+	sltu	$c2,$c1,$at
+	$ADDU	$c1,$hi
+	sltu	$hi,$c1,$hi
+	$ADDU	$c2,$hi
+___
+$code.=<<___	if ($warm);
+	sltu	$at,$c1,$at
+	$ADDU	$c1,$hi
+	$ADDU	$c2,$at
+	sltu	$hi,$c1,$hi
+	$ADDU	$c2,$hi
+___
+}
+
 $code.=<<___;
 
 .align	5
@@ -1922,21 +1955,10 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_1
 	$ADDU	$c_3,$t_2,$at
 	$ST	$c_2,$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_3,$t_1
@@ -1947,67 +1969,19 @@ $code.=<<___;
 	sltu	$at,$c_1,$t_2
 	$ADDU	$c_2,$at
 	$ST	$c_3,2*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
+$code.=<<___;
 	$ST	$c_1,3*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_2,$t_1
@@ -2018,97 +1992,23 @@ $code.=<<___;
 	sltu	$at,$c_3,$t_2
 	$ADDU	$c_1,$at
 	$ST	$c_2,4*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
-	$ADDU	$c_2,$at
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
+$code.=<<___;
 	$ST	$c_3,5*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_1,$t_1
@@ -2119,112 +2019,25 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_2
 	$ADDU	$c_3,$at
 	$ST	$c_1,6*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
+$code.=<<___;
 	$ST	$c_2,7*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_3,$t_1
@@ -2235,82 +2048,21 @@ $code.=<<___;
 	sltu	$at,$c_1,$t_2
 	$ADDU	$c_2,$at
 	$ST	$c_3,8*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
+$code.=<<___;
 	$ST	$c_1,9*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_2,$t_1
@@ -2321,52 +2073,17 @@ $code.=<<___;
 	sltu	$at,$c_3,$t_2
 	$ADDU	$c_1,$at
 	$ST	$c_2,10*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
+$code.=<<___;
 	$ST	$c_3,11*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_1,$t_1
@@ -2377,21 +2094,10 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_2
 	$ADDU	$c_3,$at
 	$ST	$c_1,12*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
+$code.=<<___;
 	$ST	$c_2,13*$BNSZ($a0)
 
 	mflo	$t_1
@@ -2459,21 +2165,10 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_1
 	$ADDU	$c_3,$t_2,$at
 	$ST	$c_2,$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_3,$t_1
@@ -2484,52 +2179,17 @@ $code.=<<___;
 	sltu	$at,$c_1,$t_2
 	$ADDU	$c_2,$at
 	$ST	$c_3,2*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_1,$a_2);		# mul_add_c2(a2[1],b[2],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
+$code.=<<___;
 	$ST	$c_1,3*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_2,$t_1
@@ -2540,21 +2200,10 @@ $code.=<<___;
 	sltu	$at,$c_3,$t_2
 	$ADDU	$c_1,$at
 	$ST	$c_2,4*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
 	$ST	$c_3,5*$BNSZ($a0)
 
 	mflo	$t_1
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
index f9b6992ccc..420f4d5807 100644
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -191,7 +191,7 @@ L1st:
 
 	addi	$j,$j,$BNSZ	; j++
 	addi	$tp,$tp,$BNSZ	; tp++
-	bdnz-	L1st
+	bdnz	L1st
 ;L1st
 	addc	$lo0,$alo,$hi0
 	addze	$hi0,$ahi
@@ -253,7 +253,7 @@ Linner:
 	addze	$hi1,$hi1
 	$ST	$lo1,0($tp)	; tp[j-1]
 	addi	$tp,$tp,$BNSZ	; tp++
-	bdnz-	Linner
+	bdnz	Linner
 ;Linner
 	$LD	$tj,$BNSZ($tp)	; tp[j]
 	addc	$lo0,$alo,$hi0
@@ -276,7 +276,7 @@ Linner:
 	slwi	$tj,$num,`log($BNSZ)/log(2)`
 	$UCMP	$i,$tj
 	addi	$i,$i,$BNSZ
-	ble-	Louter
+	ble	Louter
 
 	addi	$num,$num,2	; restore $num
 	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
@@ -289,7 +289,7 @@ Lsub:	$LDX	$tj,$tp,$j
 	subfe	$aj,$nj,$tj	; tp[j]-np[j]
 	$STX	$aj,$rp,$j
 	addi	$j,$j,$BNSZ
-	bdnz-	Lsub
+	bdnz	Lsub
 
 	li	$j,0
 	mtctr	$num
@@ -304,7 +304,7 @@ Lcopy:				; copy or in-place refresh
 	$STX	$tj,$rp,$j
 	$STX	$j,$tp,$j	; zap at once
 	addi	$j,$j,$BNSZ
-	bdnz-	Lcopy
+	bdnz	Lcopy
 
 	$POP	$tj,0($sp)
 	li	r3,1
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index aaf669a5b3..5e22cd8fc6 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -952,7 +952,7 @@ $data=<<EOF;
 	addze	r11,r0
 					#mul_add_c(a[3],b[2],c3,c1,c2);
 	$LD	r6,`3*$BNSZ`(r4)
-	$LD	r7,`2*$BNSZ`(r4)
+	$LD	r7,`2*$BNSZ`(r5)
 	$UMULL	r8,r6,r7
 	$UMULH	r9,r6,r7
 	addc	r12,r8,r12
@@ -1552,7 +1552,7 @@ Lppcasm_sub_mainloop:
 				# if carry = 1 this is r7-r8. Else it
 				# is r7-r8 -1 as we need.
 	$STU	r6,$BNSZ(r3)
-	bdnz-	Lppcasm_sub_mainloop
+	bdnz	Lppcasm_sub_mainloop
 Lppcasm_sub_adios:	
 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
 	andi.	r3,r3,1         # keep only last bit.
@@ -1598,7 +1598,7 @@ Lppcasm_add_mainloop:
 	$LDU	r8,$BNSZ(r5)
 	adde	r8,r7,r8
 	$STU	r8,$BNSZ(r3)
-	bdnz-	Lppcasm_add_mainloop
+	bdnz	Lppcasm_add_mainloop
 Lppcasm_add_adios:	
 	addze	r3,r0			#return carry bit.
 	blr
@@ -1755,7 +1755,7 @@ Lppcasm_sqr_mainloop:
 	$UMULH  r8,r6,r6
 	$STU	r7,$BNSZ(r3)
 	$STU	r8,$BNSZ(r3)
-	bdnz-	Lppcasm_sqr_mainloop
+	bdnz	Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:	
 	blr
 	.long	0
@@ -1819,7 +1819,7 @@ Lppcasm_mw_LOOP:
 	
 	addi	r3,r3,`4*$BNSZ`
 	addi	r4,r4,`4*$BNSZ`
-	bdnz-	Lppcasm_mw_LOOP
+	bdnz	Lppcasm_mw_LOOP
 
 Lppcasm_mw_REM:
 	andi.	r5,r5,0x3
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
index a14e769ad0..d565859667 100644
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -561,7 +561,7 @@ $code.=<<___;
 	stfd	$T3b,`$FRAME+56`($sp)
 	 std	$t0,8($tp)		; tp[j-1]
 	 stdu	$t4,16($tp)		; tp[j]
-	bdnz-	L1st
+	bdnz	L1st
 
 	fctid	$dota,$dota
 	fctid	$dotb,$dotb
@@ -856,7 +856,7 @@ $code.=<<___;
 	 addze	$carry,$carry
 	 std	$t3,-16($tp)		; tp[j-1]
 	 std	$t5,-8($tp)		; tp[j]
-	bdnz-	Linner
+	bdnz	Linner
 
 	fctid	$dota,$dota
 	fctid	$dotb,$dotb
@@ -954,7 +954,7 @@ Lsub:	ldx	$t0,$tp,$i
 	stdx	$t0,$rp,$i
 	stdx	$t2,$t6,$i
 	addi	$i,$i,16
-	bdnz-	Lsub
+	bdnz	Lsub
 
 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
@@ -981,7 +981,7 @@ Lcopy:				; copy or in-place refresh
 	stdx	$i,$tp,$i	; zap tp at once
 	stdx	$i,$t4,$i
 	addi	$i,$i,16
-	bdnz-	Lcopy
+	bdnz	Lcopy
 ___
 $code.=<<___ if ($SIZE_T==4);
 	subf	$np,$num,$np	; rewind np
@@ -1014,7 +1014,7 @@ Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
 	stw	$t5,8($rp)
 	stw	$t6,12($rp)
 	stwu	$t7,16($rp)
-	bdnz-	Lsub
+	bdnz	Lsub
 
 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
@@ -1046,7 +1046,7 @@ Lcopy:				; copy or in-place refresh
 	stwu	$t3,16($rp)
 	std	$i,8($tp)	; zap tp at once
 	stdu	$i,16($tp)
-	bdnz-	Lcopy
+	bdnz	Lcopy
 ___
 
 $code.=<<___;
diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index 86bb0429f3..ad98188e81 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@@ -366,6 +366,10 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
 # endif
 #endif /* BN_BITS2 != 64 */
 
+#if defined(_TMS320C6X) && defined(NIST_INT64)
+# undef NIST_INT64     /* compiler bug */
+# pragma diag_suppress 177
+#endif
 
 #define nist_set_192(to, from, a1, a2, a3) \
 	{ \
@@ -1047,6 +1051,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	return 1;
 	}
 
+#ifdef _WIN32_WCE
+/* Workaround for compiler bug under CE */
+#pragma optimize( "", off )
+#endif
+
 #define BN_NIST_521_RSHIFT	(521%BN_BITS2)
 #define BN_NIST_521_LSHIFT	(BN_BITS2-BN_NIST_521_RSHIFT)
 #define BN_NIST_521_TOP_MASK	((BN_ULONG)BN_MASK2>>BN_NIST_521_LSHIFT)
@@ -1113,6 +1122,10 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	return 1;
 	}
 
+#ifdef _WIN32_WCE
+#pragma optimize( "", on )
+#endif
+
 int (*BN_nist_mod_func(const BIGNUM *p))(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
 	{
 	if (BN_ucmp(&_bignum_nist_p_192, p) == 0)
diff --git a/crypto/c64xcpuid.pl b/crypto/c64xcpuid.pl
new file mode 100644
index 0000000000..88fd153b98
--- /dev/null
+++ b/crypto/c64xcpuid.pl
@@ -0,0 +1,326 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	OPENSSL_rdtsc,_OPENSSL_rdtsc
+	.asg	OPENSSL_cleanse,_OPENSSL_cleanse
+	.asg	CRYPTO_memcmp,_CRYPTO_memcmp
+	.asg	OPENSSL_atomic_add,_OPENSSL_atomic_add
+	.asg	OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+	.asg	OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+	.asg	OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+	.endif
+
+	.asg	B3,RA
+	.asg	0x01AC0000,TIMER_BASE	; Timer 2
+
+	.global	_OPENSSL_rdtsc
+_OPENSSL_rdtsc:
+	.asmfunc
+	MVKL	TIMER_BASE,A5
+	MVKH	TIMER_BASE,A5
+	LDW	*A5[0],A2	; load CTL
+	LDW	*A5[2],A4	; load CTN
+	NOP	2
+	.if	.BIG_ENDIAN
+	MVK	0x2c0,A7	; internal clock source, don't hold, go
+||	MVK	-1,A6		; maximum period
+	.else
+	MVK	0x2c0,A6	; internal clock source, don't hold, go
+||	MVK	-1,A7		; maximum period
+	.endif
+  [!A2]	STDW	A7:A6,*A5[0]	; fire it up
+||	BNOP	RA,5
+	.endasmfunc
+
+	.global	_OPENSSL_cleanse
+_OPENSSL_cleanse:
+	.asmfunc
+	ZERO	A3:A2
+||	ZERO	B2
+||	SHRU	B4,3,B0		; is length >= 8
+||	ADD	1,A4,B6
+  [!B0]	BNOP	RA
+|| [B0]	SUB	B0,1,B2
+||	ZERO	A1
+||	ZERO	B1
+   [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	0,B4,A1
+||[!B0]	CMPLT	1,B4,B1
+||	ZERO	B5
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	2,B4,A1
+||[!B0]	CMPLT	3,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	4,B4,A1
+||[!B0]	CMPLT	5,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	6,B4,A1
+   [A1]	STB	A2,*A4++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+
+cleanse_loop?:
+	STNDW	A3:A2,*A4++
+||	SUB	B4,8,B4
+|| [B2]	BDEC	cleanse_loop?,B2
+
+	MV	B4,B0		; remaining bytes
+||	ADD	1,A4,B6
+||	BNOP	RA
+   [B0]	CMPLT	0,B0,A1
+|| [B0]	CMPLT	1,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B0]	CMPLT	2,B0,A1
+|| [B0]	CMPLT	3,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B0]	CMPLT	4,B0,A1
+|| [B0]	CMPLT	5,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B0]	CMPLT	6,B0,A1
+   [A1]	STB	A2,*A4++[2]
+	.endasmfunc
+
+	.if	0
+	.global	_CRYPTO_memcmp
+_CRYPTO_memcmp:
+	.asmfunc
+	MV	A6,B0
+  [!B0]	BNOP	RA
+||[!B0]	ZERO	A4
+|| [B0]	ZERO	A1:A0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+	XOR	A5,B5,A1
+|| [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+
+memcmp_loop?:
+	OR	A1,A0,A0
+||	XOR	A5,B5,A1
+|| [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+
+	BNOP	RA,3
+	ZERO	A4
+  [A0]	MVK	1,A4
+	.endasmfunc
+	.endif
+
+	.global	_OPENSSL_atomic_add
+_OPENSSL_atomic_add:
+	.asmfunc
+	BNOP	atomic_store?	; pre-C64x+ systems are uni-processor, it's
+||	LDW	*A4,B5		; enough to hold interrupts off through
+				; the load-update-store cycle to achieve
+				; atomicity
+	NOP
+	BNOP	RA,3		; and this branch stretches even over store
+	ADD	B4,B5,B5
+atomic_store?:
+	STW	B5,*A4
+||	MV	B5,A4
+	.endasmfunc
+
+	.global	_OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu:
+	.asmfunc
+	ZERO	A0
+||	ZERO	B0
+||	ZERO	A1
+||	ZERO	B1
+	ZERO	A3:A2
+||	MVD	B0,B2
+||	ZERO	A4
+||	ZERO	B4
+||	ZERO	A5
+||	ZERO	B5
+||	BNOP	RA
+	ZERO	A7:A6
+||	ZERO	B7:B6
+||	ZERO	A8
+||	ZERO	B8
+||	ZERO	A9
+||	ZERO	B9
+	ZERO	A17:A16
+||	ZERO	B17:B16
+||	ZERO	A18
+||	ZERO	B18
+||	ZERO	A19
+||	ZERO	B19
+	ZERO	A21:A20
+||	ZERO	B21:B20
+||	ZERO	A22
+||	ZERO	B22
+||	ZERO	A23
+||	ZERO	B23
+	ZERO	A25:A24
+||	ZERO	B25:B24
+||	ZERO	A26
+||	ZERO	B26
+||	ZERO	A27
+||	ZERO	B27
+	ZERO	A29:A28
+||	ZERO	B29:B28
+||	ZERO	A30
+||	ZERO	B30
+||	ZERO	A31
+||	ZERO	B31
+	.endasmfunc
+
+CLFLUSH	.macro	CONTROL,ADDR,LEN
+	B	passthrough?
+||	STW	ADDR,*CONTROL[0]
+	STW	LEN,*CONTROL[1]
+spinlock?:
+	LDW	*CONTROL[1],A0
+	NOP	3
+passthrough?:
+	NOP
+  [A0]	BNOP	spinlock?,5
+	.endm
+
+	.global	_OPENSSL_instrument_bus
+_OPENSSL_instrument_bus:
+	.asmfunc
+	MV	B4,B0			; reassign sizeof(output)
+||	MV	A4,B4			; reassign output
+||	MVK	0x00004030,A3
+||	MVKL	TIMER_BASE,B16
+	MV	B0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+||	MVKH	TIMER_BASE,B16
+	LDW	*B16[2],B8		; collect 1st tick
+||	MVK	0x00004010,A5
+	NOP	4
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4
+bus_loop1?:
+	LDW	*B16[2],B8
+|| [B0]	SUB	B0,1,B0
+	NOP	4
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||	ADDK	4,B4
+|| [B0]	BNOP	bus_loop1?,5
+
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2:
+	.asmfunc
+	MV	A6,B0			; reassign max
+||	MV	B4,A6			; reassing sizeof(output)
+||	MVK	0x00004030,A3
+||	MVKL	TIMER_BASE,B16
+	MV	A4,B4			; reassign output
+||	MVK	0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+||	MVKH	TIMER_BASE,B16
+
+	LDW	*B16[2],B8		; collect 1st tick
+||	MVK	0x00004010,A5
+	NOP	4
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4
+
+	LDW	*B16[2],B8		; collect 1st diff
+	NOP	4
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+||	SUB	B0,1,B0
+bus_loop2?:
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||[!B0]	BNOP	bus_loop2_done?,2
+||	SUB	B0,1,B0
+	LDW	*B16[2],B8
+	NOP	4
+	SUB	B8,B9,B8
+||	MV	B8,B9
+	CMPEQ	B8,B7,B2
+||	MV	B8,B7
+  [!B2]	ADDAW	B4,1,B4
+||[!B2]	ADDK	1,A4
+	CMPEQ	A4,A6,A2
+  [!A2]	BNOP	bus_loop2?,5
+
+bus_loop2_done?:
+	BNOP	RA,5
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".init_array"
+	.else
+	.sect	".pinit"
+	.endif
+	.align	4
+	.long	_OPENSSL_rdtsc		; auto-start timer
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xpluscpuid.pl
new file mode 100644
index 0000000000..067b693d5c
--- /dev/null
+++ b/crypto/c64xpluscpuid.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+#
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+
+	.global	_OPENSSL_rdtsc
+_OPENSSL_rdtsc:
+	.asmfunc
+	B	RA
+	MVC	TSCL,B0
+	MVC	TSCH,B1
+  [!B0]	MVC	B0,TSCL		; start TSC
+	MV	B0,A4
+	MV	B1,A5
+	.endasmfunc
+
+	.global	_OPENSSL_cleanse
+_OPENSSL_cleanse:
+	.asmfunc
+	ZERO	A3:A2
+||	ZERO	B2
+||	SHRU	B4,3,B0		; is length >= 8
+||	ADD	1,A4,B6
+  [!B0]	BNOP	RA
+||	ZERO	A1
+||	ZERO	B1
+   [B0]	MVC	B0,ILC
+||[!B0]	CMPLT	0,B4,A1
+||[!B0]	CMPLT	1,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+||[!B0]	CMPLT	2,B4,A1
+||[!B0]	CMPLT	3,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+||[!B0]	CMPLT	4,B4,A1
+||[!B0]	CMPLT	5,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+||[!B0]	CMPLT	6,B4,A1
+   [A1]	STB	A2,*A4++[2]
+
+	SPLOOP	1
+	STNDW	A3:A2,*A4++
+||	SUB	B4,8,B4
+	SPKERNEL
+
+	MV	B4,B0		; remaining bytes
+||	ADD	1,A4,B6
+||	BNOP	RA
+   [B0]	CMPLT	0,B0,A1
+|| [B0]	CMPLT	1,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+|| [B0]	CMPLT	2,B0,A1
+|| [B0]	CMPLT	3,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+|| [B0]	CMPLT	4,B0,A1
+|| [B0]	CMPLT	5,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+|| [B0]	CMPLT	6,B0,A1
+   [A1]	STB	A2,*A4++[2]
+	.endasmfunc
+
+	.global	_OPENSSL_atomic_add
+_OPENSSL_atomic_add:
+	.asmfunc
+	MV	A4,B0
+atomic_add?:
+	LL	*B0,B5
+	NOP	4
+	ADD	B4,B5,B5
+	SL	B5,*B0
+	CMTL	*B0,B1
+	NOP	4
+  [!B1]	B	atomic_add?
+   [B1]	BNOP	RA,4
+	MV	B5,A4
+	.endasmfunc
+
+	.global	_OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu:
+	.asmfunc
+	ZERO	A0
+||	ZERO	B0
+||	ZERO	A1
+||	ZERO	B1
+	ZERO	A3:A2
+||	MVD	B0,B2
+||	ZERO	A4
+||	ZERO	B4
+||	ZERO	A5
+||	ZERO	B5
+||	BNOP	RA
+	ZERO	A7:A6
+||	ZERO	B7:B6
+||	ZERO	A8
+||	ZERO	B8
+||	ZERO	A9
+||	ZERO	B9
+	ZERO	A17:A16
+||	ZERO	B17:B16
+||	ZERO	A18
+||	ZERO	B18
+||	ZERO	A19
+||	ZERO	B19
+	ZERO	A21:A20
+||	ZERO	B21:B20
+||	ZERO	A22
+||	ZERO	B22
+||	ZERO	A23
+||	ZERO	B23
+	ZERO	A25:A24
+||	ZERO	B25:B24
+||	ZERO	A26
+||	ZERO	B26
+||	ZERO	A27
+||	ZERO	B27
+	ZERO	A29:A28
+||	ZERO	B29:B28
+||	ZERO	A30
+||	ZERO	B30
+||	ZERO	A31
+||	ZERO	B31
+	.endasmfunc
+
+CLFLUSH	.macro	CONTROL,ADDR,LEN
+	B	passthrough?
+||	STW	ADDR,*CONTROL[0]
+	STW	LEN,*CONTROL[1]
+spinlock?:
+	LDW	*CONTROL[1],A0
+	NOP	3
+passthrough?:
+	NOP
+  [A0]	BNOP	spinlock?,5
+	.endm
+
+	.global	_OPENSSL_instrument_bus
+_OPENSSL_instrument_bus:
+	.asmfunc
+	MV	B4,B0			; reassign sizeof(output)
+||	MV	A4,B4			; reassign output
+||	MVK	0x00004030,A3
+	MV	B0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+	MVC	TSCL,B8			; collect 1st tick
+||	MVK	0x00004010,A5
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	NOP	4
+	STW	B5,*B4
+bus_loop1?:
+	MVC	TSCL,B8
+|| [B0]	SUB	B0,1,B0
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||	ADDK	4,B4
+|| [B0]	BNOP	bus_loop1?,5
+
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2:
+	.asmfunc
+	MV	A6,B0			; reassign max
+||	MV	B4,A6			; reassing sizeof(output)
+||	MVK	0x00004030,A3
+	MV	A4,B4			; reassign output
+||	MVK	0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+
+	MVC	TSCL,B8			; collect 1st tick
+||	MVK	0x00004010,A5
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	NOP	4
+	STW	B5,*B4
+
+	MVC	TSCL,B8			; collect 1st diff
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+||	SUB	B0,1,B0
+bus_loop2?:
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||[!B0]	BNOP	bus_loop2_done?,2
+||	SUB	B0,1,B0
+	MVC	TSCL,B8
+	SUB	B8,B9,B8
+||	MV	B8,B9
+	CMPEQ	B8,B7,B2
+||	MV	B8,B7
+  [!B2]	ADDAW	B4,1,B4
+||[!B2]	ADDK	1,A4
+	CMPEQ	A4,A6,A2
+  [!A2]	BNOP	bus_loop2?,5
+
+bus_loop2_done?:
+	BNOP	RA,5
+	.endasmfunc
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c
index 5807e30ddd..5ff0fa7028 100644
--- a/crypto/cmac/cmac.c
+++ b/crypto/cmac/cmac.c
@@ -77,19 +77,17 @@ struct CMAC_CTX_st
 
 /* Make temporary keys K1 and K2 */
 
-static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+static void make_kn(unsigned char *k1, const unsigned char *l, int bl)
 	{
 	int i;
+	unsigned char c = l[0], carry = c>>7, cnext;
+
 	/* Shift block to left, including carry */
-	for (i = 0; i < bl; i++)
-		{
-		k1[i] = l[i] << 1;
-		if (i < bl - 1 && l[i + 1] & 0x80)
-			k1[i] |= 1;
-		}
+	for (i = 0; i < bl-1; i++, c = cnext)
+		k1[i] = (c << 1) | ((cnext=l[i+1]) >> 7);
+
 	/* If MSB set fixup with R */
-	if (l[0] & 0x80)
-		k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+	k1[i] = (c << 1) ^ ((0-carry)&(bl==16?0x87:0x1b));
 	}
 
 CMAC_CTX *CMAC_CTX_new(void)
@@ -143,7 +141,8 @@ int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
 int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
 			const EVP_CIPHER *cipher, ENGINE *impl)
 	{
-	static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+	__fips_constseg
+	static const unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH] = {0};
 	/* All zeros means restart */
 	if (!key && !cipher && !impl && keylen == 0)
 		{
@@ -152,6 +151,8 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
 			return 0;
 		if (!M_EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
 			return 0;
+		memset(ctx->tbl, 0, M_EVP_CIPHER_CTX_block_size(&ctx->cctx));	
+		ctx->nlast_block = 0;
 		return 1;
 		}
 	/* Initialiase context */
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index 524daf037d..cf96011cc5 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -297,7 +297,7 @@ void OPENSSL_showfatal (const char *fmta,...)
 	DWORD out;
 
 	va_start (ap,fmta);
-	len=_vsnprintf((char *)buf,sizeof(buf),fmt,ap);
+	len=_vsnprintf((char *)buf,sizeof(buf),fmta,ap);
 	WriteFile(h,buf,len<0?sizeof(buf):(DWORD)len,&out,NULL);
 	va_end (ap);
 	return;
@@ -359,7 +359,15 @@ void OPENSSL_showfatal (const char *fmta,...)
 { va_list ap;
 
     va_start (ap,fmta);
+#if defined(OPENSSL_SYS_VXWORKS)
+    {
+	char buf[256];
+	vsnprintf(buf,sizeof(buf),fmta,ap);
+	printf("%s",buf);
+    }
+#else 
     vfprintf (stderr,fmta,ap);
+#endif
     va_end (ap);
 }
 int OPENSSL_isservice (void) { return 0; }
@@ -374,7 +382,9 @@ void OpenSSLDie(const char *file,int line,const char *assertion)
 	abort();
 #else
 	/* Win32 abort() customarily shows a dialog, but we just did that... */
+#ifdef SIGABRT
 	raise(SIGABRT);
+#endif
 	_exit(3);
 #endif
 	}
diff --git a/crypto/des/spr.h b/crypto/des/spr.h
index 9be0dce9f6..35e71a5118 100644
--- a/crypto/des/spr.h
+++ b/crypto/des/spr.h
@@ -56,6 +56,9 @@
  * [including the GNU Public Licence.]
  */
 
+#ifdef _TMS320C6X
+#  pragma DATA_SECTION(DES_SPtrans,".const:des_sptrans")
+#endif
 __fips_constseg
 OPENSSL_GLOBAL const DES_LONG DES_SPtrans[8][64]={
 {
diff --git a/crypto/dsa/dsa.h b/crypto/dsa/dsa.h
index 86766dacb4..408ee11b72 100644
--- a/crypto/dsa/dsa.h
+++ b/crypto/dsa/dsa.h
@@ -215,6 +215,11 @@ DSA_SIG * FIPS_dsa_sign_ctx(DSA *dsa, EVP_MD_CTX *ctx);
 int FIPS_dsa_verify_digest(DSA *dsa,
 				const unsigned char *dig, int dlen, DSA_SIG *s);
 int FIPS_dsa_verify_ctx(DSA *dsa, EVP_MD_CTX *ctx, DSA_SIG *s);
+int FIPS_dsa_verify(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, DSA_SIG *s);
+DSA_SIG * FIPS_dsa_sign(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash);
+
 #endif
 
 DSA *	DSA_new(void);
diff --git a/crypto/dsa/dsa_gen.c b/crypto/dsa/dsa_gen.c
index d5f4debc92..9e3e57a828 100644
--- a/crypto/dsa/dsa_gen.c
+++ b/crypto/dsa/dsa_gen.c
@@ -666,7 +666,13 @@ int dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N,
 			/* "offset = offset + n + 1" */
 
 			/* step 14 */
-			if (counter >= 4096) break;
+			if (counter >= (int)(4 * L)) break;
+			}
+		if (seed_in)
+			{
+			ok = 0;
+			DSAerr(DSA_F_DSA_BUILTIN_PARAMGEN2, DSA_R_INVALID_PARAMETERS);
+			goto err;
 			}
 		}
 end:
diff --git a/crypto/ec/ec2_smpl.c b/crypto/ec/ec2_smpl.c
index f37347b5e1..9a9476f0c1 100644
--- a/crypto/ec/ec2_smpl.c
+++ b/crypto/ec/ec2_smpl.c
@@ -556,7 +556,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_
 	field_sqr = group->meth->field_sqr;	
 
 	/* only support affine coordinates */
-	if (!point->Z_is_one) goto err;
+	if (!point->Z_is_one) return -1;
 
 	if (ctx == NULL)
 		{
diff --git a/crypto/ec/ec_key.c b/crypto/ec/ec_key.c
index f3331e1ce5..24ae707560 100644
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c
@@ -511,10 +511,12 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
 								tx, ty, ctx))
 			goto err;
 		}
-	/* Check if retrieved coordinates match originals: if not values
-	 * are out of range.
+	/* Check if retrieved coordinates match originals and are less than
+	 * field order: if not values are out of range.
 	 */
-	if (BN_cmp(x, tx) || BN_cmp(y, ty))
+	if (BN_cmp(x, tx) || BN_cmp(y, ty)
+		|| (BN_cmp(x, &key->group->field) >= 0)
+		|| (BN_cmp(y, &key->group->field) >= 0))
 		{
 		ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
 			EC_R_COORDINATES_OUT_OF_RANGE);
diff --git a/crypto/ecdh/ecdh.h b/crypto/ecdh/ecdh.h
index b4b58ee65b..8ac82b8cbd 100644
--- a/crypto/ecdh/ecdh.h
+++ b/crypto/ecdh/ecdh.h
@@ -85,6 +85,8 @@
 extern "C" {
 #endif
 
+#define EC_FLAG_COFACTOR_ECDH	0x1000
+
 const ECDH_METHOD *ECDH_OpenSSL(void);
 
 void	  ECDH_set_default_method(const ECDH_METHOD *);
diff --git a/crypto/ecdh/ech_ossl.c b/crypto/ecdh/ech_ossl.c
index 94a8f4b696..2656797449 100644
--- a/crypto/ecdh/ech_ossl.c
+++ b/crypto/ecdh/ech_ossl.c
@@ -146,6 +146,18 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
 		}
 
 	group = EC_KEY_get0_group(ecdh);
+
+	if (EC_KEY_get_flags(ecdh) & EC_FLAG_COFACTOR_ECDH)
+		{
+		if (!EC_GROUP_get_cofactor(group, x, ctx) ||
+			!BN_mul(x, x, priv_key, ctx))
+			{
+			ECDHerr(ECDH_F_ECDH_COMPUTE_KEY, ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+		priv_key = x;
+		}
+
 	if ((tmp=EC_POINT_new(group)) == NULL)
 		{
 		ECDHerr(ECDH_F_ECDH_COMPUTE_KEY,ERR_R_MALLOC_FAILURE);
diff --git a/crypto/ecdsa/ecdsa.h b/crypto/ecdsa/ecdsa.h
index c3275b0839..cd6d19ccde 100644
--- a/crypto/ecdsa/ecdsa.h
+++ b/crypto/ecdsa/ecdsa.h
@@ -236,6 +236,11 @@ ECDSA_SIG * FIPS_ecdsa_sign_ctx(EC_KEY *key, EVP_MD_CTX *ctx);
 int FIPS_ecdsa_verify_digest(EC_KEY *key,
 			const unsigned char *dig, int dlen, ECDSA_SIG *s);
 int FIPS_ecdsa_verify_ctx(EC_KEY *key, EVP_MD_CTX *ctx, ECDSA_SIG *s);
+int FIPS_ecdsa_verify(EC_KEY *key, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, ECDSA_SIG *s);
+ECDSA_SIG * FIPS_ecdsa_sign(EC_KEY *key,
+			const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash);
 #endif
 
 
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index 429255d215..6f77e7e4b9 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -89,6 +89,10 @@ typedef struct
 	{
 	AES_KEY ks1, ks2;	/* AES key schedules to use */
 	XTS128_CONTEXT xts;
+	void     (*stream)(const unsigned char *in,
+			unsigned char *out, size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
 	} EVP_AES_XTS_CTX;
 
 typedef struct
@@ -123,6 +127,9 @@ void vpaes_cbc_encrypt(const unsigned char *in,
 			unsigned char *ivec, int enc);
 #endif
 #ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+			size_t length, const AES_KEY *key,
+			unsigned char ivec[16], int enc);
 void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
 			size_t len, const AES_KEY *key,
 			const unsigned char ivec[16]);
@@ -133,6 +140,19 @@ void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
 			const unsigned char ivec[AES_BLOCK_SIZE]);
 #endif
 
+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+extern int OPENSSL_ppccap_P;
+# define HWAES_CAPABLE  (OPENSSL_ppccap_P & (1<<2))
+# define HWAES_set_encrypt_key aes_p8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_p8_set_decrypt_key
+# define HWAES_encrypt aes_p8_encrypt
+# define HWAES_decrypt aes_p8_decrypt
+# define HWAES_cbc_encrypt aes_p8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
+# define HWAES_xts_encrypt aes_p8_xts_encrypt
+# define HWAES_xts_decrypt aes_p8_xts_decrypt
+#endif
+
 #if	defined(AES_ASM) && !defined(I386_ONLY) &&	(  \
 	((defined(__i386)	|| defined(__i386__)	|| \
 	  defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
@@ -337,11 +357,13 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 			{
 			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 			xctx->xts.block1 = (block128_f)aesni_encrypt;
+			xctx->stream = aesni_xts_encrypt;
 			}
 		else
 			{
 			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 			xctx->xts.block1 = (block128_f)aesni_decrypt;
+			xctx->stream = aesni_xts_decrypt;
 			}
 
 		aesni_set_encrypt_key(key + ctx->key_len/2,
@@ -360,32 +382,9 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	return 1;
 	}
 
+#define aesni_xts_cipher aes_xts_cipher
 static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
-		const unsigned char *in, size_t len)
-	{
-	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
-	if (!xctx->xts.key1 || !xctx->xts.key2)
-		return -1;
-	if (!out || !in)
-		return -1;
-#ifdef OPENSSL_FIPS
-	/* Requirement of SP800-38E */
-	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
-			(len > (1L<<20)*16))
-		{
-		EVPerr(EVP_F_AESNI_XTS_CIPHER, EVP_R_TOO_LARGE);
-		return -1;
-		}
-#endif
-	if (ctx->encrypt)
-		aesni_xts_encrypt(in, out, len,
-			xctx->xts.key1, xctx->xts.key2, ctx->iv);
-	else
-		aesni_xts_decrypt(in, out, len,
-			xctx->xts.key1, xctx->xts.key2, ctx->iv);
-
-	return len;
-	}
+		const unsigned char *in, size_t len);
 
 static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                         const unsigned char *iv, int enc)
@@ -485,6 +484,42 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
 { return &aes_##keylen##_##mode; }
 #endif
 
+#if defined(OPENSSL_CPUID_OBJ) && defined(__aarch64__)
+#include "arm_arch.h"
+#if __ARM_ARCH__>=7
+# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
+# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
+# define HWAES_encrypt aes_v8_encrypt
+# define HWAES_decrypt aes_v8_decrypt
+# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
+#endif
+#endif
+
+#if defined(HWAES_CAPABLE)
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
+	const AES_KEY *key);
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
+	const AES_KEY *key);
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+	size_t length, const AES_KEY *key,
+	unsigned char *ivec, const int enc);
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+	size_t len, const AES_KEY *key, const unsigned char ivec[16]);
+void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out,
+	size_t len, const AES_KEY *key1,
+	const AES_KEY *key2, const unsigned char iv[16]);
+void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out,
+	size_t len, const AES_KEY *key1,
+	const AES_KEY *key2, const unsigned char iv[16]);
+
+#endif
+
 #define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\
 	BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
 	BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
@@ -503,6 +538,28 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	mode = ctx->cipher->flags & EVP_CIPH_MODE;
 	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
 	    && !enc)
+#ifdef HWAES_CAPABLE
+	    if (HWAES_CAPABLE)
+		{
+		ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block      = (block128_f)HWAES_decrypt;
+		dat->stream.cbc = NULL;
+#ifdef HWAES_cbc_encrypt
+		if (mode==EVP_CIPH_CBC_MODE)
+		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+#endif
+		}
+	    else
+#endif
+#ifdef BSAES_CAPABLE
+	    if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+		{
+		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_decrypt;
+		dat->stream.cbc	= (cbc128_f)bsaes_cbc_encrypt;
+		}
+	    else
+#endif
 #ifdef VPAES_CAPABLE
 	    if (VPAES_CAPABLE)
 		{
@@ -522,6 +579,26 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 					NULL;
 		}
 	else
+#ifdef HWAES_CAPABLE
+	    if (HWAES_CAPABLE)
+		{
+		ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block      = (block128_f)HWAES_encrypt;
+		dat->stream.cbc = NULL;
+#ifdef HWAES_cbc_encrypt
+		if (mode==EVP_CIPH_CBC_MODE)
+		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+		else
+#endif
+#ifdef HWAES_ctr32_encrypt_blocks
+		if (mode==EVP_CIPH_CTR_MODE)
+		    dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
+		else
+#endif
+		(void)0;	/* terminate potentially open 'else' */
+		}
+	    else
+#endif
 #ifdef BSAES_CAPABLE
 	    if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
 		{
@@ -800,6 +877,28 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 		/* Extra padding: tag appended to record */
 		return EVP_GCM_TLS_TAG_LEN;
 
+	case EVP_CTRL_COPY:
+		{
+			EVP_CIPHER_CTX *out = ptr;
+			EVP_AES_GCM_CTX *gctx_out = out->cipher_data;
+			if (gctx->gcm.key)
+				{
+				if (gctx->gcm.key != &gctx->ks)
+					return 0;
+				gctx_out->gcm.key = &gctx_out->ks;
+				}
+			if (gctx->iv == c->iv)
+				gctx_out->iv = out->iv;
+			else
+			{
+				gctx_out->iv = OPENSSL_malloc(gctx->ivlen);
+				if (!gctx_out->iv)
+					return 0;
+				memcpy(gctx_out->iv, gctx->iv, gctx->ivlen);
+			}
+			return 1;
+		}
+
 	default:
 		return -1;
 
@@ -814,6 +913,21 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		return 1;
 	if (key)
 		{ do {
+#ifdef HWAES_CAPABLE
+		if (HWAES_CAPABLE)
+			{
+			HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+					(block128_f)HWAES_encrypt);
+#ifdef HWAES_ctr32_encrypt_blocks
+			gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
+#else
+			gctx->ctr = NULL;
+#endif
+			break;
+			}
+		else
+#endif
 #ifdef BSAES_CAPABLE
 		if (BSAES_CAPABLE)
 			{
@@ -961,8 +1075,6 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 
 	if (!gctx->iv_set)
 		return -1;
-	if (!ctx->encrypt && gctx->taglen < 0)
-		return -1;
 	if (in)
 		{
 		if (out == NULL)
@@ -1004,6 +1116,8 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 		{
 		if (!ctx->encrypt)
 			{
+			if (gctx->taglen < 0)
+				return -1;
 			if (CRYPTO_gcm128_finish(&gctx->gcm,
 					ctx->buf, gctx->taglen) != 0)
 				return -1;
@@ -1021,7 +1135,8 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 
 #define CUSTOM_FLAGS	(EVP_CIPH_FLAG_DEFAULT_ASN1 \
 		| EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
-		| EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+		| EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
+		| EVP_CIPH_CUSTOM_COPY)
 
 BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
 		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
@@ -1033,7 +1148,25 @@ BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
 static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 	{
 	EVP_AES_XTS_CTX *xctx = c->cipher_data;
-	if (type != EVP_CTRL_INIT)
+	if (type == EVP_CTRL_COPY)
+		{
+		EVP_CIPHER_CTX *out = ptr;
+		EVP_AES_XTS_CTX *xctx_out = out->cipher_data;
+		if (xctx->xts.key1)
+			{
+			if (xctx->xts.key1 != &xctx->ks1)
+				return 0;
+			xctx_out->xts.key1 = &xctx_out->ks1;
+			}
+		if (xctx->xts.key2)
+			{
+			if (xctx->xts.key2 != &xctx->ks2)
+				return 0;
+			xctx_out->xts.key2 = &xctx_out->ks2;
+			}
+		return 1;
+		}
+	else if (type != EVP_CTRL_INIT)
 		return -1;
 	/* key1 and key2 are used as an indicator both key and IV are set */
 	xctx->xts.key1 = NULL;
@@ -1050,7 +1183,37 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 
 	if (key) do
 		{
+		xctx->stream = NULL;
 		/* key_len is two AES keys */
+#ifdef HWAES_CAPABLE
+		if (HWAES_CAPABLE)
+			{
+			if (enc)
+			    {
+			    HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			    xctx->xts.block1 = (block128_f)HWAES_encrypt;
+#ifdef HWAES_xts_encrypt
+			    xctx->stream = HWAES_xts_encrypt;
+#endif
+			    }
+			else
+			    {
+			    HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			    xctx->xts.block1 = (block128_f)HWAES_decrypt;
+#ifdef HWAES_xts_decrypt
+			    xctx->stream = HWAES_xts_decrypt;
+#endif
+			    }
+
+			HWAES_set_encrypt_key(key + ctx->key_len/2,
+						    ctx->key_len * 4, &xctx->ks2);
+			xctx->xts.block2 = (block128_f)HWAES_encrypt;
+
+			xctx->xts.key1 = &xctx->ks1;
+			break;
+			}
+		else
+#endif
 #ifdef VPAES_CAPABLE
 		if (VPAES_CAPABLE)
 		    {
@@ -1105,28 +1268,32 @@ static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 	{
 	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
 	if (!xctx->xts.key1 || !xctx->xts.key2)
-		return -1;
+		return 0;
 	if (!out || !in)
-		return -1;
+		return 0;
 #ifdef OPENSSL_FIPS
 	/* Requirement of SP800-38E */
 	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
-			(len > (1L<<20)*16))
+			(len > (1UL<<20)*16))
 		{
 		EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
-		return -1;
+		return 0;
 		}
 #endif
-	if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+	if (xctx->stream)
+		(*xctx->stream)(in, out, len,
+				xctx->xts.key1, xctx->xts.key2, ctx->iv);
+	else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
 								ctx->encrypt))
-		return -1;
-	return len;
+		return 0;
+	return 1;
 	}
 
 #define aes_xts_cleanup NULL
 
 #define XTS_FLAGS	(EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
-			 | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+			 | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
+			 | EVP_CIPH_CUSTOM_COPY)
 
 BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
 BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
@@ -1176,6 +1343,19 @@ static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 		cctx->len_set = 0;
 		return 1;
 
+	case EVP_CTRL_COPY:
+		{
+			EVP_CIPHER_CTX *out = ptr;
+			EVP_AES_CCM_CTX *cctx_out = out->cipher_data;
+			if (cctx->ccm.key)
+				{
+				if (cctx->ccm.key != &cctx->ks)
+					return 0;
+				cctx_out->ccm.key = &cctx_out->ks;
+				}
+			return 1;
+		}
+
 	default:
 		return -1;
 
@@ -1190,12 +1370,26 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		return 1;
 	if (key) do
 		{
+#ifdef HWAES_CAPABLE
+		if (HWAES_CAPABLE)
+			{
+			HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks);
+
+			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)HWAES_encrypt);
+			cctx->str = NULL;
+			cctx->key_set = 1;
+			break;
+			}
+		else
+#endif
 #ifdef VPAES_CAPABLE
 		if (VPAES_CAPABLE)
 			{
 			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
 			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
 					&cctx->ks, (block128_f)vpaes_encrypt);
+			cctx->str = NULL;
 			cctx->key_set = 1;
 			break;
 			}
diff --git a/crypto/evp/evp_locl.h b/crypto/evp/evp_locl.h
index 94162d6419..6d1753522a 100644
--- a/crypto/evp/evp_locl.h
+++ b/crypto/evp/evp_locl.h
@@ -75,7 +75,7 @@ static int cname##_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const uns
 	return 1;\
 }
 
-#define EVP_MAXCHUNK ((size_t)1<<(sizeof(long)*8-2))
+#define EVP_MAXCHUNK ((size_t)1<<(sizeof(int)*8-2))
 
 #define BLOCK_CIPHER_func_ofb(cname, cprefix, cbits, kstruct, ksched) \
 static int cname##_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) \
diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
index 811969304f..5a170498bf 100644
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@@ -56,11 +56,16 @@ ghash-alpha.s:	asm/ghash-alpha.pl
 	$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
 ghash-parisc.s:	asm/ghash-parisc.pl
 	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+ghashv8-armx.S:	asm/ghashv8-armx.pl
+	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
+ghashp8-ppc.s:	asm/ghashp8-ppc.pl
+	$(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@
 
 # GNU make "catch all"
 ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 
 ghash-armv4.o:	ghash-armv4.S
+ghashv8-armx.o:	ghashv8-armx.S
 
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index d91586ee29..3799b2b559 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -57,8 +57,20 @@
 # *native* byte order on current platform. See gcm128.c for working
 # example...
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $Xi="r0";	# argument block
 $Htbl="r1";
@@ -112,6 +124,11 @@ $code=<<___;
 .text
 .code	32
 
+#ifdef  __APPLE__
+#define ldrplb	ldrbpl
+#define ldrneb	ldrbne
+#endif
+
 .type	rem_4bit,%object
 .align	5
 rem_4bit:
@@ -326,9 +343,9 @@ $code.=<<___;
 .align	4
 gcm_gmult_neon:
 	sub		$Htbl,#16		@ point at H in GCM128_CTX
-	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+	vld1.64		`&Dhi("$IN")`,[$Xi]!	@ load Xi
 	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
+	vld1.64		`&Dlo("$IN")`,[$Xi]!
 	vshr.u64	$mod,#32
 	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
 	veor		$zero,$zero
@@ -349,9 +366,9 @@ gcm_gmult_neon:
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
-	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
+	vld1.64		`&Dhi("$Z")`,[$Xi]!	@ load Xi
 	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
+	vld1.64		`&Dlo("$Z")`,[$Xi]!
 	vshr.u64	$mod,#32
 	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
 	veor		$zero,$zero
@@ -410,8 +427,8 @@ gcm_ghash_neon:
 	vrev64.8	$Z,$Z
 #endif
 	sub		$Xi,#16	
-	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
-	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+	vst1.64		`&Dhi("$Z")`,[$Xi]!	@ write out Xi
+	vst1.64		`&Dlo("$Z")`,[$Xi]
 
 	bx	lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
diff --git a/crypto/modes/asm/ghash-c64xplus.pl b/crypto/modes/asm/ghash-c64xplus.pl
new file mode 100644
index 0000000000..1ac4d927d0
--- /dev/null
+++ b/crypto/modes/asm/ghash-c64xplus.pl
@@ -0,0 +1,231 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements GCM GHASH function and underlying single
+# multiplication operation in GF(2^128). Even though subroutines
+# have _4bit suffix, they are not using any tables, but rely on
+# hardware Galois Field Multiply support. Streamed GHASH processes
+# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
+# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
+# comparing apples vs. oranges, but compiler surely could have done
+# better, because theoretical [though not necessarily achievable]
+# estimate for "4-bit" table-driven implementation is ~12 cycles.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments
+
+($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
+			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
+			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5");		# $rem zaps $Htable
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+
+	.if	0
+	.global	_gcm_gmult_1bit
+_gcm_gmult_1bit:
+	ADDAD	$Htable,2,$Htable
+	.endif
+	.global	_gcm_gmult_4bit
+_gcm_gmult_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+||	LDBU	*++${xip}[15],$x1	; Xi[15]
+	MVK	0xFF,$FF000000
+||	LDBU	*--${xip},$x0		; Xi[14]
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	BNOP	ghash_loop?
+||	MVK	1,B0			; take a single spin
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+||	ZERO	$Z1:$Z0
+	SHRU2	$xia,8,$H01u
+||	ZERO	$Z3:$Z2
+	.endasmfunc
+
+	.global	_gcm_ghash_4bit
+_gcm_ghash_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+||	SHRU	$len,4,B0		; reassign len
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
+	MVK	0xFF,$FF000000
+|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+||	LDDW	*${xip}[1],$Z1:$Z0
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	LDDW	*${xip}[0],$Z3:$Z2
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+	SHRU2	$xia,8,$H01u
+
+|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+	.if	.LITTLE_ENDIAN
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+
+ghash_loop?:
+	SPLOOPD	6			; 6*16+7
+||	MVC	B1,ILC
+|| [B0]	SUB	B0,1,B0
+||	ZERO	A0
+||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
+||	SHL	$x1,1,$xia
+___
+
+########____________________________
+#  0    D2.     M1          M2      |
+#  1            M1                  |
+#  2            M1          M2      |
+#  3        D1. M1          M2      |
+#  4        S1. L1                  |
+#  5    S2  S1x L1          D2  L2  |____________________________
+#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
+#  7/1          L1  S1  D1x S2  M2  |        M1                  |
+#  8/2              S1  L1x S2      |        M1          M2      |
+#  9/3              S1  L1x         |    D1. M1          M2      |
+# 10/4                  D1x         |    S1. L1                  |
+# 11/5                              |S2  S1x L1          D2  L2  |____________
+# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
+#    7/1                                     L1  S1  D1x S2  M2  |        ....
+#    8/2                                         S1  L1x S2      |        ....
+#####...                                         ................|............
+$code.=<<___;
+	XORMPY	$H0,$xia,$H0x		; 0	; H�Xi[i]
+||	XORMPY	$H01u,$xib,$H01y
+|| [A0]	LDBU	*--${xip},$x0
+	XORMPY	$H1,$xia,$H1x		; 1
+	XORMPY	$H2,$xia,$H2x		; 2
+||	XORMPY	$H2u,$xib,$H2y
+	XORMPY	$H3,$xia,$H3x		; 3
+||	XORMPY	$H3u,$xib,$H3y
+||[!A0]	MVK.D	15,A0				; *--${xip} counter
+	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H�Xi[i]
+|| [A0]	SUB.S	A0,1,A0
+	XOR.L	$H1x,$Z1,$Z1		; 5
+||	AND.D	$H01y,$FF000000,$H0z
+||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
+||	SHL	$x0,1,$xib
+||	SHL	$x0,1,$xia
+
+	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
+||	SHL	$Z0,1,$rem		;	; rem=Z<<1
+||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
+||	AND.L	$H1y,$FF000000,$H1z
+	XOR.L	$H3x,$Z3,$Z3		; 7/1
+||	SHRMB.S	$Z2,$Z1,$Z1
+||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
+||	AND.S	$H2y,$FF000000,$H2z
+||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
+	XOR.L	$H1z,$Z1,$Z1		; 8/2
+||	SHRMB.S	$Z3,$Z2,$Z2
+||	AND.S	$H3y,$FF000000,$H3z
+	XOR.L	$H2z,$Z2,$Z2		; 9/3
+||	SHRU	$Z3,8,$Z3
+	XOR.D	$H3z,$Z3,$Z3		; 10/4
+	NOP				; 11/5
+
+	SPKERNEL 0,2
+||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res
+
+	; input pre-fetch is possible where D1 slot is available...
+   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
+   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
+	NOP				; 10/-
+	.if	.LITTLE_ENDIAN
+	SWAP2	$Z0,$Z1			; 11/-
+||	SWAP4	$Z1,$Z0
+	SWAP4	$Z1,$Z1			; 12/-
+||	SWAP2	$Z0,$Z0
+	SWAP2	$Z2,$Z3
+||	SWAP4	$Z3,$Z2
+||[!B0]	BNOP	RA
+	SWAP4	$Z3,$Z3
+||	SWAP2	$Z2,$Z2
+|| [B0]	BNOP	ghash_loop?
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+  [!B0]	BNOP	RA			; 11/-
+   [B0]	BNOP	ghash_loop?		; 12/-
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+	.endasmfunc
+
+	.sect	.const
+	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl
new file mode 100755
index 0000000000..82bf125eb1
--- /dev/null
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -0,0 +1,663 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for futher improvement.
+
+# May 2016
+#
+# 2x aggregated reduction improves performance by 50% (resulting
+# performance on POWER8 is 1 cycle per processed byte), and 4x
+# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
+	$STU="stdu";
+	$POP="ld";
+	$PUSH="std";
+	$UCMP="cmpld";
+	$SHRI="srdi";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
+	$STU="stwu";
+	$POP="lwz";
+	$PUSH="stw";
+	$UCMP="cmplw";
+	$SHRI="srwi";
+} else { die "nonsense $flavour"; }
+
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
+my $vrsave="r12";
+
+$code=<<___;
+.machine	"any"
+
+.text
+
+.globl	.gcm_init_p8
+.align	5
+.gcm_init_p8:
+	li		r0,-4096
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$H,0,r4			# load H
+
+	vspltisb	$xC2,-16		# 0xf0
+	vspltisb	$t0,1			# one
+	vaddubm		$xC2,$xC2,$xC2		# 0xe0
+	vxor		$zero,$zero,$zero
+	vor		$xC2,$xC2,$t0		# 0xe1
+	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
+	vsldoi		$t1,$zero,$t0,1		# ...1
+	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
+	vspltisb	$t2,7
+	vor		$xC2,$xC2,$t1		# 0xc2....01
+	vspltb		$t1,$H,0		# most significant byte
+	vsl		$H,$H,$t0		# H<<=1
+	vsrab		$t1,$t1,$t2		# broadcast carry bit
+	vand		$t1,$t1,$xC2
+	vxor		$IN,$H,$t1		# twisted H
+
+	vsldoi		$H,$IN,$IN,8		# twist even more ...
+	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
+	vsldoi		$Hl,$zero,$H,8		# ... and split
+	vsldoi		$Hh,$H,$zero,8
+
+	stvx_u		$xC2,0,r3		# save pre-computed table
+	stvx_u		$Hl,r8,r3
+	li		r8,0x40
+	stvx_u		$H, r9,r3
+	li		r9,0x50
+	stvx_u		$Hh,r10,r3
+	li		r10,0x60
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$IN1,$Xl,$t1
+
+	vsldoi		$H2,$IN1,$IN1,8
+	vsldoi		$H2l,$zero,$H2,8
+	vsldoi		$H2h,$H2,$zero,8
+
+	stvx_u		$H2l,r8,r3		# save H^2
+	li		r8,0x70
+	stvx_u		$H2,r9,r3
+	li		r9,0x80
+	stvx_u		$H2h,r10,r3
+	li		r10,0x90
+___
+{
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+$code.=<<___;
+	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
+	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
+	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
+	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
+	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
+	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vsldoi		$t4,$Xm1,$zero,8
+	 vsldoi		$t5,$zero,$Xm1,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+	 vxor		$Xl1,$Xl1,$t4
+	 vxor		$Xh1,$Xh1,$t5
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	 vsldoi		$Xl1,$Xl1,$Xl1,8
+	vxor		$Xl,$Xl,$t2
+	 vxor		$Xl1,$Xl1,$t6
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 vpmsumd	$Xl1,$Xl1,$xC2
+	vxor		$t1,$t1,$Xh
+	 vxor		$t5,$t5,$Xh1
+	vxor		$Xl,$Xl,$t1
+	 vxor		$Xl1,$Xl1,$t5
+
+	vsldoi		$H,$Xl,$Xl,8
+	 vsldoi		$H2,$Xl1,$Xl1,8
+	vsldoi		$Hl,$zero,$H,8
+	vsldoi		$Hh,$H,$zero,8
+	 vsldoi		$H2l,$zero,$H2,8
+	 vsldoi		$H2h,$H2,$zero,8
+
+	stvx_u		$Hl,r8,r3		# save H^3
+	li		r8,0xa0
+	stvx_u		$H,r9,r3
+	li		r9,0xb0
+	stvx_u		$Hh,r10,r3
+	li		r10,0xc0
+	 stvx_u		$H2l,r8,r3		# save H^4
+	 stvx_u		$H2,r9,r3
+	 stvx_u		$H2h,r10,r3
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_init_p8,.-.gcm_init_p8
+___
+}
+$code.=<<___;
+.globl	.gcm_gmult_p8
+.align	5
+.gcm_gmult_p8:
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$IN,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$zero,$zero,$zero
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl	.gcm_ghash_p8
+.align	5
+.gcm_ghash_p8:
+	li		r0,-4096
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$Xl,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	li		r8,0x40
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	li		r9,0x50
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	li		r10,0x60
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$Xl,$Xl,$Xl,$lemask
+	vxor		$zero,$zero,$zero
+
+	${UCMP}i	$len,64
+	bge		Lgcm_ghash_p8_4x
+
+	lvx_u		$IN,0,$inp
+	addi		$inp,$inp,16
+	subic.		$len,$len,16
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$IN,$IN,$Xl
+	beq		Lshort
+
+	lvx_u		$H2l,r8,$Htbl		# load H^2
+	li		r8,16
+	lvx_u		$H2, r9,$Htbl
+	add		r9,$inp,$len		# end of input
+	lvx_u		$H2h,r10,$Htbl
+	be?b		Loop_2x
+
+.align	5
+Loop_2x:
+	lvx_u		$IN1,0,$inp
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+
+	 subic		$len,$len,32
+	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
+	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
+	 subfe		r0,r0,r0		# borrow?-1:0
+	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
+	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
+	 and		r0,r0,$len
+	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
+	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
+	 add		$inp,$inp,r0
+
+	vxor		$Xl,$Xl,$Xl1
+	vxor		$Xm,$Xm,$Xm1
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vxor		$Xh,$Xh,$Xh1
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+	 lvx_u		$IN,r8,$inp
+	 addi		$inp,$inp,32
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$t1,$t1,$Xh
+	vxor		$IN,$IN,$t1
+	vxor		$IN,$IN,$Xl
+	$UCMP		r9,$inp
+	bgt		Loop_2x			# done yet?
+
+	cmplwi		$len,0
+	bne		Leven
+
+Lshort:
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+
+Leven:
+	vxor		$Xl,$Xl,$t1
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,4,0
+	.long		0
+___
+{
+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
+    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
+my $IN0=$IN;
+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
+
+$code.=<<___;
+.align	5
+.gcm_ghash_p8_4x:
+Lgcm_ghash_p8_4x:
+	$STU		$sp,-$FRAME($sp)
+	li		r10,`15+6*$SIZE_T`
+	li		r11,`31+6*$SIZE_T`
+	stvx		v20,r10,$sp
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	li		r10,0x60
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
+	mtspr		256,r0			# preserve all AltiVec registers
+
+	lvsl		$t0,0,r8		# 0x0001..0e0f
+	#lvx_u		$H2l,r8,$Htbl		# load H^2
+	li		r8,0x70
+	lvx_u		$H2, r9,$Htbl
+	li		r9,0x80
+	vspltisb	$t1,8			# 0x0808..0808
+	#lvx_u		$H2h,r10,$Htbl
+	li		r10,0x90
+	lvx_u		$H3l,r8,$Htbl		# load H^3
+	li		r8,0xa0
+	lvx_u		$H3, r9,$Htbl
+	li		r9,0xb0
+	lvx_u		$H3h,r10,$Htbl
+	li		r10,0xc0
+	lvx_u		$H4l,r8,$Htbl		# load H^4
+	li		r8,0x10
+	lvx_u		$H4, r9,$Htbl
+	li		r9,0x20
+	lvx_u		$H4h,r10,$Htbl
+	li		r10,0x30
+
+	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
+	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
+	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
+
+	$SHRI		$len,$len,4		# this allows to use sign bit
+						# as carry
+	lvx_u		$IN0,0,$inp		# load input
+	lvx_u		$IN1,r8,$inp
+	subic.		$len,$len,8
+	lvx_u		$IN2,r9,$inp
+	lvx_u		$IN3,r10,$inp
+	addi		$inp,$inp,0x40
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+	le?vperm	$IN3,$IN3,$IN3,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+
+	 vpmsumd	$Xl1,$IN1,$H3l
+	 vpmsumd	$Xm1,$IN1,$H3
+	 vpmsumd	$Xh1,$IN1,$H3h
+
+	 vperm		$H21l,$H2,$H,$hiperm
+	 vperm		$t0,$IN2,$IN3,$loperm
+	 vperm		$H21h,$H2,$H,$loperm
+	 vperm		$t1,$IN2,$IN3,$hiperm
+	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+	 vxor		$Xm2,$Xm2,$Xm1
+	 vxor		$Xl3,$Xl3,$Xl1
+	 vxor		$Xm3,$Xm3,$Xm2
+	 vxor		$Xh3,$Xh3,$Xh1
+
+	blt		Ltail_4x
+
+Loop_4x:
+	lvx_u		$IN0,0,$inp
+	lvx_u		$IN1,r8,$inp
+	subic.		$len,$len,4
+	lvx_u		$IN2,r9,$inp
+	lvx_u		$IN3,r10,$inp
+	addi		$inp,$inp,0x40
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+	le?vperm	$IN3,$IN3,$IN3,$lemask
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+
+	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+	 vpmsumd	$Xl1,$IN1,$H3l
+	 vpmsumd	$Xm1,$IN1,$H3
+	 vpmsumd	$Xh1,$IN1,$H3h
+
+	vxor		$Xl,$Xl,$Xl3
+	vxor		$Xm,$Xm,$Xm3
+	vxor		$Xh,$Xh,$Xh3
+	 vperm		$t0,$IN2,$IN3,$loperm
+	 vperm		$t1,$IN2,$IN3,$hiperm
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
+	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	vpmsumd		$Xl,$Xl,$xC2
+
+	 vxor		$Xl3,$Xl3,$Xl1
+	 vxor		$Xh3,$Xh3,$Xh1
+	vxor		$Xh,$Xh,$IN0
+	 vxor		$Xm2,$Xm2,$Xm1
+	vxor		$Xh,$Xh,$t1
+	 vxor		$Xm3,$Xm3,$Xm2
+	vxor		$Xh,$Xh,$Xl
+	bge		Loop_4x
+
+Ltail_4x:
+	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+
+	vxor		$Xl,$Xl,$Xl3
+	vxor		$Xm,$Xm,$Xm3
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vxor		$Xh,$Xh,$Xh3
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	addic.		$len,$len,4
+	beq		Ldone_4x
+
+	lvx_u		$IN0,0,$inp
+	${UCMP}i	$len,2
+	li		$len,-4
+	blt		Lone
+	lvx_u		$IN1,r8,$inp
+	beq		Ltwo
+
+Lthree:
+	lvx_u		$IN2,r9,$inp
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+	vmr		$H4l,$H3l
+	vmr		$H4, $H3
+	vmr		$H4h,$H3h
+
+	vperm		$t0,$IN1,$IN2,$loperm
+	vperm		$t1,$IN1,$IN2,$hiperm
+	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
+	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+	vxor		$Xm3,$Xm3,$Xm2
+	b		Ltail_4x
+
+.align	4
+Ltwo:
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+	vperm		$t0,$zero,$IN1,$loperm
+	vperm		$t1,$zero,$IN1,$hiperm
+
+	vsldoi		$H4l,$zero,$H2,8
+	vmr		$H4, $H2
+	vsldoi		$H4h,$H2,$zero,8
+
+	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
+	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+2.hi
+	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi
+
+	b		Ltail_4x
+
+.align	4
+Lone:
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+
+	vsldoi		$H4l,$zero,$H,8
+	vmr		$H4, $H
+	vsldoi		$H4h,$H,$zero,8
+
+	vxor		$Xh,$IN0,$Xl
+	vxor		$Xl3,$Xl3,$Xl3
+	vxor		$Xm3,$Xm3,$Xm3
+	vxor		$Xh3,$Xh3,$Xh3
+
+	b		Ltail_4x
+
+Ldone_4x:
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	li		r10,`15+6*$SIZE_T`
+	li		r11,`31+6*$SIZE_T`
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	addi		$sp,$sp,$FRAME
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,0,4,0
+	.long		0
+___
+}
+$code.=<<___;
+.size	.gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o;
+	} else {
+	    s/le\?/#le#/o	or
+	    s/be\?//o;
+	}
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
new file mode 100644
index 0000000000..300e8d56cb
--- /dev/null
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,376 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# July 2014
+#
+# Implement 2x aggregated reduction [see ghash-x86.pl for background
+# information].
+#
+# Current performance in cycles per processed byte:
+#
+#		PMULL[2]	32-bit NEON(*)
+# Apple A7	0.92		5.62
+# Cortex-A53	1.01		8.39
+# Cortex-A57	1.17		7.61
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$Xi="x0";	# argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+$code.=<<___;
+.global	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	vld1.64		{$t1},[x1]		@ load H
+	vmov.i8		$xC2,#0xe1
+	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
+	vext.8		$IN,$t1,$t1,#8
+	vshr.u64	$t2,$xC2,#63
+	vdup.32		$t1,${t1}[1]
+	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
+	vshr.u64	$t2,$IN,#63
+	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
+	vand		$t2,$t2,$t0
+	vshl.i64	$IN,$IN,#1
+	vext.8		$t2,$t2,$t2,#8
+	vand		$t0,$t0,$t1
+	vorr		$IN,$IN,$t2		@ H<<<=1
+	veor		$H,$IN,$t0		@ twisted H
+	vst1.64		{$H},[x0],#16
+
+	@ calculate H^2
+	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
+	vpmull.p64	$Xl,$H,$H
+	veor		$t0,$t0,$H
+	vpmull2.p64	$Xh,$H,$H
+	vpmull.p64	$Xm,$t0,$t0
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$H2,$Xl,$t2
+
+	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
+	veor		$t1,$t1,$H2
+	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
+	vst1.64		{$Hhl-$H2},[x0]
+
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+
+.global	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	vld1.64		{$t1},[$Xi]		@ load Xi
+	vmov.i8		$xC2,#0xe1
+	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
+	vshl.u64	$xC2,$xC2,#57
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$IN,$t1,$t1,#8
+
+	vpmull.p64	$Xl,$H,$IN		@ H.lo�Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi�Xi.hi
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)�(Xi.lo+Xi.hi)
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+	ret
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+
+.global	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+___
+$code.=<<___		if ($flavour !~ /64/);
+	vstmdb		sp!,{d8-d15}
+___
+$code.=<<___;
+	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
+	subs		$len,$len,#32
+	vmov.i8		$xC2,#0xe1
+	mov		$inc,#16
+	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
+	vld1.64		{$H2},[$Htbl]
+	cclr		$inc,eq
+	vext.8		$Xl,$Xl,$Xl,#8
+	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
+	vshl.u64	$xC2,$xC2,#57		@ 0xc2.0
+#ifndef __ARMEB__
+	vrev64.8	$t0,$t0
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$IN,$t0,$t0,#8
+	b.lo		.Lodd_tail_v8
+___
+{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+$code.=<<___;
+	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$In,$t1,$t1,#8
+	veor		$IN,$IN,$Xl		@ I[i]^=Xi
+	vpmull.p64	$Xln,$H,$In		@ H�Ii+1
+	veor		$t1,$t1,$In		@ Karatsuba pre-processing
+	vpmull2.p64	$Xhn,$H,$In
+	b		.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	vext.8		$t2,$IN,$IN,#8
+	subs		$len,$len,#32
+	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo�Xi.lo
+	cclr		$inc,lo
+
+	 vpmull.p64	$Xmn,$Hhl,$t1
+	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi�Xi.hi
+	veor		$Xl,$Xl,$Xln		@ accumulate
+	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)�(Xi.lo+Xi.hi)
+	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i]
+
+	veor		$Xh,$Xh,$Xhn
+	 cclr		$inc,eq
+	veor		$Xm,$Xm,$Xmn
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] I[i+1]
+#ifndef __ARMEB__
+	 vrev64.8	$t0,$t0
+#endif
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+#ifndef __ARMEB__
+	 vrev64.8	$t1,$t1
+#endif
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	 vext.8		$In,$t1,$t1,#8
+	 vext.8		$IN,$t0,$t0,#8
+	veor		$Xl,$Xm,$t2
+	 vpmull.p64	$Xln,$H,$In		@ H�Ii+1
+	veor		$IN,$IN,$Xh		@ accumulate $IN early
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$IN,$IN,$t2
+	 veor		$t1,$t1,$In		@ Karatsuba pre-processing
+	veor		$IN,$IN,$Xl
+	 vpmull2.p64	$Xhn,$H,$In
+	b.hs		.Loop_mod2x_v8
+
+	veor		$Xh,$Xh,$t2
+	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
+	adds		$len,$len,#32
+	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
+	b.eq		.Ldone_v8
+___
+}
+$code.=<<___;
+.Lodd_tail_v8:
+	vext.8		$t2,$Xl,$Xl,#8
+	veor		$IN,$IN,$Xl		@ inp^=Xi
+	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi
+
+	vpmull.p64	$Xl,$H,$IN		@ H.lo�Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi�Xi.hi
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)�(Xi.lo+Xi.hi)
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+
+.Ldone_v8:
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+___
+$code.=<<___		if ($flavour !~ /64/);
+	vldmia		sp!,{d8-d15}
+___
+$code.=<<___;
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+if ($flavour =~ /64/) {			######## 64-bit code
+    sub unvmov {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+    }
+    foreach(split("\n",$code)) {
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
+	s/vmov\s+(.*)/unvmov($1)/geo	or
+	s/vext\.8/ext/o			or
+	s/vshr\.s/sshr\.s/o		or
+	s/vshr/ushr/o			or
+	s/^(\s+)v/$1/o			or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;				# old->new style commentary
+
+	# fix up remainig legacy suffixes
+	s/\.[ui]?8(\s)/$1/o;
+	s/\.[uis]?32//o and s/\.16b/\.4s/go;
+	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
+	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
+	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    sub unvdup32 {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+    }
+    sub unvpmullp64 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+				 |(($2&7)<<17)|(($2&8)<<4)
+				 |(($3&7)<<1) |(($3&8)<<2);
+	    $word |= 0x00010001	 if ($mnemonic =~ "2");
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+
+    foreach(split("\n",$code)) {
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+	s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remainig new-style suffixes
+	s/\],#[0-9]+/]!/o;
+
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
+	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/^(\s+)b\./$1b/o						or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+	print $_,"\n";
+    }
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index e638e42be8..a46ec61135 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -645,7 +645,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
 
 #endif
 
-#if	TABLE_BITS==4 && defined(GHASH_ASM)
+#if	TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
 # if	!defined(I386_ONLY) && \
 	(defined(__i386)	|| defined(__i386__)	|| \
 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
@@ -666,14 +666,33 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #  endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
 #  include "arm_arch.h"
 #  if __ARM_ARCH__>=7
 #   define GHASH_ASM_ARM
 #   define GCM_FUNCREF_4BIT
+#   if defined(__aarch64__)
+#    define PMULL_CAPABLE	(OPENSSL_armcap_P & ARMV8_PMULL)
+#   endif
+#   if defined(__arm__) || defined(__arm)
+#    define NEON_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON)
+#   endif
 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #  endif
+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+#  define GHASH_ASM_PPC
+#  define GCM_FUNCREF_4BIT
+extern int OPENSSL_ppccap_P;
+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                  size_t len);
+# elif defined(_TMS320C6400_PLUS)
+#   define GHASH_ASM_C64Xplus
 # endif
 #endif
 
@@ -738,14 +757,38 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 	ctx->ghash = gcm_ghash_4bit;
 #  endif
 # elif	defined(GHASH_ASM_ARM)
-	if (OPENSSL_armcap_P & ARMV7_NEON) {
+#  ifdef PMULL_CAPABLE
+	if (PMULL_CAPABLE) {
+		gcm_init_v8(ctx->Htable,ctx->H.u);
+		ctx->gmult = gcm_gmult_v8;
+		ctx->ghash = gcm_ghash_v8;
+	} else
+#  endif
+#  ifdef NEON_CAPABLE
+	if (NEON_CAPABLE) {
 		ctx->gmult = gcm_gmult_neon;
 		ctx->ghash = gcm_ghash_neon;
-	} else {
+	} else
+#  endif
+	{
 		gcm_init_4bit(ctx->Htable,ctx->H.u);
 		ctx->gmult = gcm_gmult_4bit;
 		ctx->ghash = gcm_ghash_4bit;
 	}
+# elif defined(GHASH_ASM_PPC)
+	if (OPENSSL_ppccap_P & (1<<2)) {
+		gcm_init_p8(ctx->Htable, ctx->H.u);
+		ctx->gmult = gcm_gmult_p8;
+		ctx->ghash = gcm_ghash_p8;
+	} else {
+		gcm_init_4bit(ctx->Htable, ctx->H.u);
+		ctx->gmult = gcm_gmult_4bit;
+		ctx->ghash = gcm_ghash_4bit;
+	}
+# elif defined(GHASH_ASM_C64Xplus)
+	/* C64x+ assembler doesn't use tables, skip gcm_init_4bit.
+	 * This is likely to trigger "function never referenced"
+	 * warning and code being eliminated. */
 # else
 	gcm_init_4bit(ctx->Htable,ctx->H.u);
 # endif
@@ -1397,7 +1440,7 @@ int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
 #endif
 
-	if (ctx->mres)
+	if (ctx->mres || ctx->ares)
 		GCM_MUL(ctx,Xi);
 
 	if (is_endian.little) {
diff --git a/crypto/modes/modes_lcl.h b/crypto/modes/modes_lcl.h
index 4dab6a67fe..fa5d3b02f6 100644
--- a/crypto/modes/modes_lcl.h
+++ b/crypto/modes/modes_lcl.h
@@ -29,10 +29,7 @@ typedef unsigned char u8;
 #if defined(__i386)	|| defined(__i386__)	|| \
     defined(__x86_64)	|| defined(__x86_64__)	|| \
     defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
-    defined(__s390__)	|| defined(__s390x__)	|| \
-    ( (defined(__arm__)	|| defined(__arm)) && \
-      (defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__) || \
-       defined(__ARM_ARCH_7R__)	|| defined(__ARM_ARCH_7M__)) )
+    defined(__s390__)	|| defined(__s390x__)
 # undef STRICT_ALIGNMENT
 #endif
 
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
new file mode 100644
index 0000000000..22dc7e4ecc
--- /dev/null
+++ b/crypto/perlasm/arm-xlate.pl
@@ -0,0 +1,165 @@
+#!/usr/bin/env perl
+
+# ARM assembler distiller by <appro>.
+
+my $flavour = shift;
+my $output = shift;
+open STDOUT,">$output" || die "can't open $output: $!";
+
+$flavour = "linux32" if (!$flavour or $flavour eq "void");
+
+my %GLOBALS;
+my $dotinlocallabels=($flavour=~/linux/)?1:0;
+
+################################################################
+# directives which need special treatment on different platforms
+################################################################
+my $arch = sub {
+    if ($flavour =~ /linux/)	{ ".arch\t".join(',',@_); }
+    else			{ ""; }
+};
+my $fpu = sub {
+    if ($flavour =~ /linux/)	{ ".fpu\t".join(',',@_); }
+    else			{ ""; }
+};
+my $hidden = sub {
+    if ($flavour =~ /ios/)	{ ".private_extern\t".join(',',@_); }
+    else			{ ".hidden\t".join(',',@_); }
+};
+my $comm = sub {
+    my @args = split(/,\s*/,shift);
+    my $name = @args[0];
+    my $global = \$GLOBALS{$name};
+    my $ret;
+
+    if ($flavour =~ /ios32/)	{
+	$ret = ".comm\t_$name,@args[1]\n";
+	$ret .= ".non_lazy_symbol_pointer\n";
+	$ret .= "$name:\n";
+	$ret .= ".indirect_symbol\t_$name\n";
+	$ret .= ".long\t0";
+	$name = "_$name";
+    } else			{ $ret = ".comm\t".join(',',@args); }
+
+    $$global = $name;
+    $ret;
+};
+my $globl = sub {
+    my $name = shift;
+    my $global = \$GLOBALS{$name};
+    my $ret;
+
+    SWITCH: for ($flavour) {
+	/ios/		&& do { $name = "_$name";
+				last;
+			      };
+    }
+
+    $ret = ".globl	$name" if (!$ret);
+    $$global = $name;
+    $ret;
+};
+my $global = $globl;
+my $extern = sub {
+    &$globl(@_);
+    return;	# return nothing
+};
+my $type = sub {
+    if ($flavour =~ /linux/)	{ ".type\t".join(',',@_); }
+    else			{ ""; }
+};
+my $size = sub {
+    if ($flavour =~ /linux/)	{ ".size\t".join(',',@_); }
+    else			{ ""; }
+};
+my $inst = sub {
+    if ($flavour =~ /linux/)    { ".inst\t".join(',',@_); }
+    else                        { ".long\t".join(',',@_); }
+};
+my $asciz = sub {
+    my $line = join(",",@_);
+    if ($line =~ /^"(.*)"$/)
+    {	".byte	" . join(",",unpack("C*",$1),0) . "\n.align	2";	}
+    else
+    {	"";	}
+};
+
+sub range {
+  my ($r,$sfx,$start,$end) = @_;
+
+    join(",",map("$r$_$sfx",($start..$end)));
+}
+
+sub expand_line {
+  my $line = shift;
+  my @ret = ();
+
+    pos($line)=0;
+
+    while ($line =~ m/\G[^@\/\{\"]*/g) {
+	if ($line =~ m/\G(@|\/\/|$)/gc) {
+	    last;
+	}
+	elsif ($line =~ m/\G\{/gc) {
+	    my $saved_pos = pos($line);
+	    $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
+	    pos($line) = $saved_pos;
+	    $line =~ m/\G[^\}]*\}/g;
+	}
+	elsif ($line =~ m/\G\"/gc) {
+	    $line =~ m/\G[^\"]*\"/g;
+	}
+    }
+
+    $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
+
+    return $line;
+}
+
+while($line=<>) {
+
+    if ($line =~ m/^\s*(#|@|\/\/)/)	{ print $line; next; }
+
+    $line =~ s|/\*.*\*/||;	# get rid of C-style comments...
+    $line =~ s|^\s+||;		# ... and skip white spaces in beginning...
+    $line =~ s|\s+$||;		# ... and at the end
+
+    {
+	$line =~ s|[\b\.]L(\w{2,})|L$1|g;	# common denominator for Locallabel
+	$line =~ s|\bL(\w{2,})|\.L$1|g	if ($dotinlocallabels);
+    }
+
+    {
+	$line =~ s|(^[\.\w]+)\:\s*||;
+	my $label = $1;
+	if ($label) {
+	    printf "%s:",($GLOBALS{$label} or $label);
+	}
+    }
+
+    if ($line !~ m/^[#@]/) {
+	$line =~ s|^\s*(\.?)(\S+)\s*||;
+	my $c = $1; $c = "\t" if ($c eq "");
+	my $mnemonic = $2;
+	my $opcode;
+	if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
+	    $opcode = eval("\$$1_$2");
+	} else {
+	    $opcode = eval("\$$mnemonic");
+	}
+
+	my $arg=expand_line($line);
+
+	if (ref($opcode) eq 'CODE') {
+		$line = &$opcode($arg);
+	} elsif ($mnemonic)         {
+		$line = $c.$mnemonic;
+		$line.= "\t$arg" if ($arg);
+	}
+    }
+
+    print $line if ($line);
+    print "\n";
+}
+
+close STDOUT;
diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
index a3edd982b6..0f46cf06bc 100755
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@@ -27,7 +27,8 @@ my $globl = sub {
 	/osx/		&& do { $name = "_$name";
 				last;
 			      };
-	/linux.*32/	&& do {	$ret .= ".globl	$name\n";
+	/linux.*(32|64le)/
+			&& do {	$ret .= ".globl	$name\n";
 				$ret .= ".type	$name,\@function";
 				last;
 			      };
@@ -37,7 +38,6 @@ my $globl = sub {
 				$ret .= ".align	3\n";
 				$ret .= "$name:\n";
 				$ret .= ".quad	.$name,.TOC.\@tocbase,0\n";
-				$ret .= ".size	$name,24\n";
 				$ret .= ".previous\n";
 
 				$name = ".$name";
@@ -50,7 +50,9 @@ my $globl = sub {
     $ret;
 };
 my $text = sub {
-    ($flavour =~ /aix/) ? ".csect" : ".text";
+    my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text";
+    $ret = ".abiversion	2\n".$ret	if ($flavour =~ /linux.*64le/);
+    $ret;
 };
 my $machine = sub {
     my $junk = shift;
@@ -62,9 +64,12 @@ my $machine = sub {
     ".machine	$arch";
 };
 my $size = sub {
-    if ($flavour =~ /linux.*32/)
+    if ($flavour =~ /linux/)
     {	shift;
-	".size	" . join(",",@_);
+	my $name = shift; $name =~ s|^[\.\_]||;
+	my $ret  = ".size	$name,.-".($flavour=~/64$/?".":"").$name;
+	$ret .= "\n.size	.$name,.-.$name" if ($flavour=~/64$/);
+	$ret;
     }
     else
     {	"";	}
@@ -77,6 +82,25 @@ my $asciz = sub {
     else
     {	"";	}
 };
+my $quad = sub {
+    shift;
+    my @ret;
+    my ($hi,$lo);
+    for (@_) {
+	if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
+	{  $hi=$1?"0x$1":"0"; $lo="0x$2";  }
+	elsif (/^([0-9]+)$/o)
+	{  $hi=$1>>32; $lo=$1&0xffffffff;  } # error-prone with 32-bit perl
+	else
+	{  $hi=undef; $lo=$_; }
+
+	if (defined($hi))
+	{  push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo");  }
+	else
+	{  push(@ret,".quad	$lo");  }
+    }
+    join("\n",@ret);
+};
 
 ################################################################
 # simplified mnemonics not handled by at least one assembler
@@ -122,6 +146,66 @@ my $extrdi = sub {
     $b = ($b+$n)&63; $n = 64-$n;
     "	rldicl	$ra,$rs,$b,$n";
 };
+my $vmr = sub {
+    my ($f,$vx,$vy) = @_;
+    "	vor	$vx,$vy,$vy";
+};
+
+# Some ABIs specify vrsave, special-purpose register #256, as reserved
+# for system use.
+my $no_vrsave = ($flavour =~ /aix|linux64le/);
+my $mtspr = sub {
+    my ($f,$idx,$ra) = @_;
+    if ($idx == 256 && $no_vrsave) {
+	"	or	$ra,$ra,$ra";
+    } else {
+	"	mtspr	$idx,$ra";
+    }
+};
+my $mfspr = sub {
+    my ($f,$rd,$idx) = @_;
+    if ($idx == 256 && $no_vrsave) {
+	"	li	$rd,-1";
+    } else {
+	"	mfspr	$rd,$idx";
+    }
+};
+
+# PowerISA 2.06 stuff
+sub vsxmem_op {
+    my ($f, $vrt, $ra, $rb, $op) = @_;
+    "	.long	".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1);
+}
+# made-up unaligned memory reference AltiVec/VMX instructions
+my $lvx_u	= sub {	vsxmem_op(@_, 844); };	# lxvd2x
+my $stvx_u	= sub {	vsxmem_op(@_, 972); };	# stxvd2x
+my $lvdx_u	= sub {	vsxmem_op(@_, 588); };	# lxsdx
+my $stvdx_u	= sub {	vsxmem_op(@_, 716); };	# stxsdx
+my $lvx_4w	= sub { vsxmem_op(@_, 780); };	# lxvw4x
+my $stvx_4w	= sub { vsxmem_op(@_, 908); };	# stxvw4x
+
+# PowerISA 2.07 stuff
+sub vcrypto_op {
+    my ($f, $vrt, $vra, $vrb, $op) = @_;
+    "	.long	".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
+}
+my $vcipher	= sub { vcrypto_op(@_, 1288); };
+my $vcipherlast	= sub { vcrypto_op(@_, 1289); };
+my $vncipher	= sub { vcrypto_op(@_, 1352); };
+my $vncipherlast= sub { vcrypto_op(@_, 1353); };
+my $vsbox	= sub { vcrypto_op(@_, 0, 1480); };
+my $vshasigmad	= sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
+my $vshasigmaw	= sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
+my $vpmsumb	= sub { vcrypto_op(@_, 1032); };
+my $vpmsumd	= sub { vcrypto_op(@_, 1224); };
+my $vpmsubh	= sub { vcrypto_op(@_, 1096); };
+my $vpmsumw	= sub { vcrypto_op(@_, 1160); };
+my $vaddudm	= sub { vcrypto_op(@_, 192);  };
+
+my $mtsle	= sub {
+    my ($f, $arg) = @_;
+    "	.long	".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
+};
 
 while($line=<>) {
 
@@ -138,7 +222,10 @@ while($line=<>) {
     {
 	$line =~ s|(^[\.\w]+)\:\s*||;
 	my $label = $1;
-	printf "%s:",($GLOBALS{$label} or $label) if ($label);
+	if ($label) {
+	    printf "%s:",($GLOBALS{$label} or $label);
+	    printf "\n.localentry\t$GLOBALS{$label},0"	if ($GLOBALS{$label} && $flavour =~ /linux.*64le/);
+	}
     }
 
     {
@@ -147,7 +234,7 @@ while($line=<>) {
 	my $mnemonic = $2;
 	my $f = $3;
 	my $opcode = eval("\$$mnemonic");
-	$line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/);
+	$line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/);
 	if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
 	elsif ($mnemonic)           { $line = $c.$mnemonic.$f."\t".$line; }
     }
diff --git a/crypto/perlasm/x86gas.pl b/crypto/perlasm/x86gas.pl
index d0b7ae27ae..263182b985 100644
--- a/crypto/perlasm/x86gas.pl
+++ b/crypto/perlasm/x86gas.pl
@@ -45,10 +45,8 @@ sub ::generic
     undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
 
     if ($#_==0)				{ &::emit($opcode);		}
-    elsif ($opcode =~ m/^j/o && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode eq "call" && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode eq "clflush" && $#_==1){ &::emit($opcode,@arg);	}
-    elsif ($opcode =~ m/^set/&& $#_==1)	{ &::emit($opcode,@arg);	}
+    elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+					{ &::emit($opcode,@arg);	}
     else				{ &::emit($opcode.$suffix,@arg);}
 
   1;
diff --git a/crypto/ppccap.c b/crypto/ppccap.c
index ab89ccaa12..675630e41b 100644
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@@ -3,13 +3,24 @@
 #include <string.h>
 #include <setjmp.h>
 #include <signal.h>
+#include <unistd.h>
+#if defined(__linux) || defined(_AIX)
+# include <sys/utsname.h>
+#endif
+#if defined(_AIX53)     /* defined even on post-5.3 */
+# include <sys/systemcfg.h>
+# if !defined(__power_set)
+#  define __power_set(a) (_system_configuration.implementation & (a))
+# endif
+#endif
 #include <crypto.h>
 #include <openssl/bn.h>
 
 #define PPC_FPU64	(1<<0)
 #define PPC_ALTIVEC	(1<<1)
+#define PPC_CRYPTO207	(1<<2)
 
-static int OPENSSL_ppccap_P = 0;
+int OPENSSL_ppccap_P = 0;
 
 static sigset_t all_masked;
 
@@ -49,10 +60,28 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
 	}
 #endif
 
+void sha256_block_p8(void *ctx, const void *inp, size_t len);
+void sha256_block_ppc(void *ctx, const void *inp, size_t len);
+void sha256_block_data_order(void *ctx, const void *inp, size_t len)
+{
+    OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha256_block_p8(ctx, inp, len) :
+        sha256_block_ppc(ctx, inp, len);
+}
+
+void sha512_block_p8(void *ctx, const void *inp, size_t len);
+void sha512_block_ppc(void *ctx, const void *inp, size_t len);
+void sha512_block_data_order(void *ctx, const void *inp, size_t len)
+{
+    OPENSSL_ppccap_P & PPC_CRYPTO207 ? sha512_block_p8(ctx, inp, len) :
+        sha512_block_ppc(ctx, inp, len);
+}
+
 static sigjmp_buf ill_jmp;
 static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
 
 void OPENSSL_ppc64_probe(void);
+void OPENSSL_altivec_probe(void);
+void OPENSSL_crypto207_probe(void);
 
 void OPENSSL_cpuid_setup(void)
 	{
@@ -82,6 +111,45 @@ void OPENSSL_cpuid_setup(void)
 
 	OPENSSL_ppccap_P = 0;
 
+#if defined(_AIX)
+	if (sizeof(size_t) == 4) {
+		struct utsname uts;
+# if defined(_SC_AIX_KERNEL_BITMODE)
+		if (sysconf(_SC_AIX_KERNEL_BITMODE) != 64)
+			return;
+# endif
+		if (uname(&uts) != 0 || atoi(uts.version) < 6)
+			return;
+	}
+
+# if defined(__power_set)
+	/*
+	 * Value used in __power_set is a single-bit 1<<n one denoting
+	 * specific processor class. Incidentally 0xffffffff<<n can be
+	 * used to denote specific processor and its successors.
+	 */
+	if (sizeof(size_t) == 4) {
+		/* In 32-bit case PPC_FPU64 is always fastest [if option] */
+		if (__power_set(0xffffffffU<<13))       /* POWER5 and later */
+			OPENSSL_ppccap_P |= PPC_FPU64;
+	} else {
+		/* In 64-bit case PPC_FPU64 is fastest only on POWER6 */
+#  if 0		/* to keep compatibility with previous validations */
+		if (__power_set(0x1U<<14))              /* POWER6 */
+			OPENSSL_ppccap_P |= PPC_FPU64;
+#  endif
+	}
+
+	if (__power_set(0xffffffffU<<14))           /* POWER6 and later */
+		OPENSSL_ppccap_P |= PPC_ALTIVEC;
+
+	if (__power_set(0xffffffffU<<16))           /* POWER8 and later */
+		OPENSSL_ppccap_P |= PPC_CRYPTO207;
+
+	return;
+# endif
+#endif
+
 	memset(&ill_act,0,sizeof(ill_act));
 	ill_act.sa_handler = ill_handler;
 	ill_act.sa_mask    = all_masked;
@@ -108,6 +176,11 @@ void OPENSSL_cpuid_setup(void)
 		{
 		OPENSSL_altivec_probe();
 		OPENSSL_ppccap_P |= PPC_ALTIVEC;
+		if (sigsetjmp(ill_jmp, 1) == 0)
+			{
+			OPENSSL_crypto207_probe();
+			OPENSSL_ppccap_P |= PPC_CRYPTO207;
+			}
 		}
 
 	sigaction (SIGILL,&ill_oact,NULL);
diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
index 3bdfff39d8..00b4b0913a 100755
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@@ -40,6 +40,16 @@ $code=<<___;
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0
 
+.globl	.OPENSSL_crypto207_probe
+.align	4
+.OPENSSL_crypto207_probe:
+	.long	0x7C000E99	# lvx_u		v0,0,r1
+	.long	0x10000508	# vcipher	v0,v0,v0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+.size	.OPENSSL_crypto207_probe,.-.OPENSSL_crypto207_probe
+
 .globl	.OPENSSL_wipe_cpu
 .align	4
 .OPENSSL_wipe_cpu:
diff --git a/crypto/rsa/rsa_eay.c b/crypto/rsa/rsa_eay.c
index 16f000ff48..64c23f7cdb 100644
--- a/crypto/rsa/rsa_eay.c
+++ b/crypto/rsa/rsa_eay.c
@@ -494,7 +494,7 @@ static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
 	if (padding == RSA_X931_PADDING)
 		{
 		BN_sub(f, rsa->n, ret);
-		if (BN_cmp(ret, f))
+		if (BN_cmp(ret, f) > 0)
 			res = f;
 		else
 			res = ret;
diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
index b1582f2cff..78c9315c31 100644
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile
@@ -73,6 +73,8 @@ sha512-sparcv9.s:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAG
 sha1-ppc.s:	asm/sha1-ppc.pl;	$(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
 sha256-ppc.s:	asm/sha512-ppc.pl;	$(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
 sha512-ppc.s:	asm/sha512-ppc.pl;	$(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
+sha256p8-ppc.s:	asm/sha512p8-ppc.pl;	$(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
+sha512p8-ppc.s:	asm/sha512p8-ppc.pl;	$(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
 
 sha1-parisc.s:	asm/sha1-parisc.pl;	$(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@
 sha256-parisc.s:asm/sha512-parisc.pl;	$(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@
@@ -90,6 +92,9 @@ sha512-%.S:	asm/sha512-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 sha1-armv4-large.o:	sha1-armv4-large.S
 sha256-armv4.o:		sha256-armv4.S
 sha512-armv4.o:		sha512-armv4.S
+sha1-armv8.o:		sha1-armv8.S
+sha256-armv8.o:		sha256-armv8.S
+sha512-armv8.o:		sha512-armv8.S
 
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index fe8207f77f..6c0adb9911 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -52,8 +52,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 10%
 # improvement on Cortex A8 core and 12.2 cycles per byte.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $ctx="r0";
 $inp="r1";
@@ -177,6 +189,7 @@ for($i=0;$i<5;$i++) {
 $code.=<<___;
 	teq	$Xi,sp
 	bne	.L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
 ___
 	&BODY_00_15(@V);	unshift(@V,pop(@V));
 	&BODY_16_19(@V);	unshift(@V,pop(@V));
@@ -186,7 +199,6 @@ ___
 $code.=<<___;
 
 	ldr	$K,.LK_20_39		@ [+15+16*4]
-	sub	sp,sp,#25*4
 	cmn	sp,#0			@ [+3], clear carry to denote 20_39
 .L_20_39_or_60_79:
 ___
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
new file mode 100644
index 0000000000..6be8624342
--- /dev/null
+++ b/crypto/sha/asm/sha1-armv8.pl
@@ -0,0 +1,343 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		hardware-assisted	software(*)
+# Apple A7	2.31			4.13 (+14%)
+# Cortex-A53	2.19			8.73 (+108%)
+# Cortex-A57	2.35			7.88 (+74%)
+#
+# (*)	Software results are presented mostly for reference purposes.
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($ctx,$inp,$num)=("x0","x1","x2");
+@Xw=map("w$_",(3..17,19));
+@Xx=map("x$_",(3..17,19));
+@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
+($t0,$t1,$t2,$K)=map("w$_",(25..28));
+
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i<15 && !($i&1));
+	lsr	@Xx[$i+1],@Xx[$i],#32
+___
+$code.=<<___ if ($i<14 && !($i&1));
+	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`]
+___
+$code.=<<___ if ($i<14 && ($i&1));
+#ifdef	__ARMEB__
+	ror	@Xx[$i+1],@Xx[$i+1],#32
+#else
+	rev32	@Xx[$i+1],@Xx[$i+1]
+#endif
+___
+$code.=<<___ if ($i<14);
+	bic	$t0,$d,$b
+	and	$t1,$c,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	orr	$t0,$t0,$t1
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+$code.=<<___ if ($i==19);
+	movz	$K,#0xeba1
+	movk	$K,#0x6ed9,lsl#16
+___
+$code.=<<___ if ($i>=14);
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	bic	$t0,$d,$b
+	and	$t1,$c,$b
+	ror	$t2,$a,#27
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	add	$d,$d,$K		// future e+=K
+	orr	$t0,$t0,$t1
+	add	$e,$e,$t2		// e+=rot(a,5)
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==59);
+	movz	$K,#0xc1d6
+	movk	$K,#0xca62,lsl#16
+___
+$code.=<<___;
+	orr	$t0,$b,$c
+	and	$t1,$b,$c
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	ror	$t2,$a,#27
+	and	$t0,$t0,$d
+	add	$d,$d,$K		// future e+=K
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	add	$e,$e,$t2		// e+=rot(a,5)
+	orr	$t0,$t0,$t1
+	ror	$b,$b,#2
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+$code.=<<___ if ($i==39);
+	movz	$K,#0xbcdc
+	movk	$K,#0x8f1b,lsl#16
+___
+$code.=<<___ if ($i<78);
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+$code.=<<___ if ($i==78);
+	ldp	@Xw[1],@Xw[2],[$ctx]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+$code.=<<___ if ($i==79);
+	ldp	@Xw[3],@Xw[4],[$ctx,#8]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	ldr	@Xw[5],[$ctx,#16]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,%function
+.align	6
+sha1_block_data_order:
+	ldr	x16,.LOPENSSL_armcap_P
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA1
+	b.ne	.Lv8_entry
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	$A,$B,[$ctx]
+	ldp	$C,$D,[$ctx,#8]
+	ldr	$E,[$ctx,#16]
+
+.Loop:
+	ldr	@Xx[0],[$inp],#64
+	movz	$K,#0x7999
+	sub	$num,$num,#1
+	movk	$K,#0x5a82,lsl#16
+#ifdef	__ARMEB__
+	ror	$Xx[0],@Xx[0],#32
+#else
+	rev32	@Xx[0],@Xx[0]
+#endif
+	add	$E,$E,$K		// warm it up
+	add	$E,$E,@Xw[0]
+___
+for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	add	$B,$B,@Xw[2]
+	add	$C,$C,@Xw[3]
+	add	$A,$A,@Xw[1]
+	add	$D,$D,@Xw[4]
+	add	$E,$E,@Xw[5]
+	stp	$A,$B,[$ctx]
+	stp	$C,$D,[$ctx,#8]
+	str	$E,[$ctx,#16]
+	cbnz	$num,.Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
+___
+{{{
+my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
+my @MSG=map("v$_.16b",(4..7));
+my @Kxx=map("v$_.4s",(16..19));
+my ($W0,$W1)=("v20.4s","v21.4s");
+my $ABCD_SAVE="v22.16b";
+
+$code.=<<___;
+.type	sha1_block_armv8,%function
+.align	6
+sha1_block_armv8:
+.Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adr	x4,.Lconst
+	eor	$E,$E,$E
+	ld1.32	{$ABCD},[$ctx],#16
+	ld1.32	{$E}[0],[$ctx]
+	sub	$ctx,$ctx,#16
+	ld1.32	{@Kxx[0]-@Kxx[3]},[x4]
+
+.Loop_hw:
+	ld1	{@MSG[0]-@MSG[3]},[$inp],#64
+	sub	$num,$num,#1
+	rev32	@MSG[0],@MSG[0]
+	rev32	@MSG[1],@MSG[1]
+
+	add.i32	$W0,@Kxx[0],@MSG[0]
+	rev32	@MSG[2],@MSG[2]
+	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload
+
+	add.i32	$W1,@Kxx[0],@MSG[1]
+	rev32	@MSG[3],@MSG[3]
+	sha1h	$E1,$ABCD
+	sha1c	$ABCD,$E,$W0		// 0
+	add.i32	$W0,@Kxx[$j],@MSG[2]
+	sha1su0	@MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+	sha1h	$E0,$ABCD		// $i
+	sha1$f	$ABCD,$E1,$W1
+	add.i32	$W1,@Kxx[$j],@MSG[3]
+	sha1su1	@MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+	sha1su0	@MSG[1],@MSG[2],@MSG[3]
+___
+	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0);
+	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+	sha1h	$E0,$ABCD		// $i
+	sha1p	$ABCD,$E1,$W1
+	add.i32	$W1,@Kxx[$j],@MSG[3]
+
+	sha1h	$E1,$ABCD		// 18
+	sha1p	$ABCD,$E0,$W0
+
+	sha1h	$E0,$ABCD		// 19
+	sha1p	$ABCD,$E1,$W1
+
+	add.i32	$E,$E,$E0
+	add.i32	$ABCD,$ABCD,$ABCD_SAVE
+
+	cbnz	$num,.Loop_hw
+
+	st1.32	{$ABCD},[$ctx],#16
+	st1.32	{$E}[0],[$ctx]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha1_block_armv8,.-sha1_block_armv8
+.align	6
+.Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.LOPENSSL_armcap_P:
+.quad	OPENSSL_armcap_P-.
+.asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+.comm	OPENSSL_armcap_P,4,4
+___
+}}}
+
+{   my	%opcode = (
+	"sha1c"		=> 0x5e000000,	"sha1p"		=> 0x5e001000,
+	"sha1m"		=> 0x5e002000,	"sha1su0"	=> 0x5e003000,
+	"sha1h"		=> 0x5e280800,	"sha1su1"	=> 0x5e281800	);
+
+    sub unsha1 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+			$mnemonic,$arg;
+    }
+}
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
+
+	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
+	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64x-large.pl b/crypto/sha/asm/sha1-c64x-large.pl
new file mode 100644
index 0000000000..3916ff3a3f
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x-large.pl
@@ -0,0 +1,230 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# This is fully-unrolled SHA1 implementation. It's 25% faster than
+# one with compact loops, doesn't use in-memory ring buffer, as
+# everything is accomodated in registers, and has "perfect" interrupt
+# agility. Drawback is obviously the code size...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24));
+@V = ($A,$B,$C,$D,$E);
+@X = map("B$_",(16..31));
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___				if ($i<14);
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 LDNW	*${INP}++,@X[$i+2]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	 SWAP2	@X[$i+1],@X[$i+1]
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 SWAP4	@X[$i+1],@X[$i+1]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i==14);
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+||	 SWAP2	@X[$i+1],@X[$i+1]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 SWAP4	@X[$i+1],@X[$i+1]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i==15);
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i>15);
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___				if ($i<79);
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	XOR	$c,$b,$F
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	XOR	$d,$F,$F		; F_20_39(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+___
+$code.=<<___				if ($i==79);
+|| [A0]	B	loop?
+|| [A0]	LDNW	*${INP}++,@X[0]		; pre-fetch input
+	ROTL	$a,5,$Arot		;; $i
+||	XOR	$c,$b,$F
+||	ADD	$K,$e,$e		; E+=K
+|| [A0]	LDNW	*${INP}++,@X[1]
+	XOR	$d,$F,$F		; F_20_39(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+||	ADD	$Bctx,$a,$a		; accumulate context
+||	ADD	$Cctx,$b,$b
+	ADD	$Dctx,$c,$c
+||	ADD	$Ectx,$d,$d
+||	ADD	$Actx,$e,$e
+;;===== branch to loop? is taken here
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___;
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	AND	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	XOR	$F0,$F,$F
+||	AND	$c,$d,$F0
+||	ROTL	$b,30,$b
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+___
+}
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	sha1_block_data_order,_sha1_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc
+	MV	$NUM,A0			; reassign $NUM
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	LDW	*${CTX}[0],$A		; load A-E...
+   [A0]	LDW	*${CTX}[1],$B
+   [A0]	LDW	*${CTX}[2],$C
+   [A0]	LDW	*${CTX}[3],$D
+   [A0]	LDW	*${CTX}[4],$E
+   [A0]	LDNW	*${INP}++,@X[0]		; pre-fetch input
+   [A0]	LDNW	*${INP}++,@X[1]
+	NOP	3
+
+loop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MVD	$B,$Bctx
+||	SWAP2	@X[0],@X[0]
+||	MVKL	0x5a827999,$K
+	MVKH	0x5a827999,$K		; K_00_19
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MVD	$E,$Ectx
+||	SWAP4	@X[0],@X[0]
+___
+for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0x6ed9eba1,$K
+	MVKH	0x6ed9eba1,$K		; K_20_39
+___
+for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0x8f1bbcdc,$K
+	MVKH	0x8f1bbcdc,$K		; K_40_59
+___
+for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0xca62c1d6,$K
+	MVKH	0xca62c1d6,$K		; K_60_79
+___
+for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	BNOP	RA			; return
+	STW	$A,*${CTX}[0]		; emit A-E...
+	STW	$B,*${CTX}[1]
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64x.pl b/crypto/sha/asm/sha1-c64x.pl
new file mode 100644
index 0000000000..d7a9dd1d05
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64x.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Unlike its predecessor, sha1-c64xplus module, this module has worse
+# interrupt agility. While original added up to 5 cycles delay to
+# response to interrupt, this module adds up to 100. Fully unrolled
+# implementation doesn't add any delay and even 25% faster, but is
+# almost 5x larger...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
+($XPA,$XPB) = ("A5","B5");			# X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	sha1_block_data_order,_sha1_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0			; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	LDW	*${CTX}[0],$A		; load A-E...
+|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
+   [A0]	LDW	*${CTX}[1],$B
+|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
+   [A0]	LDW	*${CTX}[2],$C
+|| [A0]	MVK	0x00404,B0
+   [A0]	LDW	*${CTX}[3],$D
+|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
+   [A0]	LDW	*${CTX}[4],$E
+|| [A0]	MVC	B0,AMR			; setup circular addressing
+	LDNW	*${INP}++,$TX1		; pre-fetch input
+	NOP	1
+
+loop?:
+	MVKL	0x5a827999,$K
+||	ADDAW	SP,2,$XPB
+||	SUB	A0,1,A0
+	MVKH	0x5a827999,$K		; K_00_19
+||	MV	$A,$Actx
+||	MV	$B,$Bctx
+;;==================================================
+	B	body_00_13?		; BODY_00_13
+||	MVK	11,B0
+||	MV	$XPB,$XPA
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MVD	$E,$Ectx
+
+body_00_13?:
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX3		; byte swap
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+
+	ADD	$TX3,$T,$A		; A=T+Xi
+||	STW	$TX3,*${XPB}++
+||	BDEC	body_00_13?,B0
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_14
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
+||	LDW	*${XPB}[4],$X2		; 2 iterations ahead
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_15
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+;;==================================================
+||	B	body_16_19?		; BODY_16_19
+||	MVK	1,B0
+
+body_16_19?:
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	BDEC	body_16_19?,B0
+
+	MVKL	0x6ed9eba1,$K
+||	MVK	17,B0
+	MVKH	0x6ed9eba1,$K		; K_20_39
+___
+sub BODY_20_39 {
+my $label = shift;
+$code.=<<___;
+;;==================================================
+||	B	$label			; BODY_20_39
+
+$label:
+	ROTL	$A,5,$Arot
+||	XOR	$B,$C,$F
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$D,$F,$F		; F_20_39(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++		; last one is redundant
+||	XOR	$TX0,$TX1,$TX1
+||	BDEC	$label,B0
+___
+}	&BODY_20_39("body_20_39?");
+$code.=<<___;
+;;==================================================
+	MVKL	0x8f1bbcdc,$K
+||	MVK	17,B0
+	MVKH	0x8f1bbcdc,$K		; K_40_59
+||	B	body_40_59?		; BODY_40_59
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+
+body_40_59?:
+	ROTL	$A,5,$Arot
+||	XOR	$F0,$F,$F
+||	AND	$C,$D,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+||	BDEC	body_40_59?,B0
+
+	MVKL	0xca62c1d6,$K
+||	MVK	16,B0
+	MVKH	0xca62c1d6,$K		; K_60_79
+___
+	&BODY_20_39("body_60_78?");	# BODY_60_78
+$code.=<<___;
+;;==================================================
+   [A0]	B	loop?
+||	ROTL	$A,5,$Arot		; BODY_79
+||	XOR	$B,$C,$F
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
+||	ADD	$K,$E,$T		; T=E+K
+||	XOR	$D,$F,$F		; F_20_39(B,C,D)
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
+||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+
+	ADD	$Actx,$A,$A		; A+=Actx
+||	ADD	$Cctx,$C,$C		; C+=Cctx
+;; end of loop?
+
+	BNOP	RA			; return
+||	MV	FP,SP			; restore stack pointer
+||	LDW	*FP[0],FP		; restore frame pointer
+	STW	$A,*${CTX}[0]		; emit A-E...
+||	MVK	0,B0
+	STW	$B,*${CTX}[1]
+||	MVC	B0,AMR			; clear AMR
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl
new file mode 100644
index 0000000000..87000d1e8f
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64xplus.pl
@@ -0,0 +1,323 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x+.
+#
+# November 2011
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Fully unrolled assembler would be ~5x larger and is likely to be
+# ~15% faster. It would be free from references to intermediate ring
+# buffer, but put more pressure on L1P [both because the code would be
+# larger and won't be using SPLOOP buffer]. There are no plans to
+# realize fully unrolled variant though...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
+($XPA,$XPB) = ("A5","B5");			# X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+$code=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0			; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	LDW	*${CTX}[0],$A		; load A-E...
+|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
+   [A0]	LDW	*${CTX}[1],$B
+|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
+   [A0]	LDW	*${CTX}[2],$C
+|| [A0]	MVK	0x00404,B0
+   [A0]	LDW	*${CTX}[3],$D
+|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
+   [A0]	LDW	*${CTX}[4],$E
+|| [A0]	MVC	B0,AMR			; setup circular addressing
+	LDNW	*${INP}++,$TX1		; pre-fetch input
+	NOP	1
+
+loop?:
+	MVK	0x00007999,$K
+||	ADDAW	SP,2,$XPA
+||	SUB	A0,1,A0
+||	MVK	13,B0
+	MVKH	0x5a820000,$K		; K_00_19
+||	ADDAW	SP,2,$XPB
+||	MV	$A,$Actx
+||	MV	$B,$Bctx
+;;==================================================
+	SPLOOPD	5			; BODY_00_13
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MV	$E,$Ectx
+||	MVC	B0,ILC
+
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX3		; byte swap
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+
+	ADD	$TX3,$T,$A		; A=T+Xi
+||	STW	$TX3,*${XPB}++
+	SPKERNEL
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_14
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
+||	LDW	*${XPB}[4],$X2		; 2 iterations ahead
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_15
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	MVK	3,B0
+;;==================================================
+	SPLOOPD	5			; BODY_16_19
+||	MVC	B0,ILC
+
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+	SPKERNEL
+
+	MVK	0xffffeba1,$K
+||	MVK	19,B0
+	MVKH	0x6ed90000,$K		; K_20_39
+___
+sub BODY_20_39 {
+$code.=<<___;
+;;==================================================
+	SPLOOPD	5			; BODY_20_39
+||	MVC	B0,ILC
+
+	ROTL	$A,5,$Arot
+||	XOR	$B,$C,$F
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$D,$F,$F		; F_20_39(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++		; last one is redundant
+||	XOR	$TX0,$TX1,$TX1
+	SPKERNEL
+___
+$code.=<<___ if (!shift);
+	MVK	0xffffbcdc,$K
+	MVKH	0x8f1b0000,$K		; K_40_59
+___
+}	&BODY_20_39();
+$code.=<<___;
+;;==================================================
+	SPLOOPD	5			; BODY_40_59
+||	MVC	B0,ILC
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+
+	ROTL	$A,5,$Arot
+||	XOR	$F0,$F,$F
+||	AND	$C,$D,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+	SPKERNEL
+
+	MVK	0xffffc1d6,$K
+||	MVK	18,B0
+	MVKH	0xca620000,$K		; K_60_79
+___
+	&BODY_20_39(-1);		# BODY_60_78
+$code.=<<___;
+;;==================================================
+   [A0]	B	loop?
+||	ROTL	$A,5,$Arot		; BODY_79
+||	XOR	$B,$C,$F
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
+||	ADD	$K,$E,$T		; T=E+K
+||	XOR	$D,$F,$F		; F_20_39(B,C,D)
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
+||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+
+	ADD	$Actx,$A,$A		; A+=Actx
+||	ADD	$Cctx,$C,$C		; C+=Cctx
+;; end of loop?
+
+	BNOP	RA			; return
+||	MV	FP,SP			; restore stack pointer
+||	LDW	*FP[0],FP		; restore frame pointer
+	STW	$A,*${CTX}[0]		; emit A-E...
+||	MVK	0,B0
+	STW	$B,*${CTX}[1]
+||	MVC	B0,AMR			; clear AMR
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl
index f1a702f38f..ca50e1b1ee 100644
--- a/crypto/sha/asm/sha1-mips.pl
+++ b/crypto/sha/asm/sha1-mips.pl
@@ -42,7 +42,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
 
 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -64,7 +64,7 @@ if ($flavour =~ /64|n32/i) {
 #
 ######################################################################
 
-$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;
 
 for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);   }
 open STDOUT,">$output";
diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl
index 2140dd2f8d..7b86e2bc11 100755
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@@ -210,7 +210,7 @@ Lunaligned:
 	srwi.	$t1,$t1,6	; t1/=64
 	beq	Lcross_page
 	$UCMP	$num,$t1
-	ble-	Laligned	; didn't cross the page boundary
+	ble	Laligned	; didn't cross the page boundary
 	mtctr	$t1
 	subfc	$num,$t1,$num
 	bl	Lsha1_block_private
@@ -238,7 +238,7 @@ Lmemcpy:
 	bl	Lsha1_block_private
 	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
 	addic.	$num,$num,-1
-	bne-	Lunaligned
+	bne	Lunaligned
 
 Ldone:
 	$POP	r0,`$FRAME+$LRSAVE`($sp)
@@ -312,7 +312,7 @@ $code.=<<___;
 	stw	r20,16($ctx)
 	mr	$E,r20
 	addi	$inp,$inp,`16*4`
-	bdnz-	Lsha1_block_private
+	bdnz	Lsha1_block_private
 	blr
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 9c84e8d93c..252a583d06 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -23,8 +23,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 16%
 # improvement on Cortex A8 core and ~17 cycles per processed byte.
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $ctx="r0";	$t0="r0";
 $inp="r1";	$t3="r1";
diff --git a/crypto/sha/asm/sha256-c64x.pl b/crypto/sha/asm/sha256-c64x.pl
new file mode 100644
index 0000000000..fbe99c0b7f
--- /dev/null
+++ b/crypto/sha/asm/sha256-c64x.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x.
+#
+# November 2016
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K256="A3";
+
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+	=map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+	=map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5");			# circular/ring buffer
+ $CTXB=$t2e;
+
+($Xn,$X0,$K)=("B7","B8","B9");
+($Maj,$Ch)=($T2,"B6");
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	sha256_block_data_order,_sha256_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	SWAP2,MV
+	.asg	SWAP4,MV
+	.endif
+
+	.global	_sha256_block_data_order
+_sha256_block_data_order:
+__sha256_block:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	ADDKPC	_sha256_block_data_order,B2
+|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
+	.if	__TI_EABI__
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
+	.else
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	(K256-__sha256_block),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	(K256-__sha256_block),$K256
+	.endif
+   [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	MV	SP,$Xia
+   [A0]	MV	SP,$Xib
+|| [A0]	ADD	B2,$K256,$K256
+|| [A0]	MV	$CTXA,$CTXB
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	LDW	*${CTXA}[0],$A			; load ctx
+||	LDW	*${CTXB}[4],$E
+	LDW	*${CTXA}[1],$B
+||	LDW	*${CTXB}[5],$F
+	LDW	*${CTXA}[2],$C
+||	LDW	*${CTXB}[6],$G
+	LDW	*${CTXA}[3],$D
+||	LDW	*${CTXB}[7],$H
+
+	LDNW	*$INP++,$Xn			; pre-fetch input
+	LDW	*$K256++,$K			; pre-fetch K256[0]
+	NOP
+	ADDAW	$Xia,9,$Xia
+outerloop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MV	$E,$Ectx
+||	MVD	$B,$Bctx
+||	MVD	$F,$Fctx
+	MV	$C,$Cctx
+||	MV	$G,$Gctx
+||	MVD	$D,$Dctx
+||	MVD	$H,$Hctx
+||	SWAP4	$Xn,$X0
+
+	MVK	14,B0				; loop counter
+||	SWAP2	$X0,$X0
+
+loop_00_14?:					; BODY_00_14
+	LDNW	*$INP++,$Xn
+||	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+|| [B0]	BDEC	loop_00_14?,B0
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X14
+||	SWAP4	$Xn,$X0
+	SWAP2	$X0,$X0
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+	MV	$B,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+;;===== branch to loop00_14? is taken here
+
+	ROTL	$A,30,$S0			; BODY_15
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	LDW	*${Xib}[2],$X1			; modulo-scheduled
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X15
+	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; modulo-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+||	ROTL	$B,0,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+
+	MVK	47,B1				; loop counter
+||	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+
+loop_16_63?:					; BODY_16_63
+	XOR	$t0e,$s0,$s0
+||	XOR	$t0a,$s1,$s1
+||	MV	$X15,$X14
+||	MV	$X1,$Xn
+	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
+||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
+||	LDW	*${Xib}[2],$X1			; module-scheduled
+	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	ADD	$s0,$X0,$X0			; X[i] += sigma1(X[i+1])
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$H,$K,$T1			; T1 = h + K256[i]
+||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
+|| [B1]	BDEC	loop_16_63?,B1
+	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+||	ADD	$X0,$T1,$T1			; T1 += X[i]
+||	STW	$X0,*$Xib++
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+||	MV	$X0,$X15
+||	ROTL	$G,0,$H				; h = g
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	MV	$F,$G				; g = f
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*++$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; module-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$B,$C				; c = b
+	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+||	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+;;===== branch to loop16_63? is taken here
+
+   [A0]	B	outerloop?
+|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
+|| [A0]	ADDK	-260,$K256			; rewind K256
+||	ADD	$Actx,$A,$A			; accumulate ctx
+||	ADD	$Ectx,$E,$E
+||	ADD	$Bctx,$B,$B
+	ADD	$Fctx,$F,$F
+||	ADD	$Cctx,$C,$C
+||	ADD	$Gctx,$G,$G
+||	ADD	$Dctx,$D,$D
+||	ADD	$Hctx,$H,$H
+|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]
+
+  [!A0]	BNOP	RA
+||[!A0]	MV	$CTXA,$CTXB
+  [!A0]	MV	FP,SP				; restore stack pointer
+||[!A0]	LDW	*FP[0],FP			; restore frame pointer
+  [!A0]	STW	$A,*${CTXA}[0]  		; save ctx
+||[!A0]	STW	$E,*${CTXB}[4]
+||[!A0]	MVK	0,B0
+  [!A0]	STW	$B,*${CTXA}[1]
+||[!A0]	STW	$F,*${CTXB}[5]
+||[!A0]	MVC	B0,AMR				; clear AMR
+	STW	$C,*${CTXA}[2]
+||	STW	$G,*${CTXB}[6]
+	STW	$D,*${CTXA}[3]
+||	STW	$H,*${CTXB}[7]
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
+	.sect	".const:sha_asm"
+	.endif
+	.align	128
+K256:
+	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+	.cstring "SHA256 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+
+___
+
+print $code;
diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl
new file mode 100644
index 0000000000..8b92c84555
--- /dev/null
+++ b/crypto/sha/asm/sha256-c64xplus.pl
@@ -0,0 +1,292 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x+.
+#
+# January 2012
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K256="A3";
+
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+	=map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+	=map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5");			# circular/ring buffer
+ $CTXB=$t2e;
+
+($Xn,$X0,$K)=("B7","B8","B9");
+($Maj,$Ch)=($T2,"B6");
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	SWAP2,MV
+	.asg	SWAP4,MV
+	.endif
+
+	.global	_sha256_block_data_order
+_sha256_block_data_order:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	ADDKPC	_sha256_block_data_order,B2
+|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
+   [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	MV	SP,$Xia
+   [A0]	MV	SP,$Xib
+|| [A0]	ADD	B2,$K256,$K256
+|| [A0]	MV	$CTXA,$CTXB
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	LDW	*${CTXA}[0],$A			; load ctx
+||	LDW	*${CTXB}[4],$E
+	LDW	*${CTXA}[1],$B
+||	LDW	*${CTXB}[5],$F
+	LDW	*${CTXA}[2],$C
+||	LDW	*${CTXB}[6],$G
+	LDW	*${CTXA}[3],$D
+||	LDW	*${CTXB}[7],$H
+
+	LDNW	*$INP++,$Xn			; pre-fetch input
+	LDW	*$K256++,$K			; pre-fetch K256[0]
+	MVK	14,B0				; loop counters
+	MVK	47,B1
+||	ADDAW	$Xia,9,$Xia
+outerloop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MV	$E,$Ectx
+||	MVD	$B,$Bctx
+||	MVD	$F,$Fctx
+	MV	$C,$Cctx
+||	MV	$G,$Gctx
+||	MVD	$D,$Dctx
+||	MVD	$H,$Hctx
+||	SWAP4	$Xn,$X0
+
+	SPLOOPD	8				; BODY_00_14
+||	MVC	B0,ILC
+||	SWAP2	$X0,$X0
+
+	LDNW	*$INP++,$Xn
+||	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X14
+||	SWAP4	$Xn,$X0
+	SWAP2	$X0,$X0
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+	MV	$B,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+	SPKERNEL
+
+	ROTL	$A,30,$S0			; BODY_15
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	LDW	*${Xib}[2],$X1			; modulo-scheduled
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X15
+	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; modulo-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+||	ROTL	$B,0,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+
+	SPLOOPD	10				; BODY_16_63
+||	MVC	B1,ILC
+||	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+
+	XOR	$t0e,$s0,$s0
+||	XOR	$t0a,$s1,$s1
+||	MV	$X15,$X14
+||	MV	$X1,$Xn
+	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
+||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
+||	LDW	*${Xib}[2],$X1			; module-scheduled
+	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	ADD	$s0,$X0,$X0			; X[i] += sigma1(X[i+1])
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$H,$K,$T1			; T1 = h + K256[i]
+||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
+	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+||	ADD	$X0,$T1,$T1			; T1 += X[i]
+||	STW	$X0,*$Xib++
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+||	MV	$X0,$X15
+||	ROTL	$G,0,$H				; h = g
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	MV	$F,$G				; g = f
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*++$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; module-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$B,$C				; c = b
+	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+||	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+	SPKERNEL
+
+   [A0]	B	outerloop?
+|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
+|| [A0]	ADDK	-260,$K256			; rewind K256
+||	ADD	$Actx,$A,$A			; accumulate ctx
+||	ADD	$Ectx,$E,$E
+||	ADD	$Bctx,$B,$B
+	ADD	$Fctx,$F,$F
+||	ADD	$Cctx,$C,$C
+||	ADD	$Gctx,$G,$G
+||	ADD	$Dctx,$D,$D
+||	ADD	$Hctx,$H,$H
+|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]
+
+  [!A0]	BNOP	RA
+||[!A0]	MV	$CTXA,$CTXB
+  [!A0]	MV	FP,SP				; restore stack pointer
+||[!A0]	LDW	*FP[0],FP			; restore frame pointer
+  [!A0]	STW	$A,*${CTXA}[0]  		; save ctx
+||[!A0]	STW	$E,*${CTXB}[4]
+||[!A0]	MVK	0,B0
+  [!A0]	STW	$B,*${CTXA}[1]
+||[!A0]	STW	$F,*${CTXB}[5]
+||[!A0]	MVC	B0,AMR				; clear AMR
+	STW	$C,*${CTXA}[2]
+||	STW	$G,*${CTXB}[6]
+	STW	$D,*${CTXA}[3]
+||	STW	$H,*${CTXB}[7]
+	.endasmfunc
+
+	.sect	".const:sha_asm"
+	.align	128
+K256:
+	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+
+___
+
+print $code;
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7faf37b147..c032afdbca 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -38,8 +38,20 @@ $hi="HI";
 $lo="LO";
 # ====================================================================
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $ctx="r0";	# parameter block
 $inp="r1";
@@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-sha512_block_data_order
+.word	OPENSSL_armcap_P-.Lsha512_block_data_order
 .skip	32-4
 
 .global	sha512_block_data_order
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
+.Lsha512_block_data_order:
 	sub	r3,pc,#8		@ sha512_block_data_order
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 #if __ARM_ARCH__>=7
 	ldr	r12,.LOPENSSL_armcap
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
 	tst	r12,#1
 	bne	.LNEON
 #endif
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
new file mode 100644
index 0000000000..45eb719fe5
--- /dev/null
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -0,0 +1,428 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		SHA256-hw	SHA256(*)	SHA512
+# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+# Cortex-A53	2.38		15.6 (+110%)	10.1 (+190%(***))
+# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+# 
+# (*)	Software SHA256 results are of lesser relevance, presented
+#	mostly for informational purposes.
+# (**)	The result is a trade-off: it's possible to improve it by
+#	10% (or by 1 cycle per round), but at the cost of 20% loss
+#	on Cortex-A53 (or by 4 cycles per round).
+# (***)	Super-impressive coefficients over gcc-generated code are
+#	indication of some compiler "pathology", most notably code
+#	generated with -mgeneral-regs-only is significanty faster
+#	and lags behind assembly only by 50-90%.
+
+$flavour=shift;
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($output =~ /512/) {
+	$BITS=512;
+	$SZ=8;
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=(1,  8, 7);
+	@sigma1=(19,61, 6);
+	$rounds=80;
+	$reg_t="x";
+} else {
+	$BITS=256;
+	$SZ=4;
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 7,18, 3);
+	@sigma1=(17,19,10);
+	$rounds=64;
+	$reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+   $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___	if ($i<16);
+#ifndef	__ARMEB__
+	rev	@X[$i],@X[$i]			// $i
+#endif
+___
+$code.=<<___	if ($i<13 && ($i&1));
+	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___	if ($i==13);
+	ldp	@X[14],@X[15],[$inp]
+___
+$code.=<<___	if ($i>=14);
+	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___	if ($i>0 && $i<16);
+	add	$a,$a,$t1			// h+=Sigma0(a)
+___
+$code.=<<___	if ($i>=11);
+	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-n-logical operation such as
+# 'eor x,y,z,ror#n', it was found to negatively affect performance
+# on Apple A7. The reason seems to be that it requires even 'y' to
+# be available earlier. This means that such merged instruction is
+# not necessarily best choice on critical path... On the other hand
+# Cortex-A5x handles merged instructions much better than disjoint
+# rotate and logical... See (**) footnote above.
+$code.=<<___	if ($i<15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+	and	$t1,$f,$e
+	bic	$t2,$g,$e
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	add	$d,$d,$h			// d+=h
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	//add	$h,$h,$t1			// h+=Sigma0(a)
+___
+$code.=<<___	if ($i>=15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	ror	$T1,@X[($j+1)&15],#$sigma0[0]
+	and	$t1,$f,$e
+	ror	$T2,@X[($j+14)&15],#$sigma1[0]
+	bic	$t2,$g,$e
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	eor	$t0,$t0,$e,ror#$Sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
+	eor	$T0,$T0,$a,ror#$Sigma0[1]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
+	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
+	add	@X[$j],@X[$j],@X[($j+9)&15]
+	add	$d,$d,$h			// d+=h
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	add	@X[$j],@X[$j],$T1
+	add	$h,$h,$t1			// h+=Sigma0(a)
+	add	@X[$j],@X[$j],$T2
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	$func
+.type	$func,%function
+.align	6
+$func:
+___
+$code.=<<___	if ($SZ==4);
+	ldr	x16,.LOPENSSL_armcap_P
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA256
+	b.ne	.Lv8_entry
+___
+$code.=<<___;
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*$SZ
+
+	ldp	$A,$B,[$ctx]				// load context
+	ldp	$C,$D,[$ctx,#2*$SZ]
+	ldp	$E,$F,[$ctx,#4*$SZ]
+	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
+	ldp	$G,$H,[$ctx,#6*$SZ]
+	adr	$Ktbl,.LK$BITS
+	stp	$ctx,$num,[x29,#96]
+
+.Loop:
+	ldp	@X[0],@X[1],[$inp],#2*$SZ
+	ldr	$t2,[$Ktbl],#$SZ			// *K++
+	eor	$t3,$B,$C				// magic seed
+	str	$inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	cbnz	$t2,.Loop_16_xx
+
+	ldp	$ctx,$num,[x29,#96]
+	ldr	$inp,[x29,#112]
+	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
+
+	ldp	@X[0],@X[1],[$ctx]
+	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
+	add	$inp,$inp,#14*$SZ			// advance input pointer
+	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
+	add	$A,$A,@X[0]
+	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
+	add	$B,$B,@X[1]
+	add	$C,$C,@X[2]
+	add	$D,$D,@X[3]
+	stp	$A,$B,[$ctx]
+	add	$E,$E,@X[4]
+	add	$F,$F,@X[5]
+	stp	$C,$D,[$ctx,#2*$SZ]
+	add	$G,$G,@X[6]
+	add	$H,$H,@X[7]
+	cmp	$inp,$num
+	stp	$E,$F,[$ctx,#4*$SZ]
+	stp	$G,$H,[$ctx,#6*$SZ]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*$SZ
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	ret
+.size	$func,.-$func
+
+.align	6
+.type	.LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	.quad	0	// terminator
+___
+$code.=<<___ if ($SZ==4);
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0	//terminator
+___
+$code.=<<___;
+.size	.LK$BITS,.-.LK$BITS
+.align	3
+.LOPENSSL_armcap_P:
+	.quad	OPENSSL_armcap_P-.
+.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+.type	sha256_block_armv8,%function
+.align	6
+sha256_block_armv8:
+.Lv8_entry:
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+
+	ld1.32		{$ABCD,$EFGH},[$ctx]
+	adr		$Ktbl,.LK256
+
+.Loop_hw:
+	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
+	sub		$num,$num,#1
+	ld1.32		{$W0},[$Ktbl],#16
+	rev32		@MSG[0],@MSG[0]
+	rev32		@MSG[1],@MSG[1]
+	rev32		@MSG[2],@MSG[2]
+	rev32		@MSG[3],@MSG[3]
+	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
+	orr		$EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	ld1.32		{$W0},[$Ktbl],#16
+	add.i32		$W1,$W1,@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	ld1.32		{$W1},[$Ktbl]
+	add.i32		$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	add.i32		$W1,$W1,@MSG[3]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	add.i32		$ABCD,$ABCD,$ABCD_SAVE
+	add.i32		$EFGH,$EFGH,$EFGH_SAVE
+
+	cbnz		$num,.Loop_hw
+
+	st1.32		{$ABCD,$EFGH},[$ctx]
+
+	ldr		x29,[sp],#16
+	ret
+.size	sha256_block_armv8,.-sha256_block_armv8
+___
+}
+
+$code.=<<___;
+.comm	OPENSSL_armcap_P,4,4
+___
+
+{   my  %opcode = (
+	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
+	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+			$mnemonic,$arg;
+    }
+}
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
+
+	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
+	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-c64x.pl b/crypto/sha/asm/sha512-c64x.pl
new file mode 100644
index 0000000000..e35a72ade5
--- /dev/null
+++ b/crypto/sha/asm/sha512-c64x.pl
@@ -0,0 +1,437 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x.
+#
+# November 2016
+#
+# Performance is ~19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unroll won't make it, this implementation, any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K512="A3";
+
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
+($T1hi,         $T2hi)=         ("A6","A7");
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
+($Khi,$Klo)=("A9","A8");
+($MAJhi,$MAJlo)=($T2hi,$T2lo);
+($t1hi,$t1lo)=($Khi,"B2");
+ $CTXB=$t1lo;
+
+($Xihi,$Xilo)=("A5","B5");			# circular/ring buffer
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	sha512_block_data_order,_sha512_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	$Khi,KHI
+	.asg	$Klo,KLO
+	.else
+	.asg	$Khi,KLO
+	.asg	$Klo,KHI
+	.endif
+
+	.global	_sha512_block_data_order
+_sha512_block_data_order:
+__sha512_block:
+	.asmfunc stack_usage(40+128)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-128,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--(40)			; save frame pointer
+|| [A0]	MV	SP,FP
+   [A0]	STDW	B13:B12,*SP[4]
+|| [A0]	MVK	0x00404,B1
+   [A0]	STDW	B11:B10,*SP[3]
+|| [A0]	STDW	A13:A12,*FP[-3]
+|| [A0]	MVKH	0x60000,B1
+   [A0]	STDW	A11:A10,*SP[1]
+|| [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	ADD	B0,SP,SP			; alloca(128)
+	.if	__TI_EABI__
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	__sha512_block,B1
+|| [A0]	MVKL	\$PCR_OFFSET(K512,__sha512_block),$K512
+   [A0]	MVKH	\$PCR_OFFSET(K512,__sha512_block),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	.else
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	__sha512_block,B1
+|| [A0]	MVKL	(K512-__sha512_block),$K512
+   [A0]	MVKH	(K512-__sha512_block),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	.endif
+	ADDAW	SP,3,$Xilo
+	ADD	SP,4*2,$Xihi			; ADDAW	SP,2,$Xihi
+
+||	MV	$CTXA,$CTXB
+	LDW	*${CTXA}[0^.LITTLE_ENDIAN],$Ahi	; load ctx
+||	LDW	*${CTXB}[1^.LITTLE_ENDIAN],$Alo
+||	ADD	B1,$K512,$K512
+	LDW	*${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+||	LDW	*${CTXB}[3^.LITTLE_ENDIAN],$Blo
+	LDW	*${CTXA}[4^.LITTLE_ENDIAN],$Chi
+||	LDW	*${CTXB}[5^.LITTLE_ENDIAN],$Clo
+	LDW	*${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+||	LDW	*${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+	LDW	*${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+||	LDW	*${CTXB}[9^.LITTLE_ENDIAN],$Elo
+	LDW	*${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+||	LDW	*${CTXB}[11^.LITTLE_ENDIAN],$Flo
+	LDW	*${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+||	LDW	*${CTXB}[13^.LITTLE_ENDIAN],$Glo
+	LDW	*${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+||	LDW	*${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+
+	LDNDW	*$INP++,B11:B10			; pre-fetch input
+	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+outerloop?:
+	MVK	15,B0				; loop counters
+||	MVK	64,B1
+||	SUB	A0,1,A0
+	MV	$Ahi,$Actxhi
+||	MV	$Alo,$Actxlo
+||	MV	$Bhi,$Bctxhi
+||	MV	$Blo,$Bctxlo
+||	MV	$Chi,$Cctxhi
+||	MV	$Clo,$Cctxlo
+||	MVD	$Dhi,$Dctxhi
+||	MVD	$Dlo,$Dctxlo
+	MV	$Ehi,$Ectxhi
+||	MV	$Elo,$Ectxlo
+||	MV	$Fhi,$Fctxhi
+||	MV	$Flo,$Fctxlo
+||	MV	$Ghi,$Gctxhi
+||	MV	$Glo,$Gctxlo
+||	MVD	$Hhi,$Hctxhi
+||	MVD	$Hlo,$Hctxlo
+loop0_15?:
+	.if	.BIG_ENDIAN
+	MV	B11,$T1hi
+||	MV	B10,$T1lo
+	.else
+	SWAP4	B10,$T1hi
+||	SWAP4	B11,$T1lo
+	SWAP2	$T1hi,$T1hi
+||	SWAP2	$T1lo,$T1lo
+	.endif
+	STW	$T1hi,*$Xihi++[2]			; original loop16_79?
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
+loop16_79?:
+	XOR	$Fhi,$Ghi,$CHhi
+||	XOR	$Flo,$Glo,$CHlo
+||	ADD	KHI,$T1hi,$T1hi
+||	ADDU	KLO,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += K512[i]
+||	SHRU	$Elo,14,$t0lo
+||	SHL	$Elo,32-14,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Ehi,$CHhi,$CHhi
+||	AND	$Elo,$CHlo,$CHlo
+||	ROTL	$Ghi,0,$Hhi
+||	ROTL	$Glo,0,$Hlo				; h = g
+||	SHRU	$Ehi,18,$t0hi
+||	SHL	$Ehi,32-18,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	XOR	$Ghi,$CHhi,$CHhi
+||	XOR	$Glo,$CHlo,$CHlo			; Ch(e,f,g) = ((f^g)&e)^g
+||	ROTL	$Fhi,0,$Ghi
+||	ROTL	$Flo,0,$Glo				; g = f
+||	SHRU	$Elo,18,$t0lo
+||	SHL	$Elo,32-18,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	OR	$Ahi,$Bhi,$MAJhi
+||	OR	$Alo,$Blo,$MAJlo
+||	ROTL	$Ehi,0,$Fhi
+||	ROTL	$Elo,0,$Flo				; f = e
+||	SHRU	$Ehi,41-32,$t0lo
+||	SHL	$Ehi,64-41,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Chi,$MAJhi,$MAJhi
+||	AND	$Clo,$MAJlo,$MAJlo
+||	ROTL	$Dhi,0,$Ehi
+||	ROTL	$Dlo,0,$Elo				; e = d
+||	SHRU	$Elo,41-32,$t0hi
+||	SHL	$Elo,64-41,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo			; Sigma1(e)
+||	AND	$Ahi,$Bhi,$t1hi
+||	AND	$Alo,$Blo,$t1lo
+||	ROTL	$Chi,0,$Dhi
+||	ROTL	$Clo,0,$Dlo				; d = c
+||	SHRU	$Ahi,28,$S0hi
+||	SHL	$Ahi,32-28,$S0lo
+	OR	$t1hi,$MAJhi,$MAJhi
+||	OR	$t1lo,$MAJlo,$MAJlo			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Ch(e,f,g)
+||	ROTL	$Bhi,0,$Chi
+||	ROTL	$Blo,0,$Clo				; c = b
+||	SHRU	$Alo,28,$t0lo
+||	SHL	$Alo,32-28,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Sigma1(e)
+||	ROTL	$Ahi,0,$Bhi
+||	ROTL	$Alo,0,$Blo				; b = a
+||	SHRU	$Ahi,34-32,$t0lo
+||	SHL	$Ahi,64-34,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$MAJhi,$T1hi,$T2hi
+||	ADDU	$MAJlo,$T1carry:$T1lo,$T2carry:$T2lo	; T2 = T1+Maj(a,b,c)
+||	SHRU	$Alo,34-32,$t0hi
+||	SHL	$Alo,64-34,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$Ehi,$T1hi,$T1hi
+||	ADDU	$Elo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += e
+||	SHRU	$Ahi,39-32,$t0lo
+||	SHL	$Ahi,64-39,$t0hi
+   [B0]	BNOP	loop0_15?
+|| [B0]	LDNDW	*$INP++,B11:B10				; pre-fetch input
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$Alo,39-32,$t0hi
+||	SHL	$Alo,64-39,$t0lo
+||[!B0]	LDW	*${Xihi}[28],$T1hi
+||[!B0]	LDW	*${Xilo}[28],$T1lo			; X[i+14]
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo			; Sigma0(a)
+||	ADD	$T1carry,$T1hi,$Ehi
+||	ROTL	$T1lo,0,$Elo				; e = T1, "ghost" value
+||[!B1]	BNOP	break?
+	ADD	$S0hi,$T2hi,$T2hi
+||	ADDU	$S0lo,$T2carry:$T2lo,$T2carry:$T2lo	; T2 += Sigma0(a)
+|| [B1]	LDDW	*$K512++,$Khi:$Klo			; pre-fetch K512[i]
+	NOP						; avoid cross-path stall
+	ADD	$T2carry,$T2hi,$Ahi
+||	MV	$T2lo,$Alo				; a = T2
+|| [B0]	SUB	B0,1,B0
+;;===== branch to loop00_15? is taken here
+   [B1]	LDW	*${Xihi}[2],$T2hi
+|| [B1]	LDW	*${Xilo}[2],$T2lo			; X[i+1]
+|| [B1]	SHRU	$T1hi,19,$S1hi
+|| [B1]	SHL	$T1hi,32-19,$S1lo
+   [B1]	SHRU	$T1lo,19,$t0lo
+|| [B1]	SHL	$T1lo,32-19,$t0hi
+;;===== branch to break? is taken here
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,61-32,$t0lo
+||	SHL	$T1hi,64-61,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,61-32,$t0hi
+||	SHL	$T1lo,64-61,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,6,$t0hi
+||	SHL	$T1hi,32-6,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,6,$t0lo
+||	LDW	*${Xihi}[18],$T1hi
+||	LDW	*${Xilo}[18],$T1lo			; X[i+9]
+	XOR	$t0lo,$S1lo,$S1lo			; sigma1(Xi[i+14])
+
+||	LDW	*${Xihi}[0],$CHhi
+||	LDW	*${Xilo}[0],$CHlo			; X[i]
+||	SHRU	$T2hi,1,$S0hi
+||	SHL	$T2hi,32-1,$S0lo
+	SHRU	$T2lo,1,$t0lo
+||	SHL	$T2lo,32-1,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2hi,8,$t0hi
+||	SHL	$T2hi,32-8,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2lo,8,$t0lo
+||	SHL	$T2lo,32-8,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1lo,$T1carry:$T1lo		; T1 = X[i+9]+sigma1()
+||	SHRU	$T2hi,7,$t0hi
+||	SHL	$T2hi,32-7,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += X[i]
+||	SHRU	$T2lo,7,$t0lo
+|| [B1]	BNOP	loop16_79?
+	XOR	$t0lo,$S0lo,$S0lo			; sigma0(Xi[i+1]
+
+	ADD	$S0hi,$T1hi,$T1hi
+||	ADDU	$S0lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += sigma0()
+|| [B1]	SUB	B1,1,B1
+	NOP						; avoid cross-path stall
+	ADD	$T1carry,$T1hi,$T1hi
+
+   	STW	$T1hi,*$Xihi++[2]			; copied "top" bundle
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
+;;===== branch to loop16_79? is taken here
+
+break?:
+	ADD	$Ahi,$Actxhi,$Ahi		; accumulate ctx
+||	ADDU	$Alo,$Actxlo,$Actxlo:$Alo
+|| [A0]	LDNDW	*$INP++,B11:B10			; pre-fetch input
+|| [A0]	ADDK	-640,$K512			; rewind pointer to K512
+	ADD	$Bhi,$Bctxhi,$Bhi
+||	ADDU	$Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0]	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+	ADD	$Chi,$Cctxhi,$Chi
+||	ADDU	$Clo,$Cctxlo,$Cctxlo:$Clo
+||	ADD	$Actxlo,$Ahi,$Ahi
+||[!A0]	MV	$CTXA,$CTXB
+	ADD	$Dhi,$Dctxhi,$Dhi
+||	ADDU	$Dlo,$Dctxlo,$Dctxlo:$Dlo
+||	ADD	$Bctxlo,$Bhi,$Bhi
+||[!A0]	STW	$Ahi,*${CTXA}[0^.LITTLE_ENDIAN]	; save ctx
+||[!A0]	STW	$Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+	ADD	$Ehi,$Ectxhi,$Ehi
+||	ADDU	$Elo,$Ectxlo,$Ectxlo:$Elo
+||	ADD	$Cctxlo,$Chi,$Chi
+|| [A0]	BNOP	outerloop?
+||[!A0]	STW	$Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0]	STW	$Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+	ADD	$Fhi,$Fctxhi,$Fhi
+||	ADDU	$Flo,$Fctxlo,$Fctxlo:$Flo
+||	ADD	$Dctxlo,$Dhi,$Dhi
+||[!A0]	STW	$Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0]	STW	$Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+	ADD	$Ghi,$Gctxhi,$Ghi
+||	ADDU	$Glo,$Gctxlo,$Gctxlo:$Glo
+||	ADD	$Ectxlo,$Ehi,$Ehi
+||[!A0]	STW	$Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0]	STW	$Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+	ADD	$Hhi,$Hctxhi,$Hhi
+||	ADDU	$Hlo,$Hctxlo,$Hctxlo:$Hlo
+||	ADD	$Fctxlo,$Fhi,$Fhi
+||[!A0]	STW	$Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0]	STW	$Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+	ADD	$Gctxlo,$Ghi,$Ghi
+||[!A0]	STW	$Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0]	STW	$Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+	ADD	$Hctxlo,$Hhi,$Hhi
+||[!A0]	STW	$Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0]	STW	$Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+	STW	$Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
+||	STW	$Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+||	MVK	-40,B0
+	ADD	FP,B0,SP			; destroy circular buffer
+||	LDDW	*FP[-4],A11:A10
+	LDDW	*SP[2],A13:A12
+||	LDDW	*FP[-2],B11:B10
+	LDDW	*SP[4],B13:B12
+||	BNOP	RA
+	LDW	*++SP(40),FP			; restore frame pointer
+	MVK	0,B0
+	MVC	B0,AMR				; clear AMR
+	NOP	2				; wait till FP is committed
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
+	.sect	".const:sha_asm"
+	.endif
+	.align	128
+K512:
+	.uword	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+	.uword	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+	.uword	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+	.uword	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+	.uword	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+	.uword	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+	.uword	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+	.uword	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+	.uword	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+	.uword	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+	.uword	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+	.uword	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+	.uword	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+	.uword	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+	.uword	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+	.uword	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+	.uword	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+	.uword	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+	.uword	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+	.uword	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+	.uword	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+	.uword	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+	.uword	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+	.uword	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+	.uword	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+	.uword	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+	.uword	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+	.uword	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+	.uword	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+	.uword	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+	.uword	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+	.uword	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+	.uword	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+	.uword	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+	.uword	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+	.uword	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+	.uword	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+	.uword	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+	.uword	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+	.uword	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+	.cstring "SHA512 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl
new file mode 100644
index 0000000000..56c8583bf3
--- /dev/null
+++ b/crypto/sha/asm/sha512-c64xplus.pl
@@ -0,0 +1,410 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x+.
+#
+# January 2012
+#
+# Performance is 19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unroll won't make it, this implementation, any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K512="A3";
+
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
+($T1hi,         $T2hi)=         ("A6","A7");
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
+($Khi,$Klo)=("A9","A8");
+($MAJhi,$MAJlo)=($T2hi,$T2lo);
+($t1hi,$t1lo)=($Khi,"B2");
+ $CTXB=$t1lo;
+
+($Xihi,$Xilo)=("A5","B5");			# circular/ring buffer
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	$Khi,KHI
+	.asg	$Klo,KLO
+	.else
+	.asg	$Khi,KLO
+	.asg	$Klo,KHI
+	.endif
+
+	.global	_sha512_block_data_order
+_sha512_block_data_order:
+	.asmfunc stack_usage(40+128)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-128,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--(40)			; save frame pointer
+|| [A0]	MV	SP,FP
+   [A0]	STDW	B13:B12,*SP[4]
+|| [A0]	MVK	0x00404,B1
+   [A0]	STDW	B11:B10,*SP[3]
+|| [A0]	STDW	A13:A12,*FP[-3]
+|| [A0]	MVKH	0x60000,B1
+   [A0]	STDW	A11:A10,*SP[1]
+|| [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	ADD	B0,SP,SP			; alloca(128)
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	_sha512_block_data_order,B1
+|| [A0]	MVKL	(K512-_sha512_block_data_order),$K512
+   [A0]	MVKH	(K512-_sha512_block_data_order),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	ADDAW	SP,3,$Xilo
+	ADDAW	SP,2,$Xihi
+
+||	MV	$CTXA,$CTXB
+	LDW	*${CTXA}[0^.LITTLE_ENDIAN],$Ahi	; load ctx
+||	LDW	*${CTXB}[1^.LITTLE_ENDIAN],$Alo
+||	ADD	B1,$K512,$K512
+	LDW	*${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+||	LDW	*${CTXB}[3^.LITTLE_ENDIAN],$Blo
+	LDW	*${CTXA}[4^.LITTLE_ENDIAN],$Chi
+||	LDW	*${CTXB}[5^.LITTLE_ENDIAN],$Clo
+	LDW	*${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+||	LDW	*${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+	LDW	*${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+||	LDW	*${CTXB}[9^.LITTLE_ENDIAN],$Elo
+	LDW	*${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+||	LDW	*${CTXB}[11^.LITTLE_ENDIAN],$Flo
+	LDW	*${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+||	LDW	*${CTXB}[13^.LITTLE_ENDIAN],$Glo
+	LDW	*${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+||	LDW	*${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+
+	LDNDW	*$INP++,B11:B10			; pre-fetch input
+	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+outerloop?:
+	MVK	15,B0				; loop counters
+||	MVK	64,B1
+||	SUB	A0,1,A0
+	MV	$Ahi,$Actxhi
+||	MV	$Alo,$Actxlo
+||	MV	$Bhi,$Bctxhi
+||	MV	$Blo,$Bctxlo
+||	MV	$Chi,$Cctxhi
+||	MV	$Clo,$Cctxlo
+||	MVD	$Dhi,$Dctxhi
+||	MVD	$Dlo,$Dctxlo
+	MV	$Ehi,$Ectxhi
+||	MV	$Elo,$Ectxlo
+||	MV	$Fhi,$Fctxhi
+||	MV	$Flo,$Fctxlo
+||	MV	$Ghi,$Gctxhi
+||	MV	$Glo,$Gctxlo
+||	MVD	$Hhi,$Hctxhi
+||	MVD	$Hlo,$Hctxlo
+loop0_15?:
+	.if	.BIG_ENDIAN
+	MV	B11,$T1hi
+||	MV	B10,$T1lo
+	.else
+	SWAP4	B10,$T1hi
+||	SWAP4	B11,$T1lo
+	SWAP2	$T1hi,$T1hi
+||	SWAP2	$T1lo,$T1lo
+	.endif
+loop16_79?:
+	STW	$T1hi,*$Xihi++[2]
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
+	XOR	$Fhi,$Ghi,$CHhi
+||	XOR	$Flo,$Glo,$CHlo
+||	ADD	KHI,$T1hi,$T1hi
+||	ADDU	KLO,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += K512[i]
+||	SHRU	$Elo,14,$t0lo
+||	SHL	$Elo,32-14,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Ehi,$CHhi,$CHhi
+||	AND	$Elo,$CHlo,$CHlo
+||	ROTL	$Ghi,0,$Hhi
+||	ROTL	$Glo,0,$Hlo				; h = g
+||	SHRU	$Ehi,18,$t0hi
+||	SHL	$Ehi,32-18,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	XOR	$Ghi,$CHhi,$CHhi
+||	XOR	$Glo,$CHlo,$CHlo			; Ch(e,f,g) = ((f^g)&e)^g
+||	ROTL	$Fhi,0,$Ghi
+||	ROTL	$Flo,0,$Glo				; g = f
+||	SHRU	$Elo,18,$t0lo
+||	SHL	$Elo,32-18,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	OR	$Ahi,$Bhi,$MAJhi
+||	OR	$Alo,$Blo,$MAJlo
+||	ROTL	$Ehi,0,$Fhi
+||	ROTL	$Elo,0,$Flo				; f = e
+||	SHRU	$Ehi,41-32,$t0lo
+||	SHL	$Ehi,64-41,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Chi,$MAJhi,$MAJhi
+||	AND	$Clo,$MAJlo,$MAJlo
+||	ROTL	$Dhi,0,$Ehi
+||	ROTL	$Dlo,0,$Elo				; e = d
+||	SHRU	$Elo,41-32,$t0hi
+||	SHL	$Elo,64-41,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo			; Sigma1(e)
+||	AND	$Ahi,$Bhi,$t1hi
+||	AND	$Alo,$Blo,$t1lo
+||	ROTL	$Chi,0,$Dhi
+||	ROTL	$Clo,0,$Dlo				; d = c
+||	SHRU	$Ahi,28,$S0hi
+||	SHL	$Ahi,32-28,$S0lo
+	OR	$t1hi,$MAJhi,$MAJhi
+||	OR	$t1lo,$MAJlo,$MAJlo			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Ch(e,f,g)
+||	ROTL	$Bhi,0,$Chi
+||	ROTL	$Blo,0,$Clo				; c = b
+||	SHRU	$Alo,28,$t0lo
+||	SHL	$Alo,32-28,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Sigma1(e)
+||	ROTL	$Ahi,0,$Bhi
+||	ROTL	$Alo,0,$Blo				; b = a
+||	SHRU	$Ahi,34-32,$t0lo
+||	SHL	$Ahi,64-34,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$MAJhi,$T1hi,$T2hi
+||	ADDU	$MAJlo,$T1carry:$T1lo,$T2carry:$T2lo	; T2 = T1+Maj(a,b,c)
+||	SHRU	$Alo,34-32,$t0hi
+||	SHL	$Alo,64-34,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$Ehi,$T1hi,$T1hi
+||	ADDU	$Elo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += e
+|| [B0]	BNOP	loop0_15?
+||	SHRU	$Ahi,39-32,$t0lo
+||	SHL	$Ahi,64-39,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+|| [B0]	LDNDW	*$INP++,B11:B10				; pre-fetch input
+||[!B1]	BNOP	break?
+||	SHRU	$Alo,39-32,$t0hi
+||	SHL	$Alo,64-39,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo			; Sigma0(a)
+||	ADD	$T1carry,$T1hi,$Ehi
+||	MV	$T1lo,$Elo				; e = T1
+||[!B0]	LDW	*${Xihi}[28],$T1hi
+||[!B0]	LDW	*${Xilo}[28],$T1lo			; X[i+14]
+	ADD	$S0hi,$T2hi,$T2hi
+||	ADDU	$S0lo,$T2carry:$T2lo,$T2carry:$T2lo	; T2 += Sigma0(a)
+|| [B1]	LDDW	*$K512++,$Khi:$Klo			; pre-fetch K512[i]
+	NOP						; avoid cross-path stall
+	ADD	$T2carry,$T2hi,$Ahi
+||	MV	$T2lo,$Alo				; a = T2
+|| [B0]	SUB	B0,1,B0
+;;===== branch to loop00_15? is taken here
+	NOP
+;;===== branch to break? is taken here
+	LDW	*${Xihi}[2],$T2hi
+||	LDW	*${Xilo}[2],$T2lo			; X[i+1]
+||	SHRU	$T1hi,19,$S1hi
+||	SHL	$T1hi,32-19,$S1lo
+	SHRU	$T1lo,19,$t0lo
+||	SHL	$T1lo,32-19,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,61-32,$t0lo
+||	SHL	$T1hi,64-61,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,61-32,$t0hi
+||	SHL	$T1lo,64-61,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,6,$t0hi
+||	SHL	$T1hi,32-6,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,6,$t0lo
+||	LDW	*${Xihi}[18],$T1hi
+||	LDW	*${Xilo}[18],$T1lo			; X[i+9]
+	XOR	$t0lo,$S1lo,$S1lo			; sigma1(Xi[i+14])
+
+||	LDW	*${Xihi}[0],$CHhi
+||	LDW	*${Xilo}[0],$CHlo			; X[i]
+||	SHRU	$T2hi,1,$S0hi
+||	SHL	$T2hi,32-1,$S0lo
+	SHRU	$T2lo,1,$t0lo
+||	SHL	$T2lo,32-1,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2hi,8,$t0hi
+||	SHL	$T2hi,32-8,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2lo,8,$t0lo
+||	SHL	$T2lo,32-8,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1lo,$T1carry:$T1lo		; T1 = X[i+9]+sigma1()
+|| [B1]	BNOP	loop16_79?
+||	SHRU	$T2hi,7,$t0hi
+||	SHL	$T2hi,32-7,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += X[i]
+||	SHRU	$T2lo,7,$t0lo
+	XOR	$t0lo,$S0lo,$S0lo			; sigma0(Xi[i+1]
+
+	ADD	$S0hi,$T1hi,$T1hi
+||	ADDU	$S0lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += sigma0()
+|| [B1]	SUB	B1,1,B1
+	NOP						; avoid cross-path stall
+	ADD	$T1carry,$T1hi,$T1hi
+;;===== branch to loop16_79? is taken here
+
+break?:
+	ADD	$Ahi,$Actxhi,$Ahi		; accumulate ctx
+||	ADDU	$Alo,$Actxlo,$Actxlo:$Alo
+|| [A0]	LDNDW	*$INP++,B11:B10			; pre-fetch input
+|| [A0]	ADDK	-640,$K512			; rewind pointer to K512
+	ADD	$Bhi,$Bctxhi,$Bhi
+||	ADDU	$Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0]	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+	ADD	$Chi,$Cctxhi,$Chi
+||	ADDU	$Clo,$Cctxlo,$Cctxlo:$Clo
+||	ADD	$Actxlo,$Ahi,$Ahi
+||[!A0]	MV	$CTXA,$CTXB
+	ADD	$Dhi,$Dctxhi,$Dhi
+||	ADDU	$Dlo,$Dctxlo,$Dctxlo:$Dlo
+||	ADD	$Bctxlo,$Bhi,$Bhi
+||[!A0]	STW	$Ahi,*${CTXA}[0^.LITTLE_ENDIAN]	; save ctx
+||[!A0]	STW	$Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+	ADD	$Ehi,$Ectxhi,$Ehi
+||	ADDU	$Elo,$Ectxlo,$Ectxlo:$Elo
+||	ADD	$Cctxlo,$Chi,$Chi
+|| [A0]	BNOP	outerloop?
+||[!A0]	STW	$Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0]	STW	$Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+	ADD	$Fhi,$Fctxhi,$Fhi
+||	ADDU	$Flo,$Fctxlo,$Fctxlo:$Flo
+||	ADD	$Dctxlo,$Dhi,$Dhi
+||[!A0]	STW	$Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0]	STW	$Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+	ADD	$Ghi,$Gctxhi,$Ghi
+||	ADDU	$Glo,$Gctxlo,$Gctxlo:$Glo
+||	ADD	$Ectxlo,$Ehi,$Ehi
+||[!A0]	STW	$Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0]	STW	$Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+	ADD	$Hhi,$Hctxhi,$Hhi
+||	ADDU	$Hlo,$Hctxlo,$Hctxlo:$Hlo
+||	ADD	$Fctxlo,$Fhi,$Fhi
+||[!A0]	STW	$Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0]	STW	$Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+	ADD	$Gctxlo,$Ghi,$Ghi
+||[!A0]	STW	$Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0]	STW	$Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+	ADD	$Hctxlo,$Hhi,$Hhi
+||[!A0]	STW	$Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0]	STW	$Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+	STW	$Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
+||	STW	$Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+||	MVK	-40,B0
+	ADD	FP,B0,SP			; destroy circular buffer
+||	LDDW	*FP[-4],A11:A10
+	LDDW	*SP[2],A13:A12
+||	LDDW	*FP[-2],B11:B10
+	LDDW	*SP[4],B13:B12
+||	BNOP	RA
+	LDW	*++SP(40),FP			; restore frame pointer
+	MVK	0,B0
+	MVC	B0,AMR				; clear AMR
+	NOP	2				; wait till FP is committed
+	.endasmfunc
+
+	.sect	".const:sha_asm"
+	.align	128
+K512:
+	.uword	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+	.uword	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+	.uword	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+	.uword	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+	.uword	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+	.uword	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+	.uword	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+	.uword	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+	.uword	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+	.uword	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+	.uword	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+	.uword	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+	.uword	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+	.uword	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+	.uword	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+	.uword	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+	.uword	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+	.uword	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+	.uword	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+	.uword	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+	.uword	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+	.uword	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+	.uword	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+	.uword	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+	.uword	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+	.uword	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+	.uword	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+	.uword	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+	.uword	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+	.uword	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+	.uword	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+	.uword	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+	.uword	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+	.uword	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+	.uword	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+	.uword	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+	.uword	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+	.uword	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+	.uword	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+	.uword	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+	.cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl
index ba5b250890..00e795b0ad 100644
--- a/crypto/sha/asm/sha512-mips.pl
+++ b/crypto/sha/asm/sha512-mips.pl
@@ -45,7 +45,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
 
 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@@ -68,7 +68,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
 #
 ######################################################################
 
-$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;
 
 for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
 open STDOUT,">$output";
@@ -244,7 +244,7 @@ $code.=<<___;
 
 .text
 .set	noat
-#if !defined(__vxworks) || defined(__pic__)
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
 .option	pic2
 #endif
 
@@ -351,7 +351,7 @@ $code.=<<___;
 	$ST	$G,6*$SZ($ctx)
 	$ST	$H,7*$SZ($ctx)
 
-	bnel	$inp,@X[15],.Loop
+	bne	$inp,@X[15],.Loop
 	$PTR_SUB $Ktbl,`($rounds-16)*$SZ`	# rewind $Ktbl
 
 	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl
index 6b44a68e59..4051119e5d 100755
--- a/crypto/sha/asm/sha512-ppc.pl
+++ b/crypto/sha/asm/sha512-ppc.pl
@@ -64,7 +64,7 @@ die "can't locate ppc-xlate.pl";
 open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
 
 if ($output =~ /512/) {
-	$func="sha512_block_data_order";
+	$func="sha512_block_ppc";
 	$SZ=8;
 	@Sigma0=(28,34,39);
 	@Sigma1=(14,18,41);
@@ -76,7 +76,7 @@ if ($output =~ /512/) {
 	$ROR="rotrdi";
 	$SHR="srdi";
 } else {
-	$func="sha256_block_data_order";
+	$func="sha256_block_ppc";
 	$SZ=4;
 	@Sigma0=( 2,13,22);
 	@Sigma1=( 6,11,25);
@@ -243,7 +243,7 @@ Lunaligned:
 	andi.	$t1,$t1,`4096-16*$SZ`	; distance to closest page boundary
 	beq	Lcross_page
 	$UCMP	$num,$t1
-	ble-	Laligned		; didn't cross the page boundary
+	ble	Laligned		; didn't cross the page boundary
 	subfc	$num,$t1,$num
 	add	$t1,$inp,$t1
 	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real remaining num
@@ -279,7 +279,7 @@ Lmemcpy:
 	$POP	$inp,`$FRAME-$SIZE_T*26`($sp)	; restore real inp
 	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real num
 	addic.	$num,$num,`-16*$SZ`		; num--
-	bne-	Lunaligned
+	bne	Lunaligned
 
 Ldone:
 	$POP	r0,`$FRAME+$LRSAVE`($sp)
@@ -339,7 +339,7 @@ for(;$i<32;$i++) {
 	unshift(@V,pop(@V));
 }
 $code.=<<___;
-	bdnz-	Lrounds
+	bdnz	Lrounds
 
 	$POP	$ctx,`$FRAME-$SIZE_T*22`($sp)
 	$POP	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
new file mode 100755
index 0000000000..038292055c
--- /dev/null
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -0,0 +1,431 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 for PowerISA v2.07.
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This module is
+# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
+# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
+# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
+# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
+# result is degree of computational resources' utilization. POWER8 is
+# "massively multi-threaded chip" and difference between single- and
+# maximum multi-process benchmark results tells that utlization is
+# whooping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
+# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
+# to single-process one, given that all threads end up on the same
+# physical core.
+#
+#######################################################################
+#
+#		SHA256/pre-2.07(*)	SHA512/pre-2.07(*)	SHA1(*)
+# POWER8	9.3   /14.8		5.8   /9.5		7.1
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour=shift;
+$output =shift;
+
+if ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
+	$STU="stdu";
+	$POP="ld";
+	$PUSH="std";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
+	$STU="stwu";
+	$POP="lwz";
+	$PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$LENDIAN=($flavour=~/le/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+
+if ($output =~ /512/) {
+	$bits=512;
+	$SZ=8;
+	$sz="d";
+	$rounds=80;
+} else {
+	$bits=256;
+	$SZ=4;
+	$sz="w";
+	$rounds=64;
+}
+
+$func="sha${bits}_block_p8";
+$FRAME=8*$SIZE_T;
+
+$sp ="r1";
+$toc="r2";
+$ctx="r3";
+$inp="r4";
+$num="r5";
+$Tbl="r6";
+$idx="r7";
+$lrsave="r8";
+$offload="r11";
+$vrsave="r12";
+($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+ $x00=0 if ($flavour =~ /osx/);
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
+@X=map("v$_",(8..23));
+($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+
+sub ROUND {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)%16;
+
+$code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
+	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
+	addi		$inp,$inp,16
+___
+$code.=<<___		if ($i<16 && ($i%(16/$SZ)));
+	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
+___
+$code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
+	vperm		@X[$i],@X[$i],@X[$i],$lemask
+___
+$code.=<<___;
+	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
+	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
+	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
+	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
+	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
+	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
+	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
+	vxor		$Func,$a,$b
+	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
+	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
+	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
+	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
+	vaddu${sz}m	$d,$d,$h		; d+=h
+	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
+	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
+	lvx		$Ki,$idx,$Tbl		; load next K[i]
+	addi		$idx,$idx,16
+	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
+	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
+___
+}
+
+$code=<<___;
+.machine	"any"
+.text
+
+.globl	$func
+.align	6
+$func:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		$lrsave
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	mfspr		$vrsave,256
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r11,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	mtspr		256,r11
+
+	bl		LPICmeup
+	addi		$offload,$sp,$FRAME+15
+___
+$code.=<<___		if ($LENDIAN);
+	li		$idx,8
+	lvsl		$lemask,0,$idx
+	vspltisb	$Ki,0x0f
+	vxor		$lemask,$lemask,$Ki
+___
+$code.=<<___		if ($SZ==4);
+	lvx_4w		$A,$x00,$ctx
+	lvx_4w		$E,$x10,$ctx
+	vsldoi		$B,$A,$A,4		# unpack
+	vsldoi		$C,$A,$A,8
+	vsldoi		$D,$A,$A,12
+	vsldoi		$F,$E,$E,4
+	vsldoi		$G,$E,$E,8
+	vsldoi		$H,$E,$E,12
+___
+$code.=<<___		if ($SZ==8);
+	lvx_u		$A,$x00,$ctx
+	lvx_u		$C,$x10,$ctx
+	lvx_u		$E,$x20,$ctx
+	vsldoi		$B,$A,$A,8		# unpack
+	lvx_u		$G,$x30,$ctx
+	vsldoi		$D,$C,$C,8
+	vsldoi		$F,$E,$E,8
+	vsldoi		$H,$G,$G,8
+___
+$code.=<<___;
+	li		r0,`($rounds-16)/16`	# inner loop counter
+	b		Loop
+.align	5
+Loop:
+	lvx		$Ki,$x00,$Tbl
+	li		$idx,16
+	lvx_u		@X[0],0,$inp
+	addi		$inp,$inp,16
+	stvx		$A,$x00,$offload	# offload $A-$H
+	stvx		$B,$x10,$offload
+	stvx		$C,$x20,$offload
+	stvx		$D,$x30,$offload
+	stvx		$E,$x40,$offload
+	stvx		$F,$x50,$offload
+	stvx		$G,$x60,$offload
+	stvx		$H,$x70,$offload
+	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
+	lvx		$Ki,$idx,$Tbl
+	addi		$idx,$idx,16
+___
+for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mtctr		r0
+	b		L16_xx
+.align	5
+L16_xx:
+___
+for (;$i<32;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bdnz		L16_xx
+
+	lvx		@X[2],$x00,$offload
+	subic.		$num,$num,1
+	lvx		@X[3],$x10,$offload
+	vaddu${sz}m	$A,$A,@X[2]
+	lvx		@X[4],$x20,$offload
+	vaddu${sz}m	$B,$B,@X[3]
+	lvx		@X[5],$x30,$offload
+	vaddu${sz}m	$C,$C,@X[4]
+	lvx		@X[6],$x40,$offload
+	vaddu${sz}m	$D,$D,@X[5]
+	lvx		@X[7],$x50,$offload
+	vaddu${sz}m	$E,$E,@X[6]
+	lvx		@X[8],$x60,$offload
+	vaddu${sz}m	$F,$F,@X[7]
+	lvx		@X[9],$x70,$offload
+	vaddu${sz}m	$G,$G,@X[8]
+	vaddu${sz}m	$H,$H,@X[9]
+	bne		Loop
+___
+$code.=<<___		if ($SZ==4);
+	lvx		@X[0],$idx,$Tbl
+	addi		$idx,$idx,16
+	vperm		$A,$A,$B,$Ki		# pack the answer
+	lvx		@X[1],$idx,$Tbl
+	vperm		$E,$E,$F,$Ki
+	vperm		$A,$A,$C,@X[0]
+	vperm		$E,$E,$G,@X[0]
+	vperm		$A,$A,$D,@X[1]
+	vperm		$E,$E,$H,@X[1]
+	stvx_4w		$A,$x00,$ctx
+	stvx_4w		$E,$x10,$ctx
+___
+$code.=<<___		if ($SZ==8);
+	vperm		$A,$A,$B,$Ki		# pack the answer
+	vperm		$C,$C,$D,$Ki
+	vperm		$E,$E,$F,$Ki
+	vperm		$G,$G,$H,$Ki
+	stvx_u		$A,$x00,$ctx
+	stvx_u		$C,$x10,$ctx
+	stvx_u		$E,$x20,$ctx
+	stvx_u		$G,$x30,$ctx
+___
+$code.=<<___;
+	li		r10,`$FRAME+8*16+15`
+	mtlr		$lrsave
+	li		r11,`$FRAME+8*16+31`
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,4,1,0x80,6,3,0
+	.long		0
+.size	$func,.-$func
+___
+
+# Ugly hack here, because PPC assembler syntax seem to vary too
+# much from platforms to platform...
+$code.=<<___;
+.align	6
+LPICmeup:
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
+	addi	$Tbl,$Tbl,`64-8`
+	mtlr	r0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`64-9*4`
+___
+
+if ($SZ==8) {
+    local *table = sub {
+	foreach(@_) { $code.=".quad	$_,$_\n"; }
+    };
+    table(
+	"0x428a2f98d728ae22","0x7137449123ef65cd",
+	"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
+	"0x3956c25bf348b538","0x59f111f1b605d019",
+	"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
+	"0xd807aa98a3030242","0x12835b0145706fbe",
+	"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
+	"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
+	"0x9bdc06a725c71235","0xc19bf174cf692694",
+	"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
+	"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
+	"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
+	"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
+	"0x983e5152ee66dfab","0xa831c66d2db43210",
+	"0xb00327c898fb213f","0xbf597fc7beef0ee4",
+	"0xc6e00bf33da88fc2","0xd5a79147930aa725",
+	"0x06ca6351e003826f","0x142929670a0e6e70",
+	"0x27b70a8546d22ffc","0x2e1b21385c26c926",
+	"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
+	"0x650a73548baf63de","0x766a0abb3c77b2a8",
+	"0x81c2c92e47edaee6","0x92722c851482353b",
+	"0xa2bfe8a14cf10364","0xa81a664bbc423001",
+	"0xc24b8b70d0f89791","0xc76c51a30654be30",
+	"0xd192e819d6ef5218","0xd69906245565a910",
+	"0xf40e35855771202a","0x106aa07032bbd1b8",
+	"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
+	"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
+	"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
+	"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
+	"0x748f82ee5defb2fc","0x78a5636f43172f60",
+	"0x84c87814a1f0ab72","0x8cc702081a6439ec",
+	"0x90befffa23631e28","0xa4506cebde82bde9",
+	"0xbef9a3f7b2c67915","0xc67178f2e372532b",
+	"0xca273eceea26619c","0xd186b8c721c0c207",
+	"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
+	"0x06f067aa72176fba","0x0a637dc5a2c898a6",
+	"0x113f9804bef90dae","0x1b710b35131c471b",
+	"0x28db77f523047d84","0x32caab7b40c72493",
+	"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
+	"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
+	"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
+$code.=<<___	if (!$LENDIAN);
+.quad	0x0001020304050607,0x1011121314151617
+___
+$code.=<<___	if ($LENDIAN);	# quad-swapped
+.quad	0x1011121314151617,0x0001020304050607
+___
+} else {
+    local *table = sub {
+	foreach(@_) { $code.=".long	$_,$_,$_,$_\n"; }
+    };
+    table(
+	"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
+	"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
+	"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
+	"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
+	"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
+	"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
+	"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
+	"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
+	"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
+	"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
+	"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
+	"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
+	"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
+	"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
+	"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
+	"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
+$code.=<<___	if (!$LENDIAN);
+.long	0x00010203,0x10111213,0x10111213,0x10111213
+.long	0x00010203,0x04050607,0x10111213,0x10111213
+.long	0x00010203,0x04050607,0x08090a0b,0x10111213
+___
+$code.=<<___	if ($LENDIAN);	# word-swapped
+.long	0x10111213,0x10111213,0x10111213,0x00010203
+.long	0x10111213,0x10111213,0x04050607,0x00010203
+.long	0x10111213,0x08090a0b,0x04050607,0x00010203
+___
+}
+$code.=<<___;
+.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/uid.c b/crypto/uid.c
index b1fd52bada..a1261731c9 100644
--- a/crypto/uid.c
+++ b/crypto/uid.c
@@ -65,7 +65,7 @@ int OPENSSL_issetugid(void)
 	return issetugid();
 	}
 
-#elif defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYS_NETWARE)
+#elif defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYS_NETWARE) || defined(_TMS320C6X)
 
 int OPENSSL_issetugid(void)
 	{
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index 6595ff35fc..e8eaef7582 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -119,10 +119,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&mov	("esi","edx");
 	&or	("ebp","ecx");		# merge AMD XOP flag
 
-	&bt	("ecx",26);		# check XSAVE bit
-	&jnc	(&label("done"));
 	&bt	("ecx",27);		# check OSXSAVE bit
-	&jnc	(&label("clear_xmm"));
+	&jnc	(&label("clear_avx"));
 	&xor	("ecx","ecx");
 	&data_byte(0x0f,0x01,0xd0);	# xgetbv
 	&and	("eax",6);
diff --git a/doc/crypto/SSLeay_version.pod b/doc/crypto/SSLeay_version.pod
deleted file mode 100644
index 1500c2af91..0000000000
--- a/doc/crypto/SSLeay_version.pod
+++ /dev/null
@@ -1,74 +0,0 @@
-=pod
-
-=head1 NAME
-
-SSLeay_version - retrieve version/build information about OpenSSL library
-
-=head1 SYNOPSIS
-
- #include <openssl/crypto.h>
-
- const char *SSLeay_version(int type);
-
-=head1 DESCRIPTION
-
-SSLeay_version() returns a pointer to a constant string describing the
-version of the OpenSSL library or giving information about the library
-build.
-
-The following B<type> values are supported:
-
-=over 4
-
-=item SSLEAY_VERSION
-
-The version of the OpenSSL library including the release date.
-
-=item SSLEAY_CFLAGS
-
-The compiler flags set for the compilation process in the form
-"compiler: ..."  if available or "compiler: information not available"
-otherwise.
-
-=item SSLEAY_BUILT_ON
-
-The date of the build process in the form "built on: ..." if available
-or "built on: date not available" otherwise.
-
-=item SSLEAY_PLATFORM
-
-The "Configure" target of the library build in the form "platform: ..."
-if available or "platform: information not available" otherwise.
-
-=item SSLEAY_DIR
-
-The "OPENSSLDIR" setting of the library build in the form "OPENSSLDIR: "...""
-if available or "OPENSSLDIR: N/A" otherwise.
-
-=back
-
-=head1 RETURN VALUES
-
-The following return values can occur:
-
-=over 4
-
-=item "not available"
-
-An invalid value for B<type> was given.
-
-=item Pointer to constant string
-
-Textual description.
-
-=back
-
-=head1 SEE ALSO
-
-L<crypto(3)|crypto(3)>
-
-=head1 HISTORY
-
-B<SSLEAY_DIR> was added in OpenSSL 0.9.7.
-
-=cut
diff --git a/e_os.h b/e_os.h
index 79c1392573..efe58fb97e 100644
--- a/e_os.h
+++ b/e_os.h
@@ -306,7 +306,7 @@ static unsigned int _strlen31(const char *str)
 #      undef isupper
 #      undef isxdigit
 #    endif
-#    if defined(_MSC_VER) && !defined(_DLL) && defined(stdin)
+#    if defined(_MSC_VER) && !defined(_WIN32_WCE) && !defined(_DLL) && defined(stdin)
 #      if _MSC_VER>=1300
 #        undef stdin
 #        undef stdout
@@ -332,8 +332,10 @@ static unsigned int _strlen31(const char *str)
 #      endif
 #    endif
 #  endif
-#  include <io.h>
-#  include <fcntl.h>
+#  if !defined(OPENSSL_FIPSCANISTER)
+#    include <io.h>
+#    include <fcntl.h>
+#  endif
 
 #  ifdef OPENSSL_SYS_WINCE
 #    define OPENSSL_NO_POSIX_IO
@@ -668,7 +670,7 @@ extern char *sys_errlist[]; extern int sys_nerr;
 #if defined(OPENSSL_SYS_WINDOWS)
 #  define strcasecmp _stricmp
 #  define strncasecmp _strnicmp
-#elif defined(OPENSSL_SYS_VMS)
+#elif defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_DSPBIOS)
 /* VMS below version 7.0 doesn't have strcasecmp() */
 #  include "o_str.h"
 #  define strcasecmp OPENSSL_strcasecmp
diff --git a/fips/aes/fips_aesavs.c b/fips/aes/fips_aesavs.c
index 84bcbac32a..cc3ed6afb1 100644
--- a/fips/aes/fips_aesavs.c
+++ b/fips/aes/fips_aesavs.c
@@ -99,7 +99,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
     {
     const EVP_CIPHER *cipher = NULL;
 
-    if (strcasecmp(amode, "CBC") == 0)
+    if (fips_strcasecmp(amode, "CBC") == 0)
 	{
 	switch (akeysz)
 		{
@@ -117,7 +117,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		}
 
 	}
-    else if (strcasecmp(amode, "ECB") == 0)
+    else if (fips_strcasecmp(amode, "ECB") == 0)
 	{
 	switch (akeysz)
 		{
@@ -134,7 +134,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		break;
 		}
 	}
-    else if (strcasecmp(amode, "CFB128") == 0)
+    else if (fips_strcasecmp(amode, "CFB128") == 0)
 	{
 	switch (akeysz)
 		{
@@ -169,7 +169,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		break;
 		}
 	}
-    else if(!strcasecmp(amode,"CFB1"))
+    else if(!fips_strcasecmp(amode,"CFB1"))
 	{
 	switch (akeysz)
 		{
@@ -186,7 +186,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		break;
 		}
 	}
-    else if(!strcasecmp(amode,"CFB8"))
+    else if(!fips_strcasecmp(amode,"CFB8"))
 	{
 	switch (akeysz)
 		{
@@ -215,7 +215,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 	}
     if (FIPS_cipherinit(ctx, cipher, aKey, iVec, dir) <= 0)
 	return 0;
-    if(!strcasecmp(amode,"CFB1"))
+    if(!fips_strcasecmp(amode,"CFB1"))
 	M_EVP_CIPHER_CTX_set_flags(ctx, EVP_CIPH_FLAG_LENGTH_BITS);
     if (dir)
 		FIPS_cipher(ctx, ciphertext, plaintext, len);
@@ -535,7 +535,7 @@ static int do_mct(char *amode,
 		}
 	    }
 	}
-    
+    FIPS_cipher_ctx_cleanup(&ctx);
     return ret;
     }
 
@@ -554,7 +554,7 @@ static int proc_file(char *rqfile, char *rspfile)
     FILE *afp = NULL, *rfp = NULL;
     char ibuf[2048];
     char tbuf[2048];
-    int ilen, len, ret = 0;
+    int len;
     char algo[8] = "";
     char amode[8] = "";
     char atest[8] = "";
@@ -605,7 +605,6 @@ static int proc_file(char *rqfile, char *rspfile)
     while (!err && (fgets(ibuf, sizeof(ibuf), afp)) != NULL)
 	{
 	tidy_line(tbuf, ibuf);
-	ilen = strlen(ibuf);
 	/*      printf("step=%d ibuf=%s",step,ibuf); */
 	switch (step)
 	    {
@@ -636,10 +635,8 @@ static int proc_file(char *rqfile, char *rspfile)
 		char *xp, *pp = ibuf+2;
 		int n;
 		if (akeysz)
-		    { /* insert current time & date */
-		    time_t rtim = time(0);
-		    fputs("# ", rfp);
-		    copy_line(ctime(&rtim), rfp);
+		    {
+		    copy_line(ibuf, rfp);
 		    }
 		else
 		    {
@@ -780,11 +777,11 @@ static int proc_file(char *rqfile, char *rspfile)
 		    if(do_mct(amode, akeysz, aKey, iVec, 
 			      dir, (unsigned char*)plaintext, len, 
 			      rfp) < 0)
-			EXIT(1);
+			err = 1;
 		    }
 		else
 		    {
-		    ret = AESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    AESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  plaintext, ciphertext, len);
 		    OutputValue("CIPHERTEXT",ciphertext,len,rfp,
@@ -822,7 +819,7 @@ static int proc_file(char *rqfile, char *rspfile)
 		    }
 		else
 		    {
-		    ret = AESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    AESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  plaintext, ciphertext, len);
 		    OutputValue("PLAINTEXT",(unsigned char *)plaintext,len,rfp,
@@ -850,6 +847,7 @@ static int proc_file(char *rqfile, char *rspfile)
 	fclose(rfp);
     if (afp)
 	fclose(afp);
+    FIPS_cipher_ctx_cleanup(&ctx);
     return err;
     }
 
@@ -862,23 +860,26 @@ static int proc_file(char *rqfile, char *rspfile)
     aes_test -d xxxxx.xxx
   The default is: -d req.txt
 --------------------------------------------------*/
+#ifdef FIPS_ALGVS
+int fips_aesavs_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
     {
     char *rqlist = "req.txt", *rspfile = NULL;
     FILE *fp = NULL;
     char fn[250] = "", rfn[256] = "";
-    int f_opt = 0, d_opt = 1;
+    int d_opt = 1;
     fips_algtest_init();
 
     if (argc > 1)
 	{
-	if (strcasecmp(argv[1], "-d") == 0)
+	if (fips_strcasecmp(argv[1], "-d") == 0)
 	    {
 	    d_opt = 1;
 	    }
-	else if (strcasecmp(argv[1], "-f") == 0)
+	else if (fips_strcasecmp(argv[1], "-f") == 0)
 	    {
-	    f_opt = 1;
 	    d_opt = 0;
 	    }
 	else
@@ -915,7 +916,7 @@ int main(int argc, char **argv)
 	    if (proc_file(rfn, rspfile))
 		{
 		printf(">>> Processing failed for: %s <<<\n", rfn);
-		EXIT(1);
+		return 1;
 		}
 	    }
 	fclose(fp);
@@ -929,7 +930,6 @@ int main(int argc, char **argv)
 	    printf(">>> Processing failed for: %s <<<\n", fn);
 	    }
 	}
-    EXIT(0);
     return 0;
     }
 
diff --git a/fips/aes/fips_gcmtest.c b/fips/aes/fips_gcmtest.c
index 3839de8f8a..30e4bcc0f4 100644
--- a/fips/aes/fips_gcmtest.c
+++ b/fips/aes/fips_gcmtest.c
@@ -75,10 +75,11 @@ int main(int argc, char **argv)
 
 #include "fips_utl.h"
 
+static char buf[204800];
+static char lbuf[204800];
+
 static void gcmtest(FILE *in, FILE *out, int encrypt)
 	{
-	char buf[2048];
-	char lbuf[2048];
 	char *keyword, *value;
 	int keylen = -1, ivlen = -1, aadlen = -1, taglen = -1, ptlen = -1;
 	int rv;
@@ -261,16 +262,14 @@ static void gcmtest(FILE *in, FILE *out, int encrypt)
 			iv = aad = ct = pt = key = tag = NULL;
 			}
 		}
+	FIPS_cipher_ctx_cleanup(&ctx);	
 	}
 
 static void xtstest(FILE *in, FILE *out)
 	{
-	char buf[204800];
-	char lbuf[204800];
 	char *keyword, *value;
 	int inlen = 0;
 	int encrypt = 0;
-	int rv;
 	long l;
 	unsigned char *key = NULL, *iv = NULL;
 	unsigned char *inbuf = NULL, *outbuf = NULL;
@@ -326,7 +325,7 @@ static void xtstest(FILE *in, FILE *out)
 			{
 			FIPS_cipherinit(&ctx, xts, key, iv, encrypt);
 			outbuf = OPENSSL_malloc(inlen);
-			rv = FIPS_cipher(&ctx, outbuf, inbuf, inlen);
+			FIPS_cipher(&ctx, outbuf, inbuf, inlen);
 			OutputValue(encrypt ? "CT":"PT", outbuf, inlen, out, 0);
 			OPENSSL_free(inbuf);
 			OPENSSL_free(outbuf);
@@ -335,12 +334,11 @@ static void xtstest(FILE *in, FILE *out)
 			iv = key = inbuf = outbuf = NULL;
 			}	
 		}
+	FIPS_cipher_ctx_cleanup(&ctx);	
 	}
 
 static void ccmtest(FILE *in, FILE *out)
 	{
-	char buf[200048];
-	char lbuf[200048];
 	char *keyword, *value;
 	long l;
 	unsigned char *Key = NULL, *Nonce = NULL;
@@ -428,6 +426,8 @@ static void ccmtest(FILE *in, FILE *out)
 			}
 		else if (!strcmp(keyword,"Adata"))
 			{
+			if (Adata)
+				OPENSSL_free(Adata);
 			Adata = hex2bin_m(value, &l);
 			if (Alen && l != Alen)
 				{
@@ -493,10 +493,16 @@ static void ccmtest(FILE *in, FILE *out)
 		OPENSSL_free(Key);
 	if (Nonce)
 		OPENSSL_free(Nonce);
+	if (Adata)
+		OPENSSL_free(Adata);
 	FIPS_cipher_ctx_cleanup(&ctx);
 	}
 
-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_gcmtest_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
 	{
 	int encrypt;
 	int xts = 0, ccm = 0;
diff --git a/fips/cmac/fips_cmactest.c b/fips/cmac/fips_cmactest.c
index 6d799f2d5f..2c8c7664e9 100644
--- a/fips/cmac/fips_cmactest.c
+++ b/fips/cmac/fips_cmactest.c
@@ -92,7 +92,11 @@ static int print_cmac_ver(const EVP_CIPHER *cipher, FILE *out,
 		unsigned char *Mac, int Maclen,
 		int Tlen);
 
+#ifdef FIPS_ALGVS
+int fips_cmactest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 	int mode = 0;		/* 0 => Generate, 1 => Verify */
diff --git a/fips/des/fips_des_selftest.c b/fips/des/fips_des_selftest.c
index a014f6f33f..fdf1eb6945 100644
--- a/fips/des/fips_des_selftest.c
+++ b/fips/des/fips_des_selftest.c
@@ -83,7 +83,7 @@ static const struct
 
 int FIPS_selftest_des()
     {
-    int n, ret = 0;
+    int n, ret = 1;
     EVP_CIPHER_CTX ctx;
     FIPS_cipher_ctx_init(&ctx);
 
@@ -93,10 +93,8 @@ int FIPS_selftest_des()
 	if (!fips_cipher_test(FIPS_TEST_CIPHER, &ctx, EVP_des_ede3_ecb(),
 				tests3[n].key, NULL,
 				tests3[n].plaintext, tests3[n].ciphertext, 8))
-		goto err;
+		ret = 0;
 	}
-    ret = 1;
-    err:
     FIPS_cipher_ctx_cleanup(&ctx);
     if (ret == 0)
 	    FIPSerr(FIPS_F_FIPS_SELFTEST_DES,FIPS_R_SELFTEST_FAILED);
diff --git a/fips/des/fips_desmovs.c b/fips/des/fips_desmovs.c
index e8766561ce..0ffab89e2f 100644
--- a/fips/des/fips_desmovs.c
+++ b/fips/des/fips_desmovs.c
@@ -102,7 +102,7 @@ static int DESTest(EVP_CIPHER_CTX *ctx,
     if (akeysz != 192)
 	{
 	printf("Invalid key size: %d\n", akeysz);
-	EXIT(1);
+	return 0;
 	}
 
     if (fips_strcasecmp(amode, "CBC") == 0)
@@ -120,7 +120,7 @@ static int DESTest(EVP_CIPHER_CTX *ctx,
     else
 	{
 	printf("Unknown mode: %s\n", amode);
-	EXIT(1);
+	return 0;
 	}
 
     if (FIPS_cipherinit(ctx, cipher, aKey, iVec, dir) <= 0)
@@ -155,12 +155,12 @@ static void shiftin(unsigned char *dst,unsigned char *src,int nbits)
     }	
 
 /*-----------------------------------------------*/
-char *t_tag[2] = {"PLAINTEXT", "CIPHERTEXT"};
-char *t_mode[6] = {"CBC","ECB","OFB","CFB1","CFB8","CFB64"};
-enum Mode {CBC, ECB, OFB, CFB1, CFB8, CFB64};
+char *tdes_t_tag[2] = {"PLAINTEXT", "CIPHERTEXT"};
+char *tdes_t_mode[6] = {"CBC","ECB","OFB","CFB1","CFB8","CFB64"};
+enum tdes_Mode {TCBC, TECB, TOFB, TCFB1, TCFB8, TCFB64};
 int Sizes[6]={64,64,64,1,8,64};
 
-static void do_mct(char *amode, 
+static int do_tmct(char *amode, 
 	    int akeysz, int numkeys, unsigned char *akey,unsigned char *ivec,
 	    int dir, unsigned char *text, int len,
 	    FILE *rfp)
@@ -170,12 +170,12 @@ static void do_mct(char *amode,
     unsigned char text0[8];
 
     for (imode=0 ; imode < 6 ; ++imode)
-	if(!strcmp(amode,t_mode[imode]))
+	if(!strcmp(amode,tdes_t_mode[imode]))
 	    break;
     if (imode == 6)
 	{ 
 	printf("Unrecognized mode: %s\n", amode);
-	EXIT(1);
+	return 0;
 	}
     for(i=0 ; i < 400 ; ++i)
 	{
@@ -196,12 +196,12 @@ static void do_mct(char *amode,
 		OutputValue("",akey+n*8,8,rfp,0);
 		}
 
-	if(imode != ECB)
+	if(imode != TECB)
 	    OutputValue("IV",ivec,8,rfp,0);
-	OutputValue(t_tag[dir^1],text,len,rfp,imode == CFB1);
+	OutputValue(tdes_t_tag[dir^1],text,len,rfp,imode == TCFB1);
 #if 0
 	/* compensate for endianness */
-	if(imode == CFB1)
+	if(imode == TCFB1)
 	    text[0]<<=7;
 #endif
 	memcpy(text0,text,8);
@@ -223,18 +223,18 @@ static void do_mct(char *amode,
 		}
 	    if(j == 9999)
 		{
-		OutputValue(t_tag[dir],text,len,rfp,imode == CFB1);
+		OutputValue(tdes_t_tag[dir],text,len,rfp,imode == TCFB1);
 		/*		memcpy(ivec,text,8); */
 		}
 	    /*	    DebugValue("iv",ctx.iv,8); */
 	    /* accumulate material for the next key */
 	    shiftin(nk,text,Sizes[imode]);
 	    /*	    DebugValue("nk",nk,24);*/
-	    if((dir && (imode == CFB1 || imode == CFB8 || imode == CFB64
-			|| imode == CBC)) || imode == OFB)
+	    if((dir && (imode == TCFB1 || imode == TCFB8
+			|| imode == TCFB64 || imode == TCBC)) || imode == TOFB)
 		memcpy(text,old_iv,8);
 
-	    if(!dir && (imode == CFB1 || imode == CFB8 || imode == CFB64))
+	    if(!dir && (imode == TCFB1 || imode == TCFB8 || imode == TCFB64))
 		{
 		/* the test specifies using the output of the raw DES operation
 		   which we don't have, so reconstruct it... */
@@ -260,18 +260,20 @@ static void do_mct(char *amode,
 	/* pointless exercise - the final text doesn't depend on the
 	   initial text in OFB mode, so who cares what it is? (Who
 	   designed these tests?) */
-	if(imode == OFB)
+	if(imode == TOFB)
 	    for(n=0 ; n < 8 ; ++n)
 		text[n]=text0[n]^old_iv[n];
+	FIPS_cipher_ctx_cleanup(&ctx);
 	}
+    return 1;
     }
     
-static int proc_file(char *rqfile, char *rspfile)
+static int tproc_file(char *rqfile, char *rspfile)
     {
     char afn[256], rfn[256];
     FILE *afp = NULL, *rfp = NULL;
     char ibuf[2048], tbuf[2048];
-    int ilen, len, ret = 0;
+    int len;
     char amode[8] = "";
     char atest[100] = "";
     int akeysz=0;
@@ -322,7 +324,6 @@ static int proc_file(char *rqfile, char *rspfile)
     while (!err && (fgets(ibuf, sizeof(ibuf), afp)) != NULL)
 	{
 	tidy_line(tbuf, ibuf);
-	ilen = strlen(ibuf);
 	/*	printf("step=%d ibuf=%s",step,ibuf);*/
 	if(step == 3 && !strcmp(amode,"ECB"))
 	    {
@@ -355,10 +356,8 @@ static int proc_file(char *rqfile, char *rspfile)
 		char *xp, *pp = ibuf+2;
 		int n;
 		if(*amode)
-		    { /* insert current time & date */
-		    time_t rtim = time(0);
-		    fputs("# ", rfp);
-		    copy_line(ctime(&rtim), rfp);
+		    {
+		    copy_line(ibuf, rfp);
 		    }
 		else
 		    {
@@ -546,12 +545,14 @@ static int proc_file(char *rqfile, char *rspfile)
 		PrintValue("PLAINTEXT", (unsigned char*)plaintext, len);
 		if (strcmp(atest, "Monte") == 0)  /* Monte Carlo Test */
 		    {
-		    do_mct(amode,akeysz,numkeys,aKey,iVec,dir,plaintext,len,rfp);
+		    if (!do_tmct(amode,akeysz,numkeys,aKey,iVec,
+					dir,plaintext,len,rfp))
+			return -1;
 		    }
 		else
 		    {
 		    assert(dir == 1);
-		    ret = DESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    DESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  ciphertext, plaintext, len);
 		    OutputValue("CIPHERTEXT",ciphertext,len,rfp,
@@ -585,13 +586,13 @@ static int proc_file(char *rqfile, char *rspfile)
 		PrintValue("CIPHERTEXT", ciphertext, len);
 		if (strcmp(atest, "Monte") == 0)  /* Monte Carlo Test */
 		    {
-		    do_mct(amode, akeysz, numkeys, aKey, iVec, 
+		    do_tmct(amode, akeysz, numkeys, aKey, iVec, 
 			   dir, ciphertext, len, rfp);
 		    }
 		else
 		    {
 		    assert(dir == 0);
-		    ret = DESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    DESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  plaintext, ciphertext, len);
 		    OutputValue("PLAINTEXT",(unsigned char *)plaintext,len,rfp,
@@ -619,6 +620,7 @@ static int proc_file(char *rqfile, char *rspfile)
 	fclose(rfp);
     if (afp)
 	fclose(afp);
+    FIPS_cipher_ctx_cleanup(&ctx);
     return err;
     }
 
@@ -631,12 +633,16 @@ static int proc_file(char *rqfile, char *rspfile)
     aes_test -d xxxxx.xxx
   The default is: -d req.txt
 --------------------------------------------------*/
+#ifdef FIPS_ALGVS
+int fips_desmovs_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
     {
     char *rqlist = "req.txt", *rspfile = NULL;
     FILE *fp = NULL;
     char fn[250] = "", rfn[256] = "";
-    int f_opt = 0, d_opt = 1;
+    int d_opt = 1;
 
     fips_algtest_init();
     if (argc > 1)
@@ -647,7 +653,6 @@ int main(int argc, char **argv)
 	    }
 	else if (fips_strcasecmp(argv[1], "-f") == 0)
 	    {
-	    f_opt = 1;
 	    d_opt = 0;
 	    }
 	else
@@ -680,10 +685,10 @@ int main(int argc, char **argv)
 	    strtok(fn, "\r\n");
 	    strcpy(rfn, fn);
 	    printf("Processing: %s\n", rfn);
-	    if (proc_file(rfn, rspfile))
+	    if (tproc_file(rfn, rspfile))
 		{
 		printf(">>> Processing failed for: %s <<<\n", rfn);
-		EXIT(1);
+		return -1;
 		}
 	    }
 	fclose(fp);
@@ -692,12 +697,11 @@ int main(int argc, char **argv)
 	{
 	if (VERBOSE)
 		printf("Processing: %s\n", fn);
-	if (proc_file(fn, rspfile))
+	if (tproc_file(fn, rspfile))
 	    {
 	    printf(">>> Processing failed for: %s <<<\n", fn);
 	    }
 	}
-    EXIT(0);
     return 0;
     }
 
diff --git a/fips/dh/fips_dhvs.c b/fips/dh/fips_dhvs.c
index ad760c8aaa..a925e13c7d 100644
--- a/fips/dh/fips_dhvs.c
+++ b/fips/dh/fips_dhvs.c
@@ -145,8 +145,12 @@ static void output_Zhash(FILE *out, int exout,
 	OPENSSL_cleanse(Z, Zlen);
 	OPENSSL_free(Z);
 	}
-		
-int main(int argc,char **argv)
+
+#ifdef FIPS_ALGVS
+int fips_dhvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
 	{
 	char **args = argv + 1;
 	int argn = argc - 1;
@@ -275,10 +279,14 @@ int main(int argc,char **argv)
 							rhash, rhashlen);
 			}
 		}
+	if (in && in != stdin)
+		fclose(in);
+	if (out && out != stdout)
+		fclose(out);
 	return 0;
 	parse_error:
 	fprintf(stderr, "Error Parsing request file\n");
-	exit(1);
+	return 1;
 	}
 
 #endif
diff --git a/fips/dsa/fips_dsa_sign.c b/fips/dsa/fips_dsa_sign.c
index ea1bd87303..274bcd9016 100644
--- a/fips/dsa/fips_dsa_sign.c
+++ b/fips/dsa/fips_dsa_sign.c
@@ -114,4 +114,28 @@ int FIPS_dsa_verify_digest(DSA *dsa,
 	return dsa->meth->dsa_do_verify(dig,dlen,s,dsa);
 	}
 
+int FIPS_dsa_verify(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, DSA_SIG *s)
+	{
+	int ret=-1;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen;
+        FIPS_digest(msg, msglen, dig, &dlen, mhash);
+	ret=FIPS_dsa_verify_digest(dsa, dig, dlen, s);
+	OPENSSL_cleanse(dig, dlen);
+	return ret;
+	}
+
+DSA_SIG * FIPS_dsa_sign(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash)
+	{
+	DSA_SIG *s;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen;
+        FIPS_digest(msg, msglen, dig, &dlen, mhash);
+	s = FIPS_dsa_sign_digest(dsa, dig, dlen);
+	OPENSSL_cleanse(dig, dlen);
+	return s;
+	}
+
 #endif
diff --git a/fips/dsa/fips_dsatest.c b/fips/dsa/fips_dsatest.c
index 64d52258eb..3ea600e4ab 100644
--- a/fips/dsa/fips_dsatest.c
+++ b/fips/dsa/fips_dsatest.c
@@ -62,8 +62,10 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#ifndef NO_SYS_TYPES_H
 #include <sys/types.h>
 #include <sys/stat.h>
+#endif
 
 #include "e_os.h"
 
@@ -154,9 +156,7 @@ int main(int argc, char **argv)
 	unsigned char buf[256];
 	unsigned long h;
 	BN_GENCB cb;
-	EVP_MD_CTX mctx;
 	BN_GENCB_set(&cb, dsa_cb, stderr);
-	FIPS_md_ctx_init(&mctx);
 
     	fips_algtest_init();
 
@@ -210,19 +210,11 @@ int main(int argc, char **argv)
 		}
 	DSA_generate_key(dsa);
 
-	if (!FIPS_digestinit(&mctx, EVP_sha1()))
-		goto end;
-	if (!FIPS_digestupdate(&mctx, str1, 20))
-		goto end;
-	sig = FIPS_dsa_sign_ctx(dsa, &mctx);
+	sig = FIPS_dsa_sign(dsa, str1, 20, EVP_sha1());
 	if (!sig)
 		goto end;
 
-	if (!FIPS_digestinit(&mctx, EVP_sha1()))
-		goto end;
-	if (!FIPS_digestupdate(&mctx, str1, 20))
-		goto end;
-	if (FIPS_dsa_verify_ctx(dsa, &mctx, sig) != 1)
+	if (FIPS_dsa_verify(dsa, str1, 20, EVP_sha1(), sig) != 1)
 		goto end;
 
 	ret = 1;
@@ -231,7 +223,6 @@ end:
 	if (sig)
 		FIPS_dsa_sig_free(sig);
 	if (dsa != NULL) FIPS_dsa_free(dsa);
-	FIPS_md_ctx_cleanup(&mctx);
 #if 0
 	CRYPTO_mem_leaks(bio_err);
 #endif
diff --git a/fips/dsa/fips_dssvs.c b/fips/dsa/fips_dssvs.c
index 45bca7c155..bd7055d463 100644
--- a/fips/dsa/fips_dssvs.c
+++ b/fips/dsa/fips_dssvs.c
@@ -46,7 +46,8 @@ static int parse_mod(char *line, int *pdsa2, int *pL, int *pN,
 	if (strcmp(keyword, "L"))
 		return 0;
 	*pL = atoi(value);
-	strcpy(line, p + 1);
+	strcpy(lbuf, p + 1);
+        strcpy(line, lbuf);
 	if (pmd)
 		p = strchr(line, ',');
 	else
@@ -199,6 +200,7 @@ static void pqg(FILE *in, FILE *out)
 			{
 			fprintf(out, "counter = %d" RESP_EOL RESP_EOL, counter);
 			}
+		FIPS_dsa_free(dsa);
 		}
 	    }
 	else if(!strcmp(keyword,"P"))
@@ -519,6 +521,8 @@ static void keyver(FILE *in, FILE *out)
 	    BN_free(g);
 	if (Y2)
 	    BN_free(Y2);
+	if (ctx)
+	    BN_CTX_free(ctx);
     }
 
 static void keypair(FILE *in, FILE *out)
@@ -549,6 +553,11 @@ static void keypair(FILE *in, FILE *out)
 	    int n=atoi(value);
 
 	    dsa = FIPS_dsa_new();
+	    if (!dsa)
+		{
+		fprintf(stderr, "DSA allocation error\n");
+		exit(1);
+		}
 	    if (!dsa2 && !dsa_builtin_paramgen(dsa, L, N, NULL, NULL, 0,
 						NULL, NULL, NULL, NULL))
 			{
@@ -575,6 +584,7 @@ static void keypair(FILE *in, FILE *out)
 		do_bn_print_name(out, "Y",dsa->pub_key);
 	    	fputs(RESP_EOL, out);
 		}
+	    FIPS_dsa_free(dsa);
 	    }
 	}
     }
@@ -627,9 +637,7 @@ static void siggen(FILE *in, FILE *out)
 	    {
 	    unsigned char msg[1024];
 	    int n;
-	    EVP_MD_CTX mctx;
 	    DSA_SIG *sig;
-	    FIPS_md_ctx_init(&mctx);
 
 	    n=hex2bin(value,msg);
 
@@ -637,19 +645,16 @@ static void siggen(FILE *in, FILE *out)
 		exit(1);
 	    do_bn_print_name(out, "Y",dsa->pub_key);
 
-	    FIPS_digestinit(&mctx, md);
-	    FIPS_digestupdate(&mctx, msg, n);
-	    sig = FIPS_dsa_sign_ctx(dsa, &mctx);
+	    sig = FIPS_dsa_sign(dsa, msg, n, md);
 
 	    do_bn_print_name(out, "R",sig->r);
 	    do_bn_print_name(out, "S",sig->s);
 	    fputs(RESP_EOL, out);
 	    FIPS_dsa_sig_free(sig);
-	    FIPS_md_ctx_cleanup(&mctx);
 	    }
 	}
-	if (dsa)
-		FIPS_dsa_free(dsa);
+    if (dsa)
+	FIPS_dsa_free(dsa);
     }
 
 static void sigver(FILE *in, FILE *out)
@@ -687,37 +692,48 @@ static void sigver(FILE *in, FILE *out)
 	    dsa = FIPS_dsa_new();
 	    }
 	else if(!strcmp(keyword,"P"))
-	    dsa->p=hex2bn(value);
+	    do_hex2bn(&dsa->p, value);
 	else if(!strcmp(keyword,"Q"))
-	    dsa->q=hex2bn(value);
+	    do_hex2bn(&dsa->q, value);
 	else if(!strcmp(keyword,"G"))
-	    dsa->g=hex2bn(value);
+	    do_hex2bn(&dsa->g, value);
 	else if(!strcmp(keyword,"Msg"))
 	    n=hex2bin(value,msg);
 	else if(!strcmp(keyword,"Y"))
-	    dsa->pub_key=hex2bn(value);
+	    do_hex2bn(&dsa->pub_key, value);
 	else if(!strcmp(keyword,"R"))
 	    sig->r=hex2bn(value);
 	else if(!strcmp(keyword,"S"))
 	    {
-	    EVP_MD_CTX mctx;
 	    int r;
-	    FIPS_md_ctx_init(&mctx);
 	    sig->s=hex2bn(value);
 
-	    FIPS_digestinit(&mctx, md);
-	    FIPS_digestupdate(&mctx, msg, n);
 	    no_err = 1;
-	    r = FIPS_dsa_verify_ctx(dsa, &mctx, sig);
+	    r = FIPS_dsa_verify(dsa, msg, n, md, sig);
 	    no_err = 0;
-	    FIPS_md_ctx_cleanup(&mctx);
+	    if (sig->s)
+		{
+		BN_free(sig->s);
+		sig->s = NULL;
+		}
+	    if (sig->r)
+		{
+		BN_free(sig->r);
+		sig->r = NULL;
+		}
 	
 	    fprintf(out, "Result = %c" RESP_EOL RESP_EOL, r == 1 ? 'P' : 'F');
 	    }
 	}
+	if (dsa)
+	    FIPS_dsa_free(dsa);
     }
 
-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_dssvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
     {
     FILE *in, *out;
     if (argc == 4)
diff --git a/fips/ecdh/fips_ecdh_selftest.c b/fips/ecdh/fips_ecdh_selftest.c
index 2b21ceaf48..0b16c57aae 100644
--- a/fips/ecdh/fips_ecdh_selftest.c
+++ b/fips/ecdh/fips_ecdh_selftest.c
@@ -166,6 +166,7 @@ int FIPS_selftest_ecdh(void)
 			rv = -1;
 			goto err;
 			}
+		EC_KEY_set_flags(ec1, EC_FLAG_COFACTOR_ECDH);
 
 		if (!EC_KEY_set_public_key_affine_coordinates(ec1, x, y))
 			{
@@ -194,6 +195,7 @@ int FIPS_selftest_ecdh(void)
 			rv = -1;
 			goto err;
 			}
+		EC_KEY_set_flags(ec1, EC_FLAG_COFACTOR_ECDH);
 
 		if (!EC_KEY_set_public_key_affine_coordinates(ec2, x, y))
 			{
diff --git a/fips/ecdh/fips_ecdhvs.c b/fips/ecdh/fips_ecdhvs.c
index 72ebe815dd..a1422868b3 100644
--- a/fips/ecdh/fips_ecdhvs.c
+++ b/fips/ecdh/fips_ecdhvs.c
@@ -76,7 +76,7 @@ int main(int argc, char **argv)
 
 #include "fips_utl.h"
 
-static const EVP_MD *parse_md(char *line)
+static const EVP_MD *eparse_md(char *line)
 	{
 	char *p;
 	if (line[0] != '[' || line[1] != 'E')
@@ -261,6 +261,7 @@ static void ec_output_Zhash(FILE *out, int exout, EC_GROUP *group,
 	unsigned char chash[EVP_MAX_MD_SIZE];
 	int Zlen;
 	ec = EC_KEY_new();
+	EC_KEY_set_flags(ec, EC_FLAG_COFACTOR_ECDH);
 	EC_KEY_set_group(ec, group);
 	peerkey = make_peer(group, cx, cy);
 	if (rhash == NULL)
@@ -301,7 +302,11 @@ static void ec_output_Zhash(FILE *out, int exout, EC_GROUP *group,
 	EC_POINT_free(peerkey);
 	}
 		
-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_ecdhvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
 	{
 	char **args = argv + 1;
 	int argn = argc - 1;
@@ -315,6 +320,7 @@ int main(int argc,char **argv)
 	EC_GROUP *group = NULL;
 	char *keyword = NULL, *value = NULL;
 	int do_verify = -1, exout = 0;
+	int rv = 1;
 
 	int curve_nids[5] = {0,0,0,0,0};
 	int param_set = -1;
@@ -408,11 +414,16 @@ int main(int argc,char **argv)
 			if (group)
 				EC_GROUP_free(group);
 			group = EC_GROUP_new_by_curve_name(nid);
+			if (!group)
+				{
+				fprintf(stderr, "ERROR: unsupported curve %s\n", buf + 1);
+				return 1;
+				}
 			}
 
 		if (strlen(buf) > 6 && !strncmp(buf, "[E", 2))
 			{
-			md = parse_md(buf);
+			md = eparse_md(buf);
 			if (md == NULL)
 				goto parse_error;
 			continue;
@@ -459,10 +470,27 @@ int main(int argc,char **argv)
 					md, rhash, rhashlen);
 			}
 		}
-	return 0;
+	rv = 0;
 	parse_error:
-	fprintf(stderr, "Error Parsing request file\n");
-	exit(1);
+	if (id)
+		BN_free(id);
+	if (ix)
+		BN_free(ix);
+	if (iy)
+		BN_free(iy);
+	if (cx)
+		BN_free(cx);
+	if (cy)
+		BN_free(cy);
+	if (group)
+		EC_GROUP_free(group);
+	if (in && in != stdin)
+		fclose(in);
+	if (out && out != stdout)
+		fclose(out);
+	if (rv)
+		fprintf(stderr, "Error Parsing request file\n");
+	return rv;
 	}
 
 #endif
diff --git a/fips/ecdsa/fips_ecdsa_selftest.c b/fips/ecdsa/fips_ecdsa_selftest.c
index 7d1007e19d..6ceb1c37b8 100644
--- a/fips/ecdsa/fips_ecdsa_selftest.c
+++ b/fips/ecdsa/fips_ecdsa_selftest.c
@@ -143,7 +143,7 @@ int FIPS_selftest_ecdsa()
 	EC_KEY *ec = NULL;
 	BIGNUM *x = NULL, *y = NULL, *d = NULL;
 	EVP_PKEY pk;
-	int rv = 0;
+	int rv = 0, test_err = 0;
 	size_t i;
 
 	for (i = 0; i < sizeof(test_ec_data)/sizeof(EC_SELFTEST_DATA); i++)
@@ -173,12 +173,12 @@ int FIPS_selftest_ecdsa()
 		if (!fips_pkey_signature_test(FIPS_TEST_SIGNATURE, &pk, NULL, 0,
 						NULL, 0, EVP_sha512(), 0,
 						ecd->name))
-			goto err;
+			test_err = 1;
 		EC_KEY_free(ec);
 		ec = NULL;
 		}
-
-	rv = 1;
+	if (test_err == 0)
+		rv = 1;
 
 	err:
 
diff --git a/fips/ecdsa/fips_ecdsa_sign.c b/fips/ecdsa/fips_ecdsa_sign.c
index 0e86a50ef4..a7839ee592 100644
--- a/fips/ecdsa/fips_ecdsa_sign.c
+++ b/fips/ecdsa/fips_ecdsa_sign.c
@@ -87,3 +87,28 @@ int FIPS_ecdsa_verify_ctx(EC_KEY *key, EVP_MD_CTX *ctx, ECDSA_SIG *s)
 	return ret;
 	}
 
+int FIPS_ecdsa_verify(EC_KEY *key, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, ECDSA_SIG *s)
+	{
+	int ret=-1;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen;
+        FIPS_digest(msg, msglen, dig, &dlen, mhash);
+	ret=FIPS_ecdsa_verify_digest(key, dig, dlen, s);
+	OPENSSL_cleanse(dig, dlen);
+	return ret;
+	}
+
+ECDSA_SIG * FIPS_ecdsa_sign(EC_KEY *key,
+			const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash)
+	{
+	ECDSA_SIG *s;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen;
+        FIPS_digest(msg, msglen, dig, &dlen, mhash);
+	s = FIPS_ecdsa_sign_digest(key, dig, dlen);
+	OPENSSL_cleanse(dig, dlen);
+	return s;
+	}
+
diff --git a/fips/ecdsa/fips_ecdsavs.c b/fips/ecdsa/fips_ecdsavs.c
index 898951a2c8..5745a6d37a 100644
--- a/fips/ecdsa/fips_ecdsavs.c
+++ b/fips/ecdsa/fips_ecdsavs.c
@@ -75,7 +75,7 @@ int main(int argc, char **argv)
 #include <openssl/objects.h>
 
 
-static int lookup_curve(char *in, char *curve_name, const EVP_MD **pmd)
+static int elookup_curve(char *in, char *curve_name, const EVP_MD **pmd)
 	{
 	char *cname, *p;
 	/* Copy buffer as we will change it */
@@ -200,7 +200,7 @@ static int KeyPair(FILE *in, FILE *out)
 		if (*buf == '[' && buf[2] == '-')
 			{
 			if (buf[2] == '-')
-			curve_nid = lookup_curve(buf, lbuf, NULL);
+			curve_nid = elookup_curve(buf, lbuf, NULL);
 			fputs(buf, out);
 			continue;
 			}
@@ -260,7 +260,7 @@ static int PKV(FILE *in, FILE *out)
 		fputs(buf, out);
 		if (*buf == '[' && buf[2] == '-')
 			{
-			curve_nid = lookup_curve(buf, lbuf, NULL);
+			curve_nid = elookup_curve(buf, lbuf, NULL);
 			if (curve_nid == NID_undef)
 				return 0;
 				
@@ -287,10 +287,13 @@ static int PKV(FILE *in, FILE *out)
 			no_err = 1;
 			rv = EC_KEY_set_public_key_affine_coordinates(key, Qx, Qy);
 			no_err = 0;
+			EC_KEY_free(key);
 			fprintf(out, "Result = %s" RESP_EOL, rv ? "P":"F");
 			}
 
 		}
+	BN_free(Qx);
+	BN_free(Qy);
 	return 1;
 	}
 
@@ -305,8 +308,6 @@ static int SigGen(FILE *in, FILE *out)
 	EC_KEY *key = NULL;
 	ECDSA_SIG *sig = NULL;
 	const EVP_MD *digest = NULL;
-	EVP_MD_CTX mctx;
-	EVP_MD_CTX_init(&mctx);
 	Qx = BN_new();
 	Qy = BN_new();
 	while(fgets(buf, sizeof buf, in) != NULL)
@@ -314,7 +315,7 @@ static int SigGen(FILE *in, FILE *out)
 		fputs(buf, out);
 		if (*buf == '[')
 			{
-			curve_nid = lookup_curve(buf, lbuf, &digest);
+			curve_nid = elookup_curve(buf, lbuf, &digest);
 			if (curve_nid == NID_undef)
 				return 0;
 			}
@@ -342,9 +343,7 @@ static int SigGen(FILE *in, FILE *out)
 				return 0;
 				}
 
-			FIPS_digestinit(&mctx, digest);
-			FIPS_digestupdate(&mctx, msg, mlen);
-	    		sig = FIPS_ecdsa_sign_ctx(key, &mctx);
+	    		sig = FIPS_ecdsa_sign(key, msg, mlen, digest);
 
 			if (!sig)
 				{
@@ -358,7 +357,7 @@ static int SigGen(FILE *in, FILE *out)
 			do_bn_print_name(out, "S", sig->s);
 
 			EC_KEY_free(key);
-
+			OPENSSL_free(msg);
 			FIPS_ecdsa_sig_free(sig);
 
 			}
@@ -366,7 +365,6 @@ static int SigGen(FILE *in, FILE *out)
 		}
 	BN_free(Qx);
 	BN_free(Qy);
-	FIPS_md_ctx_cleanup(&mctx);
 	return 1;
 	}
 
@@ -381,8 +379,6 @@ static int SigVer(FILE *in, FILE *out)
 	EC_KEY *key = NULL;
 	ECDSA_SIG sg, *sig = &sg;
 	const EVP_MD *digest = NULL;
-	EVP_MD_CTX mctx;
-	EVP_MD_CTX_init(&mctx);
 	sig->r = NULL;
 	sig->s = NULL;
 	while(fgets(buf, sizeof buf, in) != NULL)
@@ -390,7 +386,7 @@ static int SigVer(FILE *in, FILE *out)
 		fputs(buf, out);
 		if (*buf == '[')
 			{
-			curve_nid = lookup_curve(buf, lbuf, &digest);
+			curve_nid = elookup_curve(buf, lbuf, &digest);
 			if (curve_nid == NID_undef)
 				return 0;
 			}
@@ -447,20 +443,32 @@ static int SigVer(FILE *in, FILE *out)
 				return 0;
 				}
 
-			FIPS_digestinit(&mctx, digest);
-			FIPS_digestupdate(&mctx, msg, mlen);
 			no_err = 1;
-	    		rv = FIPS_ecdsa_verify_ctx(key, &mctx, sig);
+	    		rv = FIPS_ecdsa_verify(key, msg, mlen, digest, sig);
+			EC_KEY_free(key);
+			if (msg)
+				OPENSSL_free(msg);
 			no_err = 0;
 
 			fprintf(out, "Result = %s" RESP_EOL, rv ? "P":"F");
 			}
 
 		}
+	if (sig->r)
+		BN_free(sig->r);
+	if (sig->s)
+		BN_free(sig->s);
+	if (Qx)
+		BN_free(Qx);
+	if (Qy)
+		BN_free(Qy);
 	return 1;
 	}
-
+#ifdef FIPS_ALGVS
+int fips_ecdsavs_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 	const char *cmd = argv[1];
diff --git a/fips/fips.c b/fips/fips.c
index 36ac8d1b0c..0269609a7e 100644
--- a/fips/fips.c
+++ b/fips/fips.c
@@ -81,7 +81,7 @@ static int fips_started = 0;
 static int fips_is_owning_thread(void);
 static int fips_set_owning_thread(void);
 static int fips_clear_owning_thread(void);
-static unsigned char *fips_signature_witness(void);
+static const unsigned char *fips_signature_witness(void);
 
 #define fips_w_lock()	CRYPTO_w_lock(CRYPTO_LOCK_FIPS)
 #define fips_w_unlock()	CRYPTO_w_unlock(CRYPTO_LOCK_FIPS)
@@ -148,7 +148,10 @@ void fips_set_selftest_fail(void)
 
 extern const void         *FIPS_text_start(),  *FIPS_text_end();
 extern const unsigned char FIPS_rodata_start[], FIPS_rodata_end[];
-unsigned char              FIPS_signature [20] = { 0 };
+#ifdef _TMS320C6X
+const
+#endif
+unsigned char              FIPS_signature [20] = { 0, 0xff };
 __fips_constseg
 static const char          FIPS_hmac_key[]="etaonrishdlcupfm";
 
@@ -413,9 +416,8 @@ int fips_clear_owning_thread(void)
 	return ret;
 	}
 
-unsigned char *fips_signature_witness(void)
+const unsigned char *fips_signature_witness(void)
 	{
-	extern unsigned char FIPS_signature[];
 	return FIPS_signature;
 	}
 
diff --git a/fips/fips.h b/fips/fips.h
index 4cadbd26fd..b6263575c3 100644
--- a/fips/fips.h
+++ b/fips/fips.h
@@ -97,9 +97,8 @@ int FIPS_selftest_rsa(void);
 int FIPS_selftest_dsa(void);
 int FIPS_selftest_ecdsa(void);
 int FIPS_selftest_ecdh(void);
-void FIPS_corrupt_drbg(void);
-void FIPS_x931_stick(void);
-void FIPS_drbg_stick(void);
+void FIPS_x931_stick(int onoff);
+void FIPS_drbg_stick(int onoff);
 int FIPS_selftest_x931(void);
 int FIPS_selftest_hmac(void);
 int FIPS_selftest_drbg(void);
@@ -224,6 +223,16 @@ int FIPS_rsa_verify_digest(struct rsa_st *rsa,
 			const struct env_md_st *mgf1Hash,
 			const unsigned char *sigbuf, unsigned int siglen);
 
+int FIPS_rsa_sign(struct rsa_st *rsa, const unsigned char *msg, int msglen,
+			const struct env_md_st *mhash, int rsa_pad_mode,
+			int saltlen, const struct env_md_st *mgf1Hash,
+			unsigned char *sigret, unsigned int *siglen);
+
+int FIPS_rsa_verify(struct rsa_st *rsa, const unsigned char *msg, int msglen,
+			const struct env_md_st *mhash, int rsa_pad_mode,
+			int saltlen, const struct env_md_st *mgf1Hash,
+			const unsigned char *sigbuf, unsigned int siglen);
+
 #ifdef OPENSSL_FIPSCAPABLE
 
 int FIPS_digestinit(EVP_MD_CTX *ctx, const EVP_MD *type);
diff --git a/fips/fips_canister.c b/fips/fips_canister.c
index 7d67d32d6c..daf53cb40d 100644
--- a/fips/fips_canister.c
+++ b/fips/fips_canister.c
@@ -29,11 +29,15 @@ const void         *FIPS_text_end(void);
 
 #if !defined(FIPS_REF_POINT_IS_CROSS_COMPILER_AWARE)
 # if	(defined(__ANDROID__) && (defined(__arm__) || defined(__arm)	|| \
+				  defined(__aarch64__)			|| \
 				  defined(__i386__)|| defined(__i386)))	|| \
 	(defined(__vxworks)   && (defined(__ppc__) || defined(__ppc)	|| \
 				  defined(__mips__)|| defined(__mips)))	|| \
+        (defined(__NetBSD__)  && (defined(__powerpc__) || defined(__i386))) || \
 	(defined(__linux)     && ((defined(__PPC__) && !defined(__PPC64__)) || \
 				  defined(__arm__) || defined(__arm)))	|| \
+	(defined(__APPLE__) /* verified on all MacOS X & iOS flavors */)|| \
+	(defined(_TMS320C6X))						|| \
 	(defined(_WIN32)      && defined(_MSC_VER))
 #  define FIPS_REF_POINT_IS_CROSS_COMPILER_AWARE
 # endif
@@ -69,6 +73,10 @@ const unsigned int FIPS_text_startX[]=
 #  pragma const_seg("fipsro$a")
 #  pragma const_seg()
    __declspec(allocate("fipsro$a"))
+# elif defined(_TMS320C6X)
+#  pragma CODE_SECTION(instruction_pointer,".fips_text:start")
+#  pragma CODE_SECTION(FIPS_ref_point,".fips_text:start")
+#  pragma DATA_SECTION(FIPS_rodata_start,".fips_const:start")
 # endif
 const unsigned int FIPS_rodata_start[]=
 	{ 0x46495053, 0x5f726f64, 0x6174615f, 0x73746172 };
@@ -86,6 +94,10 @@ const unsigned int FIPS_text_endX[]=
 #  pragma const_seg("fipsro$z")
 #  pragma const_seg()
    __declspec(allocate("fipsro$z"))
+# elif defined(_TMS320C6X)
+#  pragma CODE_SECTION(instruction_pointer,".fips_text:end")
+#  pragma CODE_SECTION(FIPS_ref_point,".fips_text:end")
+#  pragma DATA_SECTION(FIPS_rodata_end,".fips_const:end")
 # endif
 const unsigned int FIPS_rodata_end[]=
 	{ 0x46495053, 0x5f726f64, 0x6174615f, 0x656e645b };
diff --git a/fips/fips_locl.h b/fips/fips_locl.h
index df3863f91e..6efa93194e 100644
--- a/fips/fips_locl.h
+++ b/fips/fips_locl.h
@@ -67,8 +67,8 @@ int fips_post_failed(int id, int subid, void *ex);
 int fips_post_corrupt(int id, int subid, void *ex);
 int fips_post_status(void);
 
-#define FIPS_MODULE_VERSION_NUMBER	0x20000000L
-#define FIPS_MODULE_VERSION_TEXT	"FIPS 2.0-dev unvalidated test module xx XXX xxxx"
+#define FIPS_MODULE_VERSION_NUMBER	0x20000009L
+#define FIPS_MODULE_VERSION_TEXT	"FIPS 2.0-rc9 unvalidated test module xx XXX xxxx"
 
 #ifdef  __cplusplus
 }
diff --git a/fips/fips_post.c b/fips/fips_post.c
index e55ec08407..8cd2334362 100644
--- a/fips/fips_post.c
+++ b/fips/fips_post.c
@@ -207,7 +207,6 @@ int fips_pkey_signature_test(int id, EVP_PKEY *pkey,
 			const char *fail_str)
 	{	
 	int subid;
-	void *ex = NULL;
 	int ret = 0;
 	unsigned char *sig = NULL;
 	unsigned int siglen;
@@ -335,7 +334,7 @@ int fips_pkey_signature_test(int id, EVP_PKEY *pkey,
 		FIPSerr(FIPS_F_FIPS_PKEY_SIGNATURE_TEST,FIPS_R_TEST_FAILURE);
 		if (fail_str)
 			FIPS_add_error_data(2, "Type=", fail_str);
-		fips_post_failed(id, subid, ex);
+		fips_post_failed(id, subid, pkey);
 		return 0;
 		}
 	return fips_post_success(id, subid, pkey);
diff --git a/fips/fips_premain.c b/fips/fips_premain.c
index a7c8b78f8f..b6ec32db4e 100644
--- a/fips/fips_premain.c
+++ b/fips/fips_premain.c
@@ -7,7 +7,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#if defined(__unix) || defined(__unix__) || defined(__vxworks) || defined(__ANDROID__)
+#if defined(__unix) || defined(__unix__) || defined(__vxworks) || defined(__ANDROID__) || defined(__APPLE__)
 #include <unistd.h>
 #endif
 
@@ -53,6 +53,12 @@
   int lib$initialize();
   globaldef int (*lib_init_ref)() = lib$initialize;
 # pragma __standard
+#elif defined(_TMS320C6X)
+# if defined(__TI_EABI__)
+  asm("\t.sect \".init_array\"\n\t.align 4\n\t.field FINGERPRINT_premain,32");
+# else
+  asm("\t.sect \".pinit\"\n\t.align 4\n\t.field _FINGERPRINT_premain,32");
+# endif
 #elif 0
   The rest has to be taken care of through command line:
 
@@ -134,6 +140,9 @@ void FINGERPRINT_premain(void)
 	}
 #endif
     } while(0);
+#if defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)
+    fips_openssl_cpuid_setup();
+#endif
 }
 
 #else
diff --git a/fips/fips_premain.c.sha1 b/fips/fips_premain.c.sha1
index e0332e8afd..19c30807a7 100644
--- a/fips/fips_premain.c.sha1
+++ b/fips/fips_premain.c.sha1
@@ -1 +1 @@
-HMAC-SHA1(fips_premain.c)= a401afd9c2b57f0f11d2b34b6d0c9815b1fe6a66
+HMAC-SHA1(fips_premain.c)= 2bfb57ef540bdba29220a45d65e1b4080de9adc1
diff --git a/fips/fips_test_suite.c b/fips/fips_test_suite.c
index b7aea4e9cd..cd4aafbd12 100644
--- a/fips/fips_test_suite.c
+++ b/fips/fips_test_suite.c
@@ -40,12 +40,46 @@ int main(int argc, char *argv[])
 
 #include <openssl/rsa.h>
 #include <openssl/dsa.h>
+#include <openssl/ecdsa.h>
 #include <openssl/dh.h>
 
 #include <openssl/fips.h>
 #include <openssl/fips_rand.h>
 #include "fips_utl.h"
 
+static int verbose = 0;
+
+static int fips_module_mode_set_verbose(int mode, const char *pass)
+	{
+	int rv;
+	if (verbose)
+		printf("Attempting to %s FIPS mode\n", mode ? "Enter" : "Leave");
+	rv = FIPS_module_mode_set(mode, pass);
+	if (verbose)
+		printf("FIPS_module_mode() returned %d\n", FIPS_module_mode());
+	return rv;
+	}
+
+static void do_print_rsa_key(RSA *rsa)
+	{
+    	if (!verbose)
+		return;
+	do_bn_print_name(stdout, "RSA key modulus value", rsa->e);
+	do_bn_print_name(stdout, "RSA key publicExponent value", rsa->n);
+	do_bn_print_name(stdout, "RSA key pricateExponent value", rsa->d);
+	do_bn_print_name(stdout, "RSA key prime1 value", rsa->p);
+	do_bn_print_name(stdout, "RSA key prime2 value", rsa->q);
+	do_bn_print_name(stdout, "RSA key exponent1 value", rsa->dmp1);
+	do_bn_print_name(stdout, "RSA key exponent2 value", rsa->dmq1);
+	do_bn_print_name(stdout, "RSA key coefficient value", rsa->iqmp);
+	}
+
+static void do_print_buf(char *name, unsigned char *buf, int buflen)
+	{
+	if (verbose)
+		OutputValue(name, buf, buflen, stdout, 0);
+	}
+
 /* AES: encrypt and decrypt known plaintext, verify result matches original plaintext
 */
 static int FIPS_aes_test(void)
@@ -57,14 +91,30 @@ static int FIPS_aes_test(void)
 	unsigned char plaintext[16] = "etaonrishdlcu";
 	EVP_CIPHER_CTX ctx;
 	FIPS_cipher_ctx_init(&ctx);
+	if (verbose)
+		{
+		do_print_buf("Key", key, sizeof(key));
+		do_print_buf("Plaintext", plaintext, sizeof(plaintext));
+		}
 	if (FIPS_cipherinit(&ctx, EVP_aes_128_ecb(), key, NULL, 1) <= 0)
 		goto err;
 	FIPS_cipher(&ctx, citmp, plaintext, 16);
+	if (verbose)
+		{
+		do_print_buf("Ciphertext", citmp, sizeof(plaintext));
+		printf("AES 128 bit ECB mode decryption started\n");
+		}
 	if (FIPS_cipherinit(&ctx, EVP_aes_128_ecb(), key, NULL, 0) <= 0)
 		goto err;
 	FIPS_cipher(&ctx, pltmp, citmp, 16);
+	do_print_buf("Recovered Plaintext", pltmp, sizeof(plaintext));
 	if (memcmp(pltmp, plaintext, 16))
+		{
+		printf("Comparison failure!!\n");
 		goto err;
+		}
+	if (verbose)
+		printf("Comparison success.\n");
 	ret = 1;
 	err:
 	FIPS_cipher_ctx_cleanup(&ctx);
@@ -83,6 +133,13 @@ static int FIPS_aes_gcm_test(void)
 	unsigned char plaintext[16] = "etaonrishdlcu";
 	EVP_CIPHER_CTX ctx;
 	FIPS_cipher_ctx_init(&ctx);
+	if (verbose)
+		{
+		do_print_buf("Key", key, sizeof(key));
+		do_print_buf("IV", key, sizeof(iv));
+		do_print_buf("Plaintext", plaintext, sizeof(plaintext));
+		do_print_buf("AAD", aad, sizeof(aad));
+		}
 	if (FIPS_cipherinit(&ctx, EVP_aes_128_gcm(), key, iv, 1) <= 0)
 		goto err;
 	FIPS_cipher(&ctx, NULL, aad, sizeof(aad));
@@ -91,6 +148,12 @@ static int FIPS_aes_gcm_test(void)
 	if (!FIPS_cipher_ctx_ctrl(&ctx, EVP_CTRL_GCM_GET_TAG, 16, tagtmp))
 		goto err;
 
+	if (verbose)
+		{
+		do_print_buf("Ciphertext", citmp, sizeof(citmp));
+		do_print_buf("Tag", tagtmp, sizeof(tagtmp));
+		}
+
 	if (FIPS_cipherinit(&ctx, EVP_aes_128_gcm(), key, iv, 0) <= 0)
 		goto err;
 	if (!FIPS_cipher_ctx_ctrl(&ctx, EVP_CTRL_GCM_SET_TAG, 16, tagtmp))
@@ -103,8 +166,17 @@ static int FIPS_aes_gcm_test(void)
 	if (FIPS_cipher(&ctx, NULL, NULL, 0) < 0)
 		goto err;
 
+	if (verbose)
+		do_print_buf("Recovered Plaintext", pltmp, sizeof(plaintext));
+
 	if (memcmp(pltmp, plaintext, 16))
+		{
+		if (verbose)
+			printf("Comparison failure!!\n");
 		goto err;
+		}
+
+	printf("Comparison sucess.\n");
 
 	ret = 1;
 	err:
@@ -122,20 +194,110 @@ static int FIPS_des3_test(void)
     	unsigned char plaintext[] = { 'e', 't', 'a', 'o', 'n', 'r', 'i', 's' };
 	EVP_CIPHER_CTX ctx;
 	FIPS_cipher_ctx_init(&ctx);
+	if (verbose)
+		{
+		do_print_buf("Key", key, sizeof(key));
+		do_print_buf("Plaintext", plaintext, sizeof(plaintext));
+		}
 	if (FIPS_cipherinit(&ctx, EVP_des_ede3_ecb(), key, NULL, 1) <= 0)
 		goto err;
 	FIPS_cipher(&ctx, citmp, plaintext, 8);
+	if (verbose)
+		{
+		do_print_buf("Ciphertext", citmp, sizeof(plaintext));
+		printf("DES3 ECB mode decryption\n");
+		}
 	if (FIPS_cipherinit(&ctx, EVP_des_ede3_ecb(), key, NULL, 0) <= 0)
 		goto err;
 	FIPS_cipher(&ctx, pltmp, citmp, 8);
+	if (verbose)
+		do_print_buf("Recovered Plaintext", pltmp, sizeof(plaintext));
 	if (memcmp(pltmp, plaintext, 8))
+		{
+		if (verbose)
+			printf("Comparison failure!!\n");
+
 		goto err;
+		}
+	if (verbose)
+		printf("Comparison success\n");
 	ret = 1;
 	err:
 	FIPS_cipher_ctx_cleanup(&ctx);
 	return ret;
 	}
 
+/*
+ * ECDSA: generate keys and sign, verify input plaintext.
+ */
+static int FIPS_ecdsa_test(void)
+    {
+    EC_KEY *ec = NULL;
+    unsigned char dgst[] = "etaonrishdlc";
+    int r = 0;
+    ECDSA_SIG *sig = NULL;
+
+    ERR_clear_error();
+    ec = FIPS_ec_key_new_by_curve_name(NID_X9_62_prime256v1);
+    if (!ec)
+	goto end;
+    if (!FIPS_ec_key_generate_key(ec))
+	goto end;
+
+    if (verbose)
+	{
+	BIGNUM *Qx, *Qy;
+	BN_CTX *ctx;
+	const EC_GROUP *grp;
+	const EC_POINT *pt;
+	const BIGNUM *priv;
+	Qx = BN_new();
+	Qy = BN_new();
+	ctx = BN_CTX_new();
+	grp = EC_KEY_get0_group(ec);
+	pt = EC_KEY_get0_public_key(ec);
+	priv = EC_KEY_get0_private_key(ec);
+	printf("EC Key using P-256\n");
+	if (!EC_POINT_get_affine_coordinates_GFp(grp, pt, Qx, Qy, ctx))
+		goto end;
+	
+	do_bn_print_name(stdout, "ECDSA key x coordinate", Qx);
+	do_bn_print_name(stdout, "ECDSA key y coordinate", Qy);
+	do_bn_print_name(stdout, "ECDSA key private value", priv);
+	BN_free(Qx);
+	BN_free(Qy);
+	BN_CTX_free(ctx);
+	printf("Signing string \"%s\" using SHA256\n", dgst);
+	}
+
+    sig = FIPS_ecdsa_sign(ec, dgst, sizeof(dgst) -1, EVP_sha256());
+    if (!sig)
+	{
+	if (verbose)
+		printf("Signing Failed!!\n");
+	goto end;
+	}
+
+    if (verbose)
+	{
+	printf("Signing successful\n");
+	do_bn_print_name(stdout, "ECDSA signature r value", sig->r);
+	do_bn_print_name(stdout, "ECDSA signature s value", sig->s);
+	}
+
+    r = FIPS_ecdsa_verify(ec, dgst, sizeof(dgst) -1, EVP_sha256(), sig);
+    if (verbose)
+	printf("ECDSA verification %s\n", r ? "Successful." : "Failed!!");
+    end:
+    if (sig)
+	FIPS_ecdsa_sig_free(sig);
+    if (ec)
+  	  FIPS_ec_key_free(ec);
+    if (r != 1)
+	return 0;
+    return 1;
+    }
+
 /*
  * DSA: generate keys and sign, verify input plaintext.
  */
@@ -144,11 +306,9 @@ static int FIPS_dsa_test(int bad)
     DSA *dsa = NULL;
     unsigned char dgst[] = "etaonrishdlc";
     int r = 0;
-    EVP_MD_CTX mctx;
     DSA_SIG *sig = NULL;
 
     ERR_clear_error();
-    FIPS_md_ctx_init(&mctx);
     dsa = FIPS_dsa_new();
     if (!dsa)
 	goto end;
@@ -159,23 +319,37 @@ static int FIPS_dsa_test(int bad)
     if (bad)
 	    BN_add_word(dsa->pub_key, 1);
 
-    if (!FIPS_digestinit(&mctx, EVP_sha256()))
-	goto end;
-    if (!FIPS_digestupdate(&mctx, dgst, sizeof(dgst) - 1))
-	goto end;
-    sig = FIPS_dsa_sign_ctx(dsa, &mctx);
-    if (!sig)
-	goto end;
+    if (verbose)
+	{
+	do_bn_print_name(stdout, "DSA key p value", dsa->p);
+	do_bn_print_name(stdout, "DSA key q value", dsa->q);
+	do_bn_print_name(stdout, "DSA key g value", dsa->g);
+	do_bn_print_name(stdout, "DSA key public_key value", dsa->pub_key);
+	do_bn_print_name(stdout, "DSA key private key value", dsa->priv_key);
+	printf("Signing string \"%s\" using SHA256\n", dgst);
+	}
 
-    if (!FIPS_digestinit(&mctx, EVP_sha256()))
+    sig = FIPS_dsa_sign(dsa, dgst, sizeof(dgst) -1, EVP_sha256());
+    if (!sig)
+	{
+	if (verbose)
+		printf("Signing Failed!!\n");
 	goto end;
-    if (!FIPS_digestupdate(&mctx, dgst, sizeof(dgst) - 1))
-	goto end;
-    r = FIPS_dsa_verify_ctx(dsa, &mctx, sig);
+	}
+
+    if (verbose)
+	{
+	printf("Signing successful\n");
+	do_bn_print_name(stdout, "DSA signature r value", sig->r);
+	do_bn_print_name(stdout, "DSA signature s value", sig->s);
+	}
+
+    r = FIPS_dsa_verify(dsa, dgst, sizeof(dgst) -1, EVP_sha256(), sig);
+    if (verbose)
+	printf("DSA verification %s\n", r ? "Successful." : "Failed!!");
     end:
     if (sig)
 	FIPS_dsa_sig_free(sig);
-    FIPS_md_ctx_cleanup(&mctx);
     if (dsa)
   	  FIPS_dsa_free(dsa);
     if (r != 1)
@@ -193,11 +367,9 @@ static int FIPS_rsa_test(int bad)
     unsigned char buf[256];
     unsigned int slen;
     BIGNUM *bn;
-    EVP_MD_CTX mctx;
     int r = 0;
 
     ERR_clear_error();
-    FIPS_md_ctx_init(&mctx);
     key = FIPS_rsa_new();
     bn = BN_new();
     if (!key || !bn)
@@ -209,20 +381,31 @@ static int FIPS_rsa_test(int bad)
     if (bad)
 	    BN_add_word(key->n, 1);
 
-    if (!FIPS_digestinit(&mctx, EVP_sha256()))
-	goto end;
-    if (!FIPS_digestupdate(&mctx, input_ptext, sizeof(input_ptext) - 1))
-	goto end;
-    if (!FIPS_rsa_sign_ctx(key, &mctx, RSA_PKCS1_PADDING, 0, NULL, buf, &slen))
-	goto end;
+    if (verbose)
+	{
+    	do_print_rsa_key(key);
+	printf("Signing string \"%s\" using SHA256\n", input_ptext);
+	}
 
-    if (!FIPS_digestinit(&mctx, EVP_sha256()))
+    if (!FIPS_rsa_sign(key, input_ptext, sizeof(input_ptext) - 1, EVP_sha256(),
+			RSA_PKCS1_PADDING, 0, NULL, buf, &slen))
+	{
+	if (verbose)
+		printf("RSA Signing failed!!\n");
 	goto end;
-    if (!FIPS_digestupdate(&mctx, input_ptext, sizeof(input_ptext) - 1))
-	goto end;
-    r = FIPS_rsa_verify_ctx(key, &mctx, RSA_PKCS1_PADDING, 0, NULL, buf, slen);
+	}
+
+    if (verbose)
+	{
+	printf("RSA signing successul\n");
+	do_print_buf("RSA signature", buf, slen);
+	}
+
+    r = FIPS_rsa_verify(key, input_ptext, sizeof(input_ptext) - 1, EVP_sha256(),
+			RSA_PKCS1_PADDING, 0, NULL, buf, slen);
+    if (verbose)
+	printf("RSA Verification %s\n", r == 1 ? "Successful" : "Failed!!");
     end:
-    FIPS_md_ctx_cleanup(&mctx);
     if (key)
   	  FIPS_rsa_free(key);
     if (r != 1)
@@ -243,6 +426,11 @@ static int FIPS_sha1_test()
 
     ERR_clear_error();
     if (!FIPS_digest(str,sizeof(str) - 1,md, NULL, EVP_sha1())) return 0;
+    if (verbose)
+	{
+	printf("Digesting string %s\n", str);
+	do_print_buf("Digest value", md, sizeof(md));
+	}
     if (memcmp(md,digest,sizeof(md)))
         return 0;
     return 1;
@@ -262,6 +450,11 @@ static int FIPS_sha256_test()
 
     ERR_clear_error();
     if (!FIPS_digest(str,sizeof(str) - 1,md, NULL, EVP_sha256())) return 0;
+    if (verbose)
+	{
+	printf("Digesting string %s\n", str);
+	do_print_buf("Digest value", md, sizeof(md));
+	}
     if (memcmp(md,digest,sizeof(md)))
         return 0;
     return 1;
@@ -283,6 +476,11 @@ static int FIPS_sha512_test()
 
     ERR_clear_error();
     if (!FIPS_digest(str,sizeof(str) - 1,md, NULL, EVP_sha512())) return 0;
+    if (verbose)
+	{
+	printf("Digesting string %s\n", str);
+	do_print_buf("Digest value", md, sizeof(md));
+	}
     if (memcmp(md,digest,sizeof(md)))
         return 0;
     return 1;
@@ -304,8 +502,19 @@ static int FIPS_hmac_sha1_test()
 
     ERR_clear_error();
     if (!HMAC(EVP_sha1(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0;
+    if (verbose)
+	{
+	do_print_buf("HMAC key", key, sizeof(key) -1);
+	do_print_buf("HMAC input", iv, sizeof(iv) -1);
+	do_print_buf("HMAC output", out, outlen);
+	}
     if (memcmp(out,kaval,outlen))
+	{
+	if (verbose)
+		printf("HMAC comparison failed!!\n");
         return 0;
+	}
+    printf("HMAC comparison successful.\n");
     return 1;
     }
 
@@ -325,6 +534,19 @@ static int FIPS_hmac_sha224_test()
 
     ERR_clear_error();
     if (!HMAC(EVP_sha224(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0;
+    if (verbose)
+	{
+	do_print_buf("HMAC key", key, sizeof(key) -1);
+	do_print_buf("HMAC input", iv, sizeof(iv) -1);
+	do_print_buf("HMAC output", out, outlen);
+	}
+    if (memcmp(out,kaval,outlen))
+	{
+	if (verbose)
+		printf("HMAC comparison failed!!\n");
+        return 0;
+	}
+    printf("HMAC comparison successful.\n");
     if (memcmp(out,kaval,outlen))
         return 0;
     return 1;
@@ -346,8 +568,19 @@ static int FIPS_hmac_sha256_test()
 
     ERR_clear_error();
     if (!HMAC(EVP_sha256(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0;
+    if (verbose)
+	{
+	do_print_buf("HMAC key", key, sizeof(key) -1);
+	do_print_buf("HMAC input", iv, sizeof(iv) -1);
+	do_print_buf("HMAC output", out, outlen);
+	}
     if (memcmp(out,kaval,outlen))
+	{
+	if (verbose)
+		printf("HMAC comparison failed!!\n");
         return 0;
+	}
+    printf("HMAC comparison successful.\n");
     return 1;
     }
 
@@ -368,8 +601,19 @@ static int FIPS_hmac_sha384_test()
 
     ERR_clear_error();
     if (!HMAC(EVP_sha384(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0;
+    if (verbose)
+	{
+	do_print_buf("HMAC key", key, sizeof(key) -1);
+	do_print_buf("HMAC input", iv, sizeof(iv) -1);
+	do_print_buf("HMAC output", out, outlen);
+	}
     if (memcmp(out,kaval,outlen))
+	{
+	if (verbose)
+		printf("HMAC comparison failed!!\n");
         return 0;
+	}
+    printf("HMAC comparison successful.\n");
     return 1;
     }
 
@@ -391,8 +635,19 @@ static int FIPS_hmac_sha512_test()
 
     ERR_clear_error();
     if (!HMAC(EVP_sha512(),key,sizeof(key)-1,iv,sizeof(iv)-1,out,&outlen)) return 0;
+    if (verbose)
+	{
+	do_print_buf("HMAC key", key, sizeof(key) -1);
+	do_print_buf("HMAC input", iv, sizeof(iv) -1);
+	do_print_buf("HMAC output", out, outlen);
+	}
     if (memcmp(out,kaval,outlen))
+	{
+	if (verbose)
+		printf("HMAC comparison failed!!\n");
         return 0;
+	}
+    printf("HMAC comparison successful.\n");
     return 1;
     }
 
@@ -427,18 +682,15 @@ static int FIPS_cmac_aes128_test()
     out = OPENSSL_malloc(outlen);
     if (!CMAC_Final(ctx, out, &outlen))
 	    goto end;
-#if 0
-    {
-    char *hexout = OPENSSL_malloc(outlen * 2 + 1);
-    bin2hex(out, outlen, hexout);
-    printf("CMAC-AES128: res = %s\n", hexout);
-    OPENSSL_free(hexout);
-    }
-    r = 1;
-#else
     if (!memcmp(out,kaval,outlen))
 	    r = 1;
-#endif
+    if (verbose)
+	{
+	do_print_buf("CMAC key", key, sizeof(key));
+	do_print_buf("CMAC input", data, sizeof(data) -1);
+	do_print_buf("CMAC output", out, outlen);
+	printf("CMAC comparison %s\n", r == 1 ? "successful." : "Failed!!");
+	}
     end:
     CMAC_CTX_free(ctx);
     if (out)
@@ -478,18 +730,15 @@ static int FIPS_cmac_aes192_test()
     out = OPENSSL_malloc(outlen);
     if (!CMAC_Final(ctx, out, &outlen))
 	    goto end;
-#if 0
-    {
-    char *hexout = OPENSSL_malloc(outlen * 2 + 1);
-    bin2hex(out, outlen, hexout);
-    printf("CMAC-AES192: res = %s\n", hexout);
-    OPENSSL_free(hexout);
-    }
-    r = 1;
-#else
     if (!memcmp(out,kaval,outlen))
 	    r = 1;
-#endif
+    if (verbose)
+	{
+	do_print_buf("CMAC key", key, sizeof(key));
+	do_print_buf("CMAC input", data, sizeof(data) -1);
+	do_print_buf("CMAC output", out, outlen);
+	printf("CMAC comparison %s\n", r == 1 ? "successful." : "Failed!!");
+	}
     end:
     CMAC_CTX_free(ctx);
     if (out)
@@ -530,18 +779,15 @@ static int FIPS_cmac_aes256_test()
     out = OPENSSL_malloc(outlen);
     if (!CMAC_Final(ctx, out, &outlen))
 	    goto end;
-#if 0
-    {
-    char *hexout = OPENSSL_malloc(outlen * 2 + 1);
-    bin2hex(out, outlen, hexout);
-    printf("CMAC-AES256: res = %s\n", hexout);
-    OPENSSL_free(hexout);
-    }
-    r = 1;
-#else
     if (!memcmp(out,kaval,outlen))
 	    r = 1;
-#endif
+    if (verbose)
+	{
+	do_print_buf("CMAC key", key, sizeof(key));
+	do_print_buf("CMAC input", data, sizeof(data) -1);
+	do_print_buf("CMAC output", out, outlen);
+	printf("CMAC comparison %s\n", r == 1 ? "successful." : "Failed!!");
+	}
     end:
     CMAC_CTX_free(ctx);
     if (out)
@@ -580,18 +826,15 @@ static int FIPS_cmac_tdea3_test()
     out = OPENSSL_malloc(outlen);
     if (!CMAC_Final(ctx, out, &outlen))
 	    goto end;
-#if 0
-    {
-    char *hexout = OPENSSL_malloc(outlen * 2 + 1);
-    bin2hex(out, outlen, hexout);
-    printf("CMAC-TDEA3: res = %s\n", hexout);
-    OPENSSL_free(hexout);
-    }
-    r = 1;
-#else
     if (!memcmp(out,kaval,outlen))
 	    r = 1;
-#endif
+    if (verbose)
+	{
+	do_print_buf("CMAC key", key, sizeof(key));
+	do_print_buf("CMAC input", data, sizeof(data) -1);
+	do_print_buf("CMAC output", out, outlen);
+	printf("CMAC comparison %s\n", r == 1 ? "successful." : "Failed!!");
+	}
     end:
     CMAC_CTX_free(ctx);
     if (out)
@@ -647,9 +890,15 @@ static int Zeroize()
     for(i = 0; i < sizeof(userkey); i++) printf("%02x", userkey[i]);
         printf("\n");
     RAND_bytes(userkey, sizeof userkey);
-    printf("\tchar buffer key after overwriting: \n\t\t");
+    printf("\tchar buffer key after overwriting with random key: \n\t\t");
     for(i = 0; i < sizeof(userkey); i++) printf("%02x", userkey[i]);
         printf("\n");
+    OPENSSL_cleanse(userkey, sizeof(userkey));
+    printf("\tchar buffer key after zeroization: \n\t\t");
+    for(i = 0; i < sizeof(userkey); i++) printf("%02x", userkey[i]);
+        printf("\n");
+
+    FIPS_rsa_free(key);
 
     return 1;
     }
@@ -668,6 +917,13 @@ static size_t drbg_test_cb(DRBG_CTX *ctx, unsigned char **pout,
 	return (min_len + 0xf) & ~0xf;
 	}
 
+/* Callback which returns 0 to indicate entropy source failure */
+static size_t drbg_fail_cb(DRBG_CTX *ctx, unsigned char **pout,
+                                int entropy, size_t min_len, size_t max_len)
+	{
+	return 0;
+	}
+
 /* DRBG test: just generate lots of data and trigger health checks */
 
 static int do_drbg_test(int type, int flags)
@@ -696,7 +952,7 @@ static int do_drbg_test(int type, int flags)
 	}
     rv = 1;
     err:
-    FIPS_drbg_uninstantiate(dctx);
+    FIPS_drbg_free(dctx);
     return rv;
     }
 
@@ -758,9 +1014,13 @@ static const char * Fail(const char *msg)
     return msg; 
     }
 
-static void test_msg(const char *msg, int result)
-	{
-	printf("%s...%s\n", msg, result ? "successful" : Fail("Failed!"));
+#define test_msg(msg, rtest) \
+	{ \
+	int rv; \
+	if (verbose) \
+		printf("%s...started\n", msg); \
+	rv = rtest; \
+	printf("%s...%s\n", msg, rv ? "successful" : Fail("Failed!")); \
 	}
 
 /* Table of IDs for POST translating between NIDs and names */
@@ -821,12 +1081,17 @@ static const char *lookup_id(int id)
 static int fail_id = -1;
 static int fail_sub = -1;
 static int fail_key = -1;
+static int sub_num = -1, sub_count = -1;
+static int sub_fail_num = -1;
+
+static int st_err, post_quiet = 0;
 
 static int post_cb(int op, int id, int subid, void *ex)
 	{
 	const char *idstr, *exstr = "";
-	char asctmp[20];
+	char asctmp[20], teststr[80];
 	int keytype = -1;
+	int exp_fail = 0;
 #ifdef FIPS_POST_TIME
 	static struct timespec start, end, tstart, tend;
 #endif
@@ -938,6 +1203,21 @@ static int post_cb(int op, int id, int subid, void *ex)
 
 		}
 
+	if (fail_id == id
+		&& (fail_key == -1 || fail_key == keytype)
+		&& (fail_sub == -1 || fail_sub == subid))
+			exp_fail = 1;
+
+	if (sub_num > 0)
+		{
+		if (sub_fail_num == sub_num)
+			exp_fail = 1;
+		sprintf(teststr, "\t\t%s %s (POST subtest #%d) test",
+						idstr, exstr, sub_num);
+		}
+	else
+		sprintf(teststr, "\t\t%s %s test", idstr, exstr);
+
 	switch(op)
 		{
 		case FIPS_POST_BEGIN:
@@ -948,9 +1228,16 @@ static int post_cb(int op, int id, int subid, void *ex)
 		clock_gettime(CLOCK_REALTIME, &tstart);
 #endif
 		printf("\tPOST started\n");
+		sub_num = 1;
 		break;
 
 		case FIPS_POST_END:
+		if (sub_count == -1)
+			sub_count = sub_num;
+		else if (sub_num != sub_count)
+			printf("Inconsistent POST count %d != %d\n",
+							sub_num, sub_count);
+		sub_num = -1;
 		printf("\tPOST %s\n", id ? "Success" : "Failed");
 #ifdef FIPS_POST_TIME
 		clock_gettime(CLOCK_REALTIME, &tend);
@@ -961,14 +1248,23 @@ static int post_cb(int op, int id, int subid, void *ex)
 		break;
 
 		case FIPS_POST_STARTED:
-		printf("\t\t%s %s test started\n", idstr, exstr);
+		if (!post_quiet && !exp_fail)
+			printf("%s started\n", teststr);
 #ifdef FIPS_POST_TIME
 		clock_gettime(CLOCK_REALTIME, &start);
 #endif
 		break;
 
 		case FIPS_POST_SUCCESS:
-		printf("\t\t%s %s test OK\n", idstr, exstr);
+		if (sub_num > 0)
+			sub_num++;
+		if (exp_fail)
+			{
+			printf("%s OK but should've failed\n", teststr);
+			st_err++;
+			}
+		else if (!post_quiet)
+			printf("%s OK\n", teststr);
 #ifdef FIPS_POST_TIME
 		clock_gettime(CLOCK_REALTIME, &end);
 		printf("\t\t\tTook %f seconds\n",
@@ -978,15 +1274,21 @@ static int post_cb(int op, int id, int subid, void *ex)
 		break;
 
 		case FIPS_POST_FAIL:
-		printf("\t\t%s %s test FAILED!!\n", idstr, exstr);
+		if (sub_num > 0)
+			sub_num++;
+		if (exp_fail)
+			printf("%s failed as expected\n", teststr);
+		else
+			{
+			printf("%s Failed Incorrectly!!\n", teststr);
+			st_err++;
+			}
 		break;
 
 		case FIPS_POST_CORRUPT:
-		if (fail_id == id
-			&& (fail_key == -1 || fail_key == keytype)
-			&& (fail_sub == -1 || fail_sub == subid))
+		if (exp_fail)
 			{
-			printf("\t\t%s %s test failure induced\n", idstr, exstr);
+			printf("%s failure induced\n", teststr);
 			return 0;
 			}
 		break;
@@ -995,110 +1297,433 @@ static int post_cb(int op, int id, int subid, void *ex)
 	return 1;
 	}
 
-int main(int argc,char **argv)
+static int do_fail_all(int fullpost, int fullerr)
+	{
+	int rv;
+	size_t i;
+	int sub_fail;
+	RSA *rsa = NULL;
+	DSA *dsa = NULL;
+	DRBG_CTX *dctx = NULL, *defctx = NULL;
+	EC_KEY *ec = NULL;
+	BIGNUM *bn = NULL;
+	unsigned char key[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+	EVP_CIPHER_CTX ctx;
+	unsigned char out[10];
+	if (!fullpost)
+		post_quiet = 1;
+	if (!fullerr)
+		no_err = 1;
+	fips_module_mode_set_verbose(0, NULL);
+	for (sub_fail = 1; sub_fail < sub_count; sub_fail++)
+		{
+		sub_fail_num = sub_fail;
+		printf("    Testing induced failure of POST subtest %d\n",
+								sub_fail);
+		rv = fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS);
+		if (rv)
+			{
+			printf("\tFIPS mode incorrectly successful!!\n");
+			st_err++;
+			}
+		printf("\tAttempting crypto operation after failed POST... ");
+		FIPS_cipher_ctx_init(&ctx);
+		rv = FIPS_cipherinit(&ctx, EVP_aes_128_ecb(), key, NULL, 1);
+		if (rv > 0)
+			{
+			printf("succeeded incorrectly!!\n");
+			st_err++;
+			}
+		else
+			printf("failed as expected.\n");
+		FIPS_cipher_ctx_cleanup(&ctx);
+		}
+	sub_fail_num = -1;
+	printf("    Testing induced failure of RSA keygen test\n");
+	/* NB POST will succeed with a pairwise test failures as
+	 * it is not used during POST.
+	 */
+	fail_id = FIPS_TEST_PAIRWISE;
+	fail_key = EVP_PKEY_RSA;
+	/* Now enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+
+	rsa = FIPS_rsa_new();
+	bn = BN_new();
+ 	if (!rsa || !bn)
+		return 0;
+	BN_set_word(bn, 65537);
+	if (RSA_generate_key_ex(rsa, 2048,bn,NULL))
+		{
+		printf("\tRSA key generated OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tRSA key generation failed as expected.\n");
+
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+
+	printf("    Testing induced failure of DSA keygen test\n");
+	fail_key = EVP_PKEY_DSA;
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+	dsa = FIPS_dsa_new();
+    	if (!dsa)
+		return 0;
+	if (!DSA_generate_parameters_ex(dsa, 1024,NULL,0,NULL,NULL,NULL))
+		return 0;
+    	if (DSA_generate_key(dsa))
+		{
+		printf("\tDSA key generated OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tDSA key generation failed as expected.\n");
+
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+
+	printf("    Testing induced failure of ECDSA keygen test\n");
+	fail_key = EVP_PKEY_EC;
+
+	ec = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
+
+	if (!ec)
+		return 0;
+
+    	if (EC_KEY_generate_key(ec))
+		{
+		printf("\tECDSA key generated OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tECDSA key generation failed as expected.\n");
+
+	FIPS_ec_key_free(ec);
+	ec = NULL;
+
+	fail_id = -1;
+	fail_sub = -1;
+	fail_key = -1;
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+	/* Induce continuous PRNG failure for DRBG */
+	printf("    Testing induced failure of DRBG CPRNG test\n");
+	FIPS_drbg_stick(1);
+
+	/* Initialise a DRBG context */
+	dctx = FIPS_drbg_new(NID_sha1, 0);
+	if (!dctx)
+		return 0;
+	for (i = 0; i < sizeof(dummy_drbg_entropy); i++)
+		{
+		dummy_drbg_entropy[i] = i & 0xff;
+		}
+	FIPS_drbg_set_callbacks(dctx, drbg_test_cb, 0, 0x10, drbg_test_cb, 0);
+	if (!FIPS_drbg_instantiate(dctx, dummy_drbg_entropy, 10))
+		{
+		printf("\tDRBG instantiate error!!\n");
+		st_err++;
+		}
+	if (FIPS_drbg_generate(dctx, out, sizeof(out), 0, NULL, 0))
+		{
+		printf("\tDRBG continuous PRNG OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tDRBG continuous PRNG failed as expected\n");
+	FIPS_drbg_stick(0);
+
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+
+	FIPS_drbg_free(dctx);
+
+	/* Induce continuous PRNG failure for DRBG entropy source*/
+	printf("    Testing induced failure of DRBG entropy CPRNG test\n");
+
+	/* Initialise a DRBG context */
+	dctx = FIPS_drbg_new(NID_sha1, 0);
+	if (!dctx)
+		return 0;
+	for (i = 0; i < sizeof(dummy_drbg_entropy); i++)
+		{
+		dummy_drbg_entropy[i] = i & 0xf;
+		}
+	FIPS_drbg_set_callbacks(dctx, drbg_test_cb, 0, 0x10, drbg_test_cb, 0);
+	if (FIPS_drbg_instantiate(dctx, dummy_drbg_entropy, 10))
+		{
+		printf("\tDRBG continuous PRNG entropy OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tDRBG continuous PRNG entropy failed as expected\n");
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+	FIPS_drbg_free(dctx);
+
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+
+	printf("    Testing induced failure of X9.31 CPRNG test\n");
+	FIPS_x931_stick(1);
+	if (!FIPS_x931_set_key(dummy_drbg_entropy, 32))
+		{
+		printf("\tError initialiasing X9.31 PRNG\n");
+		st_err++;
+		}
+	if (!FIPS_x931_seed(dummy_drbg_entropy + 32, 16))
+		{
+		printf("\tError seeding X9.31 PRNG\n");
+		st_err++;
+		}
+	if (FIPS_x931_bytes(out, 10) > 0)
+		{
+		printf("\tX9.31 continuous PRNG failure OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tX9.31 continuous PRNG failed as expected\n");
+	FIPS_x931_stick(0);
+
+	/* Leave FIPS mode to clear error */
+	fips_module_mode_set_verbose(0, NULL);
+	/* Enter FIPS mode successfully */
+	if (!fips_module_mode_set_verbose(1, FIPS_AUTH_USER_PASS))
+		{
+		printf("\tError entering FIPS mode\n");
+		st_err++;
+		}
+
+	printf("    Testing operation failure with DRBG entropy failure\n");
+
+	/* Generate DSA key for later use */
+    	if (DSA_generate_key(dsa))
+		printf("\tDSA key generated OK as expected.\n");
+	else
+		{
+		printf("\tDSA key generation FAILED!!\n");
+		st_err++;
+		}
+
+	/* Initialise default DRBG context */
+	defctx = FIPS_get_default_drbg();
+	if (!defctx)
+		return 0;
+	if (!FIPS_drbg_init(defctx, NID_sha512, 0))
+		return 0;
+	/* Set entropy failure callback */
+	FIPS_drbg_set_callbacks(defctx, drbg_fail_cb, 0, 0x10, drbg_test_cb, 0);
+	if (FIPS_drbg_instantiate(defctx, dummy_drbg_entropy, 10))
+		{
+		printf("\tDRBG entropy fail OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tDRBG entropy fail failed as expected\n");
+
+	if (FIPS_dsa_sign(dsa, dummy_drbg_entropy, 5, EVP_sha256()))
+		{
+		printf("\tDSA signing OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tDSA signing failed as expected\n");
+
+	ec = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
+
+	if (!ec)
+		return 0;
+
+    	if (EC_KEY_generate_key(ec))
+		{
+		printf("\tECDSA key generated OK incorrectly!!\n");
+		st_err++;
+		}
+	else
+		printf("\tECDSA key generation failed as expected.\n");
+
+	printf("  Induced failure test completed with %d errors\n", st_err);
+	post_quiet = 0; 
+	no_err = 0;
+	BN_free(bn);
+	FIPS_rsa_free(rsa);
+	FIPS_dsa_free(dsa);
+	FIPS_ec_key_free(ec);
+	if (st_err)
+		return 0;
+	return 1;
+	}
+
+#ifdef FIPS_ALGVS
+int fips_test_suite_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
     {
+    char **args = argv + 1;
     int bad_rsa = 0, bad_dsa = 0;
     int do_rng_stick = 0;
     int do_drbg_stick = 0;
     int no_exit = 0;
-    int no_dh = 0;
+    int no_dh = 0, no_drbg = 0;
     char *pass = FIPS_AUTH_USER_PASS;
+    int fullpost = 0, fullerr = 0;
 
     FIPS_post_set_callback(post_cb);
 
+#if (defined(__arm__) || defined(__aarch64__))
+    extern unsigned int OPENSSL_armcap_P;
+    if (0 == OPENSSL_armcap_P)
+	fprintf(stderr, "Optimizations disabled\n");
+#endif
+
     printf("\tFIPS-mode test application\n");
 
     printf("\t%s\n\n", FIPS_module_version_text());
 
-    if (argv[1]) {
+    while(*args) {
         /* Corrupted KAT tests */
-        if (!strcmp(argv[1], "integrity")) {
+        if (!strcmp(*args, "integrity")) {
 	    fail_id = FIPS_TEST_INTEGRITY;
-        } else if (!strcmp(argv[1], "aes")) {
+        } else if (!strcmp(*args, "aes")) {
 	    fail_id = FIPS_TEST_CIPHER;
 	    fail_sub = NID_aes_128_ecb;	
-        } else if (!strcmp(argv[1], "aes-ccm")) {
+        } else if (!strcmp(*args, "aes-ccm")) {
 	    fail_id = FIPS_TEST_CCM;
-        } else if (!strcmp(argv[1], "aes-gcm")) {
+        } else if (!strcmp(*args, "aes-gcm")) {
 	    fail_id = FIPS_TEST_GCM;
-        } else if (!strcmp(argv[1], "aes-xts")) {
+        } else if (!strcmp(*args, "aes-xts")) {
 	    fail_id = FIPS_TEST_XTS;
-        } else if (!strcmp(argv[1], "des")) {
+        } else if (!strcmp(*args, "des")) {
 	    fail_id = FIPS_TEST_CIPHER;
 	    fail_sub = NID_des_ede3_ecb;	
-        } else if (!strcmp(argv[1], "dsa")) {
+        } else if (!strcmp(*args, "dsa")) {
 	    fail_id = FIPS_TEST_SIGNATURE;
 	    fail_key = EVP_PKEY_DSA;	
         } else if (!strcmp(argv[1], "ecdh")) {
 	    fail_id = FIPS_TEST_ECDH;
-        } else if (!strcmp(argv[1], "ecdsa")) {
+        } else if (!strcmp(*args, "ecdsa")) {
 	    fail_id = FIPS_TEST_SIGNATURE;
 	    fail_key = EVP_PKEY_EC;	
-        } else if (!strcmp(argv[1], "rsa")) {
+        } else if (!strcmp(*args, "rsa")) {
 	    fail_id = FIPS_TEST_SIGNATURE;
 	    fail_key = EVP_PKEY_RSA;	
-        } else if (!strcmp(argv[1], "rsakey")) {
+        } else if (!strcmp(*args, "rsakey")) {
             printf("RSA key generation and signature validation with corrupted key...\n");
 	    bad_rsa = 1;
 	    no_exit = 1;
-        } else if (!strcmp(argv[1], "rsakeygen")) {
+        } else if (!strcmp(*args, "rsakeygen")) {
 	    fail_id = FIPS_TEST_PAIRWISE;
 	    fail_key = EVP_PKEY_RSA;
 	    no_exit = 1;
-        } else if (!strcmp(argv[1], "dsakey")) {
+        } else if (!strcmp(*args, "dsakey")) {
             printf("DSA key generation and signature validation with corrupted key...\n");
 	    bad_dsa = 1;
 	    no_exit = 1;
-        } else if (!strcmp(argv[1], "dsakeygen")) {
+        } else if (!strcmp(*args, "dsakeygen")) {
 	    fail_id = FIPS_TEST_PAIRWISE;
 	    fail_key = EVP_PKEY_DSA;
 	    no_exit = 1;
-        } else if (!strcmp(argv[1], "sha1")) {
+        } else if (!strcmp(*args, "sha1")) {
 	    fail_id = FIPS_TEST_DIGEST;
-        } else if (!strcmp(argv[1], "hmac")) {
+        } else if (!strcmp(*args, "hmac")) {
 	    fail_id = FIPS_TEST_HMAC;
-        } else if (!strcmp(argv[1], "cmac")) {
+        } else if (!strcmp(*args, "cmac")) {
 	    fail_id = FIPS_TEST_CMAC;
-	} else if (!strcmp(argv[1], "drbg")) {
+	} else if (!strcmp(*args, "drbg")) {
 	    fail_id = FIPS_TEST_DRBG;
 	} else if (!strcmp(argv[1], "rng")) {
 	    fail_id = FIPS_TEST_X931;
-	} else if (!strcmp(argv[1], "nodh")) {
+	} else if (!strcmp(*args, "nodrbg")) {
+	    no_drbg = 1;
+	    no_exit = 1;
+	} else if (!strcmp(*args, "nodh")) {
 	    no_dh = 1;
 	    no_exit = 1;
-	} else if (!strcmp(argv[1], "post")) {
+	} else if (!strcmp(*args, "post")) {
 	    fail_id = -1;
-	} else if (!strcmp(argv[1], "rngstick")) {
+	} else if (!strcmp(*args, "rngstick")) {
 	    do_rng_stick = 1;
 	    no_exit = 1;
 	    printf("RNG test with stuck continuous test...\n");
-	} else if (!strcmp(argv[1], "drbgentstick")) {
+	} else if (!strcmp(*args, "drbgentstick")) {
 		do_entropy_stick();
-	} else if (!strcmp(argv[1], "drbgstick")) {
+	} else if (!strcmp(*args, "drbgstick")) {
 	    do_drbg_stick = 1;
 	    no_exit = 1;
 	    printf("DRBG test with stuck continuous test...\n");
-	} else if (!strcmp(argv[1], "user")) {
+	} else if (!strcmp(*args, "user")) {
 		pass = FIPS_AUTH_USER_PASS;
-	} else if (!strcmp(argv[1], "officer")) {
+	} else if (!strcmp(*args, "officer")) {
 		pass = FIPS_AUTH_OFFICER_PASS;
-	} else if (!strcmp(argv[1], "badpass")) {
+	} else if (!strcmp(*args, "badpass")) {
 		pass = "bad invalid password";
-	} else if (!strcmp(argv[1], "nopass")) {
+	} else if (!strcmp(*args, "nopass")) {
 		pass = "";
+	} else if (!strcmp(*args, "fullpost")) {
+		fullpost = 1;
+	    	no_exit = 1;
+	} else if (!strcmp(*args, "fullerr")) {
+		fullerr = 1;
+	    	no_exit = 1;
+	} else if (!strcmp(*args, "verbose")) {
+		verbose = 1;
+		no_exit = 1;
         } else {
-            printf("Bad argument \"%s\"\n", argv[1]);
-            exit(1);
+            printf("Bad argument \"%s\"\n", *args);
+            return 1;
         }
-	if (!no_exit) {
+    args++;
+    }
+
+    if ((argc != 1) && !no_exit) {
     		fips_algtest_init_nofips();
-        	if (!FIPS_module_mode_set(1, pass)) {
+        	if (!fips_module_mode_set_verbose(1, pass)) {
         	    printf("Power-up self test failed\n");
-		    exit(1);
+		    return 1;
 		}
         	printf("Power-up self test successful\n");
-        	exit(0);
-	}
+        	return 0;
     }
 
     fips_algtest_init_nofips();
@@ -1114,13 +1739,15 @@ int main(int argc,char **argv)
     /* Power-up self test
     */
     ERR_clear_error();
-    test_msg("2. Automatic power-up self test", FIPS_module_mode_set(1, pass));
+    test_msg("2a. Automatic power-up self test", fips_module_mode_set_verbose(1, pass));
     if (!FIPS_module_mode())
-	exit(1);
+	return 1;
     if (do_drbg_stick)
-            FIPS_drbg_stick();
+            FIPS_drbg_stick(1);
     if (do_rng_stick)
-            FIPS_x931_stick();
+            FIPS_x931_stick(1);
+
+    test_msg("2b. On demand self test", FIPS_selftest());
 
     /* AES encryption/decryption
     */
@@ -1216,9 +1843,18 @@ int main(int argc,char **argv)
 					: Fail("failed INCORRECTLY!") );
 
     printf("12. DRBG generation check...\n");
-    printf("\t%s\n", do_drbg_all() ? "successful as expected"
+    if (no_drbg)
+	printf("\tskipped\n");
+    else
+    	printf("\t%s\n", do_drbg_all() ? "successful as expected"
 					: Fail("failed INCORRECTLY!") );
 
+    test_msg("13. ECDSA key generation and signature validation",
+    						FIPS_ecdsa_test());
+
+    printf("14. Induced test failure check...\n");
+    printf("\t%s\n", do_fail_all(fullpost, fullerr) ? "successful as expected"
+					: Fail("failed INCORRECTLY!") );
     printf("\nAll tests completed with %d errors\n", Error);
     return Error ? 1 : 0;
     }
diff --git a/fips/fips_utl.h b/fips/fips_utl.h
index 1ed133c5c9..491bc2ace9 100644
--- a/fips/fips_utl.h
+++ b/fips/fips_utl.h
@@ -47,6 +47,9 @@
  *
  */
 
+#ifndef FIPS_UTL_H
+#define FIPS_UTL_H
+
 #define OPENSSL_FIPSAPI
 
 #include <openssl/fips_rand.h>
@@ -487,3 +490,5 @@ int fips_strcasecmp(const char *str1, const char *str2)
 	return fips_strncasecmp(str1, str2, (size_t)-1);
 	}
 
+
+#endif
diff --git a/fips/fipsalgtest.pl b/fips/fipsalgtest.pl
index cd6ba8c116..2e31335ae9 100644
--- a/fips/fipsalgtest.pl
+++ b/fips/fipsalgtest.pl
@@ -495,6 +495,7 @@ my $onedir = 0;
 my $filter = "";
 my $tvdir;
 my $tprefix;
+my $sfprefix = "";
 my $debug          = 0;
 my $quiet          = 0;
 my $notest         = 0;
@@ -513,29 +514,29 @@ my $mkcmd = "mkdir";
 my $cmpall = 0;
 
 my %fips_enabled = (
-    dsa         => 1,
-    dsa2        => 2,
+    "dsa"        => 1,
+    "dsa2"       => 2,
     "dsa-pqgver"  => 2,
-    ecdsa       => 2,
-    rsa         => 1,
+    "ecdsa"      => 2,
+    "rsa"        => 1,
     "rsa-pss0"  => 2,
     "rsa-pss62" => 1,
-    sha         => 1,
-    hmac        => 1,
-    cmac        => 2,
+    "sha"        => 1,
+    "hmac"       => 1,
+    "cmac"       => 2,
     "rand-aes"  => 1,
     "rand-des2" => 0,
-    aes         => 1,
+    "aes"        => 1,
     "aes-cfb1"  => 2,
-    des3        => 1,
+    "des3"       => 1,
     "des3-cfb1" => 2,
-    drbg	=> 2,
+    "drbg"	=> 2,
     "aes-ccm"	=> 2,
     "aes-xts"	=> 2,
     "aes-gcm"	=> 2,
-    dh		=> 0,
-    ecdh	=> 2,
-    v2		=> 1,
+    "dh"	=> 0,
+    "ecdh"	=> 2,
+    "v2"	=> 1,
 );
 
 foreach (@ARGV) {
@@ -615,6 +616,9 @@ foreach (@ARGV) {
     elsif (/--script-tprefix=(.*)$/) {
         $stprefix = $1;
     }
+    elsif (/--script-fprefix=(.*)$/) {
+        $sfprefix = $1;
+    }
     elsif (/--mkdir=(.*)$/) {
         $mkcmd = $1;
     }
@@ -1017,6 +1021,10 @@ END
             $out =~ s|/req/(\S+)\.req|/$rspdir/$1.rsp|;
             my $outdir = $out;
             $outdir =~ s|/[^/]*$||;
+            if ( !-d $outdir  && ($outfile eq "" || $minimal_script)) {
+                print STDERR "DEBUG: Creating directory $outdir\n" if $debug;
+                mkdir($outdir) || die "Can't create directory $outdir";
+            }
 	    if ($outfile ne "") {
 	    	if ($win32) {
 		    $outdir =~ tr|/|\\|;
@@ -1039,12 +1047,9 @@ END
 		    }
 		$lastdir = $outdir;
 		}
-            } elsif ( !-d $outdir ) {
-                print STDERR "DEBUG: Creating directory $outdir\n" if $debug;
-                mkdir($outdir) || die "Can't create directory $outdir";
             }
         }
-        my $cmd = "$tcmd \"$req\" \"$out\"";
+        my $cmd = "$tcmd \"$sfprefix$req\" \"$sfprefix$out\"";
         print STDERR "DEBUG: running test $tname\n" if ( $debug && !$verify );
 	if ($outfile ne "") {
 	    if ($minimal_script) {
diff --git a/fips/fipsld b/fips/fipsld
index 6184e2064e..62565fd032 100755
--- a/fips/fipsld
+++ b/fips/fipsld
@@ -1,6 +1,6 @@
 #!/bin/sh -e
 #
-# Copyright (c) 2005-2007 The OpenSSL Project.
+# Copyright (c) 2005-2011 The OpenSSL Project.
 #
 # Depending on output file name, the script either embeds fingerprint
 # into libcrypto.so or static application. "Static" refers to static
@@ -127,12 +127,15 @@ lib*|*.dll)	# must be linking a shared lib...
 		"${PREMAIN_C}" \
 		${_WL_PREMAIN} "$@"
 
-	# generate signature...
-	if [ -z "${FIPS_SIG}" ]; then
-		SIG=`"${PREMAIN_DSO}" "${TARGET}"`
-	else
-		SIG=`"${FIPS_SIG}" -dso "${TARGET}"`
+	if [ "x${FIPS_SIG}" != "x" ]; then
+		# embed signature
+		"${FIPS_SIG}" "${TARGET}"
+		[ $? -ne 42 ] && exit $?
 	fi
+
+	# generate signature...
+	SIG=`"${PREMAIN_DSO}" "${TARGET}"`
+
 	/bin/rm -f "${TARGET}"
 	if [ -z "${SIG}" ]; then
 	   echo "unable to collect signature"; exit 1
@@ -172,12 +175,15 @@ lib*|*.dll)	# must be linking a shared lib...
 		"${PREMAIN_C}" \
 		${_WL_PREMAIN} "$@"
 
-	# generate signature...
-	if [ -z "${FIPS_SIG}" ]; then
-		SIG=`"${TARGET}"`
-	else
-		SIG=`"${FIPS_SIG}" -exe "${TARGET}"`
+	if [ "x${FIPS_SIG}" != "x" ]; then
+		# embed signature
+		"${FIPS_SIG}" "${TARGET}"
+		[ $? -ne 42 ] && exit $?
 	fi
+
+	# generate signature...
+	SIG=`"${TARGET}"`
+
 	/bin/rm -f "${TARGET}"
 	if [ -z "${SIG}" ]; then
 	   echo "unable to collect signature"; exit 1
diff --git a/fips/fipssyms.h b/fips/fipssyms.h
index 5b1e188785..8f04eb9dcf 100644
--- a/fips/fipssyms.h
+++ b/fips/fipssyms.h
@@ -589,6 +589,7 @@
 #define AES_encrypt fips_aes_encrypt
 #define AES_set_decrypt_key fips_aes_set_decrypt_key
 #define AES_set_encrypt_key fips_aes_set_encrypt_key
+#define AES_ctr32_encrypt fips_aes_ctr32_encrypt
 #define BN_from_montgomery fips_bn_from_montgomery
 #define BN_num_bits_word FIPS_bn_num_bits_word
 #define DES_SPtrans fips_des_sptrans
@@ -667,6 +668,67 @@
 #define bn_mul_mont_gather5 fips_bn_mul_mont_gather5
 #define bn_scatter5 fips_bn_scatter5
 #define bn_gather5 fips_bn_gather5
+#define _armv8_aes_probe _fips_armv8_aes_probe
+#define _armv8_pmull_probe _fips_armv8_pmull_probe
+#define _armv8_sha1_probe _fips_armv8_sha1_probe
+#define _armv8_sha256_probe _fips_armv8_sha256_probe
+#define aes_v8_encrypt fips_aes_v8_encrypt
+#define aes_v8_decrypt fips_aes_v8_decrypt
+#define aes_v8_set_encrypt_key fips_aes_v8_set_encrypt_key
+#define aes_v8_set_decrypt_key fips_aes_v8_set_decrypt_key
+#define aes_v8_cbc_encrypt fips_aes_v8_cbc_encrypt
+#define aes_v8_ctr32_encrypt_blocks fips_aes_v8_ctr32_encrypt_blocks
+#define gcm_init_v8 fips_gcm_init_v8
+#define gcm_gmult_v8 fips_gcm_gmult_v8
+#define gcm_ghash_v8 fips_gcm_ghash_v8
+#if defined(__APPLE__) && __ASSEMBLER__
+#define _OPENSSL_armcap_P _fips_openssl_armcap_P
+#define __armv7_neon_probe __fips_armv7_neon_probe
+#define __armv7_tick __fips_armv7_tick
+#define __armv8_aes_probe __fips_armv8_aes_probe
+#define __armv8_pmull_probe __fips_armv8_pmull_probe
+#define __armv8_sha1_probe __fips_armv8_sha1_probe
+#define __armv8_sha256_probe __fips_armv8_sha256_probe
+#define _aes_v8_encrypt _fips_aes_v8_encrypt
+#define _aes_v8_decrypt _fips_aes_v8_decrypt
+#define _aes_v8_set_encrypt_key _fips_aes_v8_set_encrypt_key
+#define _aes_v8_set_decrypt_key _fips_aes_v8_set_decrypt_key
+#define _aes_v8_cbc_encrypt _fips_aes_v8_cbc_encrypt
+#define _aes_v8_ctr32_encrypt_blocks _fips_aes_v8_ctr32_encrypt_blocks
+#define _gcm_init_v8 _fips_gcm_init_v8
+#define _gcm_gmult_v8 _fips_gcm_gmult_v8
+#define _gcm_ghash_v8 _fips_gcm_ghash_v8
+#define _sha1_block_data_order _fips_sha1_block_data_order
+#define _sha256_block_data_order _fips_sha256_block_data_order
+#define _sha512_block_data_order _fips_sha512_block_data_order
+#define _AES_decrypt _fips_aes_decrypt
+#define _AES_encrypt _fips_aes_encrypt
+#define _AES_set_decrypt_key _fips_aes_set_decrypt_key
+#define _AES_set_encrypt_key _fips_aes_set_encrypt_key
+#define _gcm_gmult_4bit _fips_gcm_gmult_4bit
+#define _gcm_ghash_4bit _fips_gcm_ghash_4bit
+#define _gcm_gmult_neon _fips_gcm_gmult_neon
+#define _gcm_ghash_neon _fips_gcm_ghash_neon
+#define _bn_GF2m_mul_2x2 _fips_bn_GF2m_mul_2x2
+#define _OPENSSL_cleanse _FIPS_openssl_cleanse
+#endif
+#define aes_p8_encrypt fips_aes_p8_encrypt
+#define aes_p8_decrypt fips_aes_p8_decrypt
+#define aes_p8_set_encrypt_key fips_aes_p8_set_encrypt_key
+#define aes_p8_set_decrypt_key fips_aes_p8_set_decrypt_key
+#define aes_p8_cbc_encrypt fips_aes_p8_cbc_encrypt
+#define aes_p8_ctr32_encrypt_blocks fips_aes_p8_ctr32_encrypt_blocks
+#define aes_p8_xts_encrypt fips_aes_p8_xts_encrypt
+#define aes_p8_xts_decrypt fips_aes_p8_xts_decrypt
+#define gcm_init_p8 fips_gcm_init_p8
+#define gcm_gmult_p8 fips_gcm_gmult_p8
+#define gcm_ghash_p8 fips_gcm_ghash_p8
+#define sha256_block_p8 fips_sha256_block_p8
+#define sha512_block_p8 fips_sha512_block_p8
+#define sha256_block_ppc fips_sha256_block_ppc
+#define sha512_block_ppc fips_sha512_block_ppc
+#define OPENSSL_ppccap_P fips_openssl_ppccap_p
+#define OPENSSL_crypto207_probe fips_openssl_crypto207_probe
 
 #if defined(_MSC_VER)
 # pragma const_seg("fipsro$b")
diff --git a/fips/hmac/fips_hmactest.c b/fips/hmac/fips_hmactest.c
index 07c18bfdfa..da9c8d7926 100644
--- a/fips/hmac/fips_hmactest.c
+++ b/fips/hmac/fips_hmactest.c
@@ -85,7 +85,11 @@ static int print_hmac(const EVP_MD *md, FILE *out,
 		unsigned char *Key, int Klen,
 		unsigned char *Msg, int Msglen, int Tlen);
 
+#ifdef FIPS_ALGVS
+int fips_hmactest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 
diff --git a/fips/rand/fips_drbg_lib.c b/fips/rand/fips_drbg_lib.c
index 1596977fd5..ee162d05eb 100644
--- a/fips/rand/fips_drbg_lib.c
+++ b/fips/rand/fips_drbg_lib.c
@@ -154,6 +154,8 @@ static size_t fips_get_entropy(DRBG_CTX *dctx, unsigned char **pout,
 	{
 	unsigned char *tout, *p;
 	size_t bl = dctx->entropy_blocklen, rv;
+	if (!dctx->get_entropy)
+		return 0;
 	if (dctx->xflags & DRBG_FLAG_TEST || !bl)
 		return dctx->get_entropy(dctx, pout, entropy, min_len, max_len);
 	rv = dctx->get_entropy(dctx, &tout, entropy + bl,
@@ -241,7 +243,7 @@ int FIPS_drbg_instantiate(DRBG_CTX *dctx,
 		goto end;
 		}
 
-	if (dctx->max_nonce > 0)
+	if (dctx->max_nonce > 0 && dctx->get_nonce)
 		{
 		noncelen = dctx->get_nonce(dctx, &nonce,
 					dctx->strength / 2,
@@ -544,9 +546,9 @@ void FIPS_drbg_set_reseed_interval(DRBG_CTX *dctx, int interval)
 
 static int drbg_stick = 0;
 
-void FIPS_drbg_stick(void)
+void FIPS_drbg_stick(int onoff)
 	{
-	drbg_stick = 1;
+	drbg_stick = onoff;
 	}
 
 /* Continuous DRBG utility function */
diff --git a/fips/rand/fips_drbg_selftest.c b/fips/rand/fips_drbg_selftest.c
index ee0561bcbe..a787323d6d 100644
--- a/fips/rand/fips_drbg_selftest.c
+++ b/fips/rand/fips_drbg_selftest.c
@@ -582,7 +582,6 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td)
 		}
 		
 	dctx->iflags &= ~DRBG_FLAG_NOERR;
-
 	if (!FIPS_drbg_uninstantiate(dctx))
 		{
 		FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_UNINSTANTIATE_ERROR);
@@ -617,28 +616,20 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td)
 		goto err;
 		}
 
-	/* Explicit reseed tests */
-
-	/* Test explicit reseed with too large additional input */
-	if (!do_drbg_init(dctx, td, &t))
-		goto err;
-
-	dctx->iflags |= DRBG_FLAG_NOERR;
-
-	if (FIPS_drbg_reseed(dctx, td->adin, dctx->max_adin + 1) > 0)
+	dctx->iflags &= ~DRBG_FLAG_NOERR;
+	if (!FIPS_drbg_uninstantiate(dctx))
 		{
-		FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_ADDITIONAL_INPUT_ERROR_UNDETECTED);
+		FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_UNINSTANTIATE_ERROR);
 		goto err;
 		}
 
-	/* Test explicit reseed with entropy source failure */
-
 	/* Check prediction resistance request fails if entropy source
 	 * failure.
 	 */
 
 	t.entlen = 0;
 
+	dctx->iflags |= DRBG_FLAG_NOERR;
 	if (FIPS_drbg_generate(dctx, randout, td->katlen, 1,
 				td->adin, td->adinlen))
 		{
@@ -680,6 +671,13 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td)
 		goto err;
 		}
 
+	dctx->iflags &= ~DRBG_FLAG_NOERR;
+	if (!FIPS_drbg_uninstantiate(dctx))
+		{
+		FIPSerr(FIPS_F_FIPS_DRBG_ERROR_CHECK, FIPS_R_UNINSTANTIATE_ERROR);
+		goto err;
+		}
+
 	/* Explicit reseed tests */
 
 	/* Test explicit reseed with too large additional input */
@@ -696,11 +694,6 @@ static int fips_drbg_error_check(DRBG_CTX *dctx, DRBG_SELFTEST_DATA *td)
 
 	/* Test explicit reseed with entropy source failure */
 
-	if (!do_drbg_init(dctx, td, &t))
-		goto err;
-
-	dctx->iflags |= DRBG_FLAG_NOERR;
-
 	t.entlen = 0;
 
 	if (FIPS_drbg_reseed(dctx, td->adin, td->adinlen) > 0)
diff --git a/fips/rand/fips_drbgvs.c b/fips/rand/fips_drbgvs.c
index 4d3f0cfee0..214e3c340a 100644
--- a/fips/rand/fips_drbgvs.c
+++ b/fips/rand/fips_drbgvs.c
@@ -76,7 +76,7 @@ int main(int argc, char **argv)
 
 #include "fips_utl.h"
 
-static int parse_md(char *str)
+static int dparse_md(char *str)
 	{
 	switch(atoi(str + 5))
 		{
@@ -115,7 +115,7 @@ static int parse_ec(char *str)
 		curve_nid = NID_secp521r1;
 	else
 		return NID_undef;
-	md_nid = parse_md(md);
+	md_nid = dparse_md(md);
 	if (md_nid == NID_undef)
 		return NID_undef;
 	return (curve_nid << 16) | md_nid;
@@ -170,17 +170,19 @@ static size_t test_nonce(DRBG_CTX *dctx, unsigned char **pout,
 	return t->noncelen;
 	}
 
-
-
+#ifdef FIPS_ALGVS
+int fips_drbgvs_main(int argc,char **argv)
+#else
 int main(int argc,char **argv)
+#endif
 	{
-	FILE *in, *out;
+	FILE *in = NULL, *out = NULL;
 	DRBG_CTX *dctx = NULL;
 	TEST_ENT t;
 	int r, nid = 0;
 	int pr = 0;
 	char buf[2048], lbuf[2048];
-	unsigned char randout[2048];
+	unsigned char *randout = NULL;
 	char *keyword = NULL, *value = NULL;
 
 	unsigned char *ent = NULL, *nonce = NULL, *pers = NULL, *adin = NULL;
@@ -240,7 +242,7 @@ int main(int argc,char **argv)
 			}
 		if (strlen(buf) > 4 && !strncmp(buf, "[SHA-", 5))
 			{
-			nid = parse_md(buf);
+			nid = dparse_md(buf);
 			if (nid == NID_undef)
 				exit(1);
 			if (drbg_type == DRBG_HMAC)
@@ -296,6 +298,8 @@ int main(int argc,char **argv)
 			else
 				exit(1);
 			}
+		if (!strcmp(keyword, "[ReturnedBitsLen"))
+			randoutlen = atoi(value) / 8;
 
 		if (!strcmp(keyword, "EntropyInput"))
 			{
@@ -325,7 +329,11 @@ int main(int argc,char **argv)
 			FIPS_drbg_set_callbacks(dctx, test_entropy, 0, 0,
 							test_nonce, 0);
 			FIPS_drbg_set_app_data(dctx, &t);
-			randoutlen = (int)FIPS_drbg_get_blocklength(dctx);
+			if (randoutlen == 0)
+				randoutlen = (int)FIPS_drbg_get_blocklength(dctx);
+			if (randout)
+				OPENSSL_free(randout);
+			randout = OPENSSL_malloc(randoutlen);
 			r = FIPS_drbg_instantiate(dctx, pers, perslen);
 			if (!r)
 				{
@@ -404,6 +412,12 @@ int main(int argc,char **argv)
 			}
 
 		}
+	if (randout)
+		OPENSSL_free(randout);
+	if (in && in != stdin)
+		fclose(in);
+	if (out && out != stdout)
+		fclose(out);
 	return 0;
 	}
 
diff --git a/fips/rand/fips_rand.c b/fips/rand/fips_rand.c
index cb9184e1f7..9904d8aa6f 100644
--- a/fips/rand/fips_rand.c
+++ b/fips/rand/fips_rand.c
@@ -66,7 +66,7 @@
 #include <openssl/aes.h>
 #include <openssl/err.h>
 #include <openssl/fips_rand.h>
-#if !(defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS))
+#if !(defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYSNAME_DSPBIOS))
 # include <sys/time.h>
 #endif
 #if defined(OPENSSL_SYS_VXWORKS)
@@ -114,9 +114,9 @@ static FIPS_PRNG_CTX sctx;
 
 static int fips_prng_fail = 0;
 
-void FIPS_x931_stick(void)
+void FIPS_x931_stick(int onoff)
 	{
-	fips_prng_fail = 1;
+	fips_prng_fail = onoff;
 	}
 
 static void fips_rand_prng_reset(FIPS_PRNG_CTX *ctx)
@@ -232,8 +232,13 @@ void FIPS_get_timevec(unsigned char *buf, unsigned long *pctr)
 	{
 #ifdef OPENSSL_SYS_WIN32
 	FILETIME ft;
+#ifdef _WIN32_WCE
+	SYSTEMTIME t;
+#endif
 #elif defined(OPENSSL_SYS_VXWORKS)
         struct timespec ts;
+#elif defined(OPENSSL_SYSNAME_DSPBIOS)
+	unsigned long long TSC, OPENSSL_rdtsc();
 #else
 	struct timeval tv;
 #endif
@@ -243,7 +248,12 @@ void FIPS_get_timevec(unsigned char *buf, unsigned long *pctr)
 #endif
 
 #ifdef OPENSSL_SYS_WIN32
+#ifdef _WIN32_WCE
+	GetSystemTime(&t);
+	SystemTimeToFileTime(&t, &ft);
+#else
 	GetSystemTimeAsFileTime(&ft);
+#endif
 	buf[0] = (unsigned char) (ft.dwHighDateTime & 0xff);
 	buf[1] = (unsigned char) ((ft.dwHighDateTime >> 8) & 0xff);
 	buf[2] = (unsigned char) ((ft.dwHighDateTime >> 16) & 0xff);
@@ -262,6 +272,16 @@ void FIPS_get_timevec(unsigned char *buf, unsigned long *pctr)
 	buf[5] = (unsigned char) ((ts.tv_nsec >> 8) & 0xff);
 	buf[6] = (unsigned char) ((ts.tv_nsec >> 16) & 0xff);
 	buf[7] = (unsigned char) ((ts.tv_nsec >> 24) & 0xff);
+#elif defined(OPENSSL_SYSNAME_DSPBIOS)
+	TSC = OPENSSL_rdtsc();
+	buf[0] = (unsigned char) (TSC & 0xff);
+	buf[1] = (unsigned char) ((TSC >> 8) & 0xff);
+	buf[2] = (unsigned char) ((TSC >> 16) & 0xff);
+	buf[3] = (unsigned char) ((TSC >> 24) & 0xff);
+	buf[4] = (unsigned char) ((TSC >> 32) & 0xff);
+	buf[5] = (unsigned char) ((TSC >> 40) & 0xff);
+	buf[6] = (unsigned char) ((TSC >> 48) & 0xff);
+	buf[7] = (unsigned char) ((TSC >> 56) & 0xff);
 #else
 	gettimeofday(&tv,NULL);
 	buf[0] = (unsigned char) (tv.tv_sec & 0xff);
diff --git a/fips/rand/fips_rand_selftest.c b/fips/rand/fips_rand_selftest.c
index bafce719ca..ec949cbdbb 100644
--- a/fips/rand/fips_rand_selftest.c
+++ b/fips/rand/fips_rand_selftest.c
@@ -129,15 +129,16 @@ static AES_PRNG_TV aes_256_tv =
 static int do_x931_test(unsigned char *key, int keylen,
 			AES_PRNG_TV *tv)
 	{
-	unsigned char R[16];
+	unsigned char R[16], V[16];
 	int rv = 1;
+	memcpy(V, tv->V, sizeof(V));
 	if (!FIPS_x931_set_key(key, keylen))
 		return 0;
 	if (!fips_post_started(FIPS_TEST_X931, keylen, NULL))
 		return 1;
 	if (!fips_post_corrupt(FIPS_TEST_X931, keylen, NULL))
-		tv->V[0]++;
-	FIPS_x931_seed(tv->V, 16);
+		V[0]++;
+	FIPS_x931_seed(V, 16);
 	FIPS_x931_set_dt(tv->DT);
 	FIPS_x931_bytes(R, 16);
 	if (memcmp(R, tv->R, 16))
diff --git a/fips/rand/fips_rngvs.c b/fips/rand/fips_rngvs.c
index ac0a526573..9e1f070602 100644
--- a/fips/rand/fips_rngvs.c
+++ b/fips/rand/fips_rngvs.c
@@ -198,7 +198,11 @@ static void mct(FILE *in, FILE *out)
 	}
     }
 
-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_rngvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
     {
     FILE *in, *out;
     if (argc == 4)
diff --git a/fips/rsa/fips_rsa_sign.c b/fips/rsa/fips_rsa_sign.c
index 013333e0b4..4956971f04 100644
--- a/fips/rsa/fips_rsa_sign.c
+++ b/fips/rsa/fips_rsa_sign.c
@@ -288,8 +288,11 @@ int FIPS_rsa_sign_digest(RSA *rsa, const unsigned char *md, int md_len,
 			*siglen=j;
 			}
 		psserr:
-		OPENSSL_cleanse(sbuf, i);
-		OPENSSL_free(sbuf);
+		if (sbuf)
+			{
+			OPENSSL_cleanse(sbuf, i);
+			OPENSSL_free(sbuf);
+			}
 		return ret;
 		}
 
@@ -442,4 +445,33 @@ err:
 	return(ret);
 	}
 
+int FIPS_rsa_sign(RSA *rsa, const unsigned char *msg, int msglen,
+			const EVP_MD *mhash, int rsa_pad_mode, int saltlen,
+			const EVP_MD *mgf1Hash,
+			unsigned char *sigret, unsigned int *siglen)
+	{
+	unsigned int md_len, rv;
+	unsigned char md[EVP_MAX_MD_SIZE];
+        FIPS_digest(msg, msglen, md, &md_len, mhash);
+	rv = FIPS_rsa_sign_digest(rsa, md, md_len, mhash, rsa_pad_mode,
+					saltlen, mgf1Hash, sigret, siglen);
+	OPENSSL_cleanse(md, md_len);
+	return rv;
+	}
+
+
+int FIPS_rsa_verify(RSA *rsa, const unsigned char *msg, int msglen,
+			const EVP_MD *mhash, int rsa_pad_mode, int saltlen,
+			const EVP_MD *mgf1Hash,
+			const unsigned char *sigbuf, unsigned int siglen)
+	{
+	unsigned int md_len, rv;
+	unsigned char md[EVP_MAX_MD_SIZE];
+        FIPS_digest(msg, msglen, md, &md_len, mhash);
+	rv = FIPS_rsa_verify_digest(rsa, md, md_len, mhash, rsa_pad_mode,
+					saltlen, mgf1Hash, sigbuf, siglen);
+	OPENSSL_cleanse(md, md_len);
+	return rv;
+	}
+
 #endif
diff --git a/fips/rsa/fips_rsagtest.c b/fips/rsa/fips_rsagtest.c
index 78b4531398..8342f615fb 100644
--- a/fips/rsa/fips_rsagtest.c
+++ b/fips/rsa/fips_rsagtest.c
@@ -88,7 +88,11 @@ static int rsa_printkey1(FILE *out, RSA *rsa,
 static int rsa_printkey2(FILE *out, RSA *rsa,
 		BIGNUM *Xq1, BIGNUM *Xq2, BIGNUM *Xq);
 
+#ifdef FIPS_ALGVS
+int fips_rsagtest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 
diff --git a/fips/rsa/fips_rsastest.c b/fips/rsa/fips_rsastest.c
index e0dbe2a0d7..a96f277e6a 100644
--- a/fips/rsa/fips_rsastest.c
+++ b/fips/rsa/fips_rsastest.c
@@ -85,7 +85,11 @@ static int rsa_stest(FILE *out, FILE *in, int Saltlen);
 static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst,
 		unsigned char *Msg, long Msglen, int Saltlen);
 
+#ifdef FIPS_ALGVS
+int fips_rsastest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 
@@ -321,15 +325,12 @@ static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst,
 	unsigned char *sigbuf = NULL;
 	int i, siglen, pad_mode;
 	/* EVP_PKEY structure */
-	EVP_MD_CTX ctx;
 
 	siglen = RSA_size(rsa);
 	sigbuf = OPENSSL_malloc(siglen);
 	if (!sigbuf)
 		goto error;
 
-	FIPS_md_ctx_init(&ctx);
-
 	if (Saltlen >= 0)
 		pad_mode = RSA_PKCS1_PSS_PADDING;
 	else if (Saltlen == -2)
@@ -337,16 +338,10 @@ static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst,
 	else
 		pad_mode = RSA_PKCS1_PADDING;
 
-	if (!FIPS_digestinit(&ctx, dgst))
-		goto error;
-	if (!FIPS_digestupdate(&ctx, Msg, Msglen))
-		goto error;
-	if (!FIPS_rsa_sign_ctx(rsa, &ctx, pad_mode, Saltlen, NULL,
+	if (!FIPS_rsa_sign(rsa, Msg, Msglen, dgst, pad_mode, Saltlen, NULL,
 				sigbuf, (unsigned int *)&siglen))
 		goto error;
 
-	FIPS_md_ctx_cleanup(&ctx);
-
 	fputs("S = ", out);
 
 	for (i = 0; i < siglen; i++)
@@ -358,6 +353,9 @@ static int rsa_printsig(FILE *out, RSA *rsa, const EVP_MD *dgst,
 
 	error:
 
+	if (sigbuf)
+		OPENSSL_free(sigbuf);
+
 	return ret;
 	}
 #endif
diff --git a/fips/rsa/fips_rsavtest.c b/fips/rsa/fips_rsavtest.c
index df33842691..9bfc5e688b 100644
--- a/fips/rsa/fips_rsavtest.c
+++ b/fips/rsa/fips_rsavtest.c
@@ -82,14 +82,18 @@ int main(int argc, char *argv[])
 
 #include "fips_utl.h"
 
-int rsa_test(FILE *out, FILE *in, int saltlen);
+int rsa_vtest(FILE *out, FILE *in, int saltlen);
 static int rsa_printver(FILE *out,
 		BIGNUM *n, BIGNUM *e,
 		const EVP_MD *dgst,
 		unsigned char *Msg, long Msglen,
 		unsigned char *S, long Slen, int Saltlen);
 
+#ifdef FIPS_ALGVS
+int fips_rsavtest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 
@@ -138,7 +142,7 @@ int main(int argc, char **argv)
 		goto end;
 		}
 
-	if (!rsa_test(out, in, Saltlen))
+	if (!rsa_vtest(out, in, Saltlen))
 		{
 		fprintf(stderr, "FATAL RSAVTEST file processing error\n");
 		goto end;
@@ -159,7 +163,7 @@ int main(int argc, char **argv)
 
 #define RSA_TEST_MAXLINELEN	10240
 
-int rsa_test(FILE *out, FILE *in, int Saltlen)
+int rsa_vtest(FILE *out, FILE *in, int Saltlen)
 	{
 	char *linebuf, *olinebuf, *p, *q;
 	char *keyword, *value;
@@ -319,7 +323,6 @@ static int rsa_printver(FILE *out,
 	int ret = 0, r, pad_mode;
 	/* Setup RSA and EVP_PKEY structures */
 	RSA *rsa_pubkey = NULL;
-	EVP_MD_CTX ctx;
 	unsigned char *buf = NULL;
 	rsa_pubkey = FIPS_rsa_new();
 	if (!rsa_pubkey)
@@ -329,8 +332,6 @@ static int rsa_printver(FILE *out,
 	if (!rsa_pubkey->n || !rsa_pubkey->e)
 		goto error;
 
-	FIPS_md_ctx_init(&ctx);
-
 	if (Saltlen >= 0)
 		pad_mode = RSA_PKCS1_PSS_PADDING;
 	else if (Saltlen == -2)
@@ -338,19 +339,11 @@ static int rsa_printver(FILE *out,
 	else
 		pad_mode = RSA_PKCS1_PADDING;
 
-	if (!FIPS_digestinit(&ctx, dgst))
-		goto error;
-	if (!FIPS_digestupdate(&ctx, Msg, Msglen))
-		goto error;
-
 	no_err = 1;
-	r = FIPS_rsa_verify_ctx(rsa_pubkey, &ctx,
+	r = FIPS_rsa_verify(rsa_pubkey, Msg, Msglen, dgst,
 				pad_mode, Saltlen, NULL, S, Slen);
 	no_err = 0;
 
-
-	FIPS_md_ctx_cleanup(&ctx);
-
 	if (r < 0)
 		goto error;
 
diff --git a/fips/sha/Makefile b/fips/sha/Makefile
index 9bc598301f..0878e7bf64 100644
--- a/fips/sha/Makefile
+++ b/fips/sha/Makefile
@@ -30,7 +30,8 @@ LIB=$(TOP)/libcrypto.a
 LIBSRC=fips_sha1_selftest.c
 LIBOBJ=fips_sha1_selftest.o
 
-SRC= $(LIBSRC) fips_standalone_sha1.c
+SRC= $(LIBSRC)
+PROGS= fips_standalone_sha1.c
 
 EXHEADER=
 HEADER=	
diff --git a/fips/sha/fips_shatest.c b/fips/sha/fips_shatest.c
index c14df16601..3954777a64 100644
--- a/fips/sha/fips_shatest.c
+++ b/fips/sha/fips_shatest.c
@@ -86,7 +86,11 @@ static int print_dgst(const EVP_MD *md, FILE *out,
 static int print_monte(const EVP_MD *md, FILE *out,
 		unsigned char *Seed, int SeedLen);
 
+#ifdef FIPS_ALGVS
+int fips_shatest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 
diff --git a/fips/utl/fips_enc.c b/fips/utl/fips_enc.c
index 1358b1f4a4..13ac4ac9f5 100644
--- a/fips/utl/fips_enc.c
+++ b/fips/utl/fips_enc.c
@@ -208,6 +208,7 @@ int FIPS_cipherinit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
 			break;
 
 			case EVP_CIPH_CTR_MODE:
+			ctx->num = 0;
 			/* Don't reuse IV for CTR mode */
 			if(iv)
 				memcpy(ctx->iv, iv, M_EVP_CIPHER_CTX_iv_length(ctx));
diff --git a/iOS/Makefile b/iOS/Makefile
new file mode 100644
index 0000000000..db26da6406
--- /dev/null
+++ b/iOS/Makefile
@@ -0,0 +1,76 @@
+#
+#  OpenSSL/iOS/Makefile
+#
+
+DIR=		iOS
+TOP=		..
+CC=		cc
+INCLUDES=	-I$(TOP) -I$(TOP)/include
+CFLAG=		-g -static
+MAKEFILE=	Makefile
+PERL=		perl
+RM=		rm -f
+
+EXE=incore_macho
+
+CFLAGS= $(INCLUDES) $(CFLAG)
+
+top:
+	@$(MAKE) -f $(TOP)/Makefile reflect THIS=exe
+
+exe:	fips_algvs.app/fips_algvs
+
+incore_macho:			incore_macho.c $(TOP)/crypto/sha/sha1dgst.c
+	$(HOSTCC) $(HOSTCFLAGS) -I$(TOP)/include -I$(TOP)/crypto -o $@ incore_macho.c $(TOP)/crypto/sha/sha1dgst.c
+
+fips_algvs.app/fips_algvs:	$(TOP)/test/fips_algvs.c $(TOP)/fips/fipscanister.o fopen.m incore_macho
+	FIPS_SIG=./incore_macho \
+	$(TOP)/fips/fipsld $(CFLAGS) -I$(TOP)/fips -o $@ \
+		$(TOP)/test/fips_algvs.c $(TOP)/fips/fipscanister.o \
+		fopen.m -framework Foundation || rm $@
+	codesign -f -s "iPhone Developer" --entitlements fips_algvs.app/Entitlements.plist fips_algvs.app || rm $@
+
+install:
+	@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
+	@set -e; for i in $(EXE); \
+	do  \
+	(echo installing $$i; \
+	 cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i.new; \
+	 chmod 755 $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i.new; \
+	 mv -f $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i.new $(INSTALL_PREFIX)$(INSTALLTOP)/bin/$$i ); \
+	 done;
+	@set -e; for i in $(SCRIPTS); \
+	do  \
+	(echo installing $$i; \
+	 cp $$i $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i.new; \
+	 chmod 755 $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i.new; \
+	 mv -f $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i.new $(INSTALL_PREFIX)$(OPENSSLDIR)/misc/$$i ); \
+	 done
+
+tags:
+	ctags $(SRC)
+
+tests:
+
+links:
+
+lint:
+	lint -DLINT $(INCLUDES) $(SRC)>fluff
+
+depend:
+	@if [ -z "$(THIS)" ]; then \
+	    $(MAKE) -f $(TOP)/Makefile reflect THIS=$@; \
+	else \
+	    $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(SRC); \
+	fi
+
+dclean:
+	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
+	mv -f Makefile.new $(MAKEFILE)
+
+clean:
+	rm -f *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff $(EXE)
+	rm -f fips_algvs.app/fips_algvs
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
diff --git a/iOS/fips_algvs.app/Entitlements.plist b/iOS/fips_algvs.app/Entitlements.plist
new file mode 100644
index 0000000000..929c4e96d2
--- /dev/null
+++ b/iOS/fips_algvs.app/Entitlements.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>get-task-allow</key>
+    <true/>
+</dict>
+</plist>
\ No newline at end of file
diff --git a/iOS/fips_algvs.app/Info.plist b/iOS/fips_algvs.app/Info.plist
new file mode 100644
index 0000000000..3fd8fb4290
--- /dev/null
+++ b/iOS/fips_algvs.app/Info.plist
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleName</key>
+	<string>fips_algvs</string>
+	<key>CFBundleSupportedPlatforms</key>
+	<array>
+	    <string>iPhoneOS</string>
+	</array>
+	<key>CFBundleExecutable</key>
+	<string>fips_algvs</string>
+	<key>CFBundleIdentifier</key>
+	<string>fips_algvs</string>
+	<key>CFBundleResourceSpecification</key>
+	<string>ResourceRules.plist</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>CFBundleDisplayName</key>
+	<string>fips_algvs</string>
+	<key>CFBundleVersion</key>
+	<string>1.0</string>
+</dict>
+</plist>
diff --git a/iOS/fips_algvs.app/ResourceRules.plist b/iOS/fips_algvs.app/ResourceRules.plist
new file mode 100644
index 0000000000..e7ec329dcc
--- /dev/null
+++ b/iOS/fips_algvs.app/ResourceRules.plist
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>rules</key>
+	<dict>
+		<key>.*</key>
+		<true/>
+		<key>Info.plist</key>
+		<dict>
+			<key>omit</key>
+			<true/>
+			<key>weight</key>
+			<real>10</real>
+		</dict>
+		<key>ResourceRules.plist</key>
+		<dict>
+			<key>omit</key>
+			<true/>
+			<key>weight</key>
+			<real>100</real>
+		</dict>
+	</dict>
+</dict>
+</plist>
diff --git a/iOS/fopen.m b/iOS/fopen.m
new file mode 100644
index 0000000000..8d2e790845
--- /dev/null
+++ b/iOS/fopen.m
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <Foundation/Foundation.h>
+ 
+static FILE *(*libc_fopen)(const char *, const char *) = NULL;
+
+__attribute__((constructor))
+static void pre_main(void)
+{
+    /*
+     * Pull reference to fopen(3) from libc.
+     */
+    void *handle = dlopen("libSystem.B.dylib",RTLD_LAZY);
+
+    if (handle) {
+        libc_fopen = dlsym(handle,"fopen");
+        dlclose(handle);
+    }
+
+    /*
+     * Change to Documents directory.
+     */
+    NSString *docs = [NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES) lastObject];
+
+    NSFileManager *filemgr = [NSFileManager defaultManager];
+    [filemgr changeCurrentDirectoryPath: docs];
+    [filemgr release];
+}
+
+char *mkdirhier(char *path)
+{
+    char *slash;
+    struct stat buf;
+
+    if (path[0]=='.' && path[1]=='/') path+=2;
+
+    if ((slash = strrchr(path,'/'))) {
+	*slash = '\0';
+	if (stat(path,&buf)==0) {
+	    *slash = '/';
+	    return NULL;
+	}
+	(void)mkdirhier(path);
+	mkdir (path,0777);
+	*slash = '/';
+    }
+
+    return slash;
+}
+/*
+ * Replacement fopen(3)
+ */
+FILE *fopen(const char *filename, const char *mode)
+{
+    FILE *ret;
+
+    if ((ret = (*libc_fopen)(filename,mode)) == NULL) {
+        /*
+         * If file is not present in Documents directory, try from Bundle.
+         */
+        NSString *nsspath = [NSString stringWithFormat:@"%@/%s",
+                                   [[NSBundle mainBundle] bundlePath],
+                                   filename];
+
+        if ((ret = (*libc_fopen)([nsspath cStringUsingEncoding:NSUTF8StringEncoding],mode)) == NULL &&
+	    mode[0]=='w' &&
+	    ((filename[0]!='.' && filename[0]!='/') ||
+	     (filename[0]=='.' && filename[1]=='/')) ) {
+	    /*
+	     * If not present in Bundle, create directory in Documents
+	     */
+	    char *path = strdup(filename), *slash;
+	    static int once = 1;
+
+	    if ((slash = mkdirhier(path)) && once) {
+		/*
+		 * For some reason iOS truncates first created file
+		 * upon program exit, so we create one preemptively...
+		 */
+		once = 0;
+		strcpy(slash,"/.0");
+		creat(path,0444);
+	    }
+	    free(path);
+	    ret = (*libc_fopen)(filename,mode);
+	}
+    }
+
+    return ret;
+}
diff --git a/iOS/incore_macho.c b/iOS/incore_macho.c
new file mode 100644
index 0000000000..8842764cb0
--- /dev/null
+++ b/iOS/incore_macho.c
@@ -0,0 +1,1016 @@
+/* incore_macho.c */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* ====================================================================
+ * Copyright 2011 Thursby Software Systems, Inc. All rights reserved.
+ *
+ * The portions of the attached software ("Contribution") is developed by
+ * Thursby Software Systems, Inc and is licensed pursuant to the OpenSSL
+ * open source license.
+ *
+ * The Contribution, originally written by Paul W. Nelson of
+ * Thursby Software Systems, Inc, consists of the fingerprint calculation
+ * required for the FIPS140 integrity check.
+ *
+ * No patent licenses or other rights except those expressly stated in
+ * the OpenSSL open source license shall be deemed granted or received
+ * expressly, by implication, estoppel, or otherwise.
+ *
+ * No assurances are provided by Thursby that the Contribution does not
+ * infringe the patent or other intellectual property rights of any third
+ * party or that the license provides you with all the necessary rights
+ * to make use of the Contribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. IN
+ * ADDITION TO THE DISCLAIMERS INCLUDED IN THE LICENSE, THURSBY
+ * SPECIFICALLY DISCLAIMS ANY LIABILITY FOR CLAIMS BROUGHT BY YOU OR ANY
+ * OTHER ENTITY BASED ON INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS OR
+ * OTHERWISE.
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <mach-o/loader.h>
+#include <mach-o/nlist.h>
+#include <mach-o/stab.h>
+#include <mach-o/reloc.h>
+#include <mach-o/fat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/vmparam.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <openssl/crypto.h>
+#include <openssl/sha.h>
+#include <openssl/hmac.h>
+#include <openssl/fips.h>
+
+#ifndef CPU_SUBRTPE_V7F
+# define CPU_SUBRTPE_V7F	((cpu_subtype_t) 10)
+#endif
+/* iPhone 5 and iPad 4 (A6 Processors) */
+#ifndef CPU_SUBTYPE_ARM_V7S
+# define CPU_SUBTYPE_ARM_V7S	((cpu_subtype_t) 11)
+#endif
+#ifndef CPU_SUBTYPE_ARM_V7K
+# define CPU_SUBTYPE_ARM_V7K	((cpu_subtype_t) 12)
+#endif
+#ifndef CPU_SUBTYPE_ARM_V8
+# define CPU_SUBTYPE_ARM_V8	((cpu_subtype_t) 13)
+#endif
+
+#ifndef CPU_TYPE_ARM64
+# define CPU_TYPE_ARM64	(CPU_TYPE_ARM | CPU_ARCH_ABI64)
+#endif
+
+static int gVerbosity = 0;
+
+static void hexdump(const unsigned char *buf,size_t len,
+                    unsigned long address,FILE* fp)
+{
+	unsigned long addr;
+	int i;
+	
+	addr = 0;
+	while(addr<len)
+	{
+		fprintf(fp,"%6.6lx - ",addr+address);
+		for(i=0;i<16;i++)
+		{
+			if(addr+i<len)
+				fprintf(fp,"%2.2x ",buf[addr+i]);
+			else
+				fprintf(fp,"   ");
+		}
+		fprintf(fp," \"");
+		for(i=0;i<16;i++)
+		{
+			if(addr+i<len)
+			{
+				if(isprint(buf[addr+i]) && (buf[addr+i]<0x7e) )
+					putc(buf[addr+i],fp);
+				else
+					putc('.',fp);
+			}
+		}
+		fprintf(fp,"\"\n");
+		addr += 16;
+	}
+	fflush(fp);
+}
+
+struct segment_rec;
+typedef struct section_rec {
+    char            sectname[16];
+    char            segname[16];
+    uint64_t        addr;
+    uint64_t        size;
+    uint32_t        offset;
+    uint32_t        align;
+    uint32_t        reloff;
+    uint32_t        nreloc;
+    uint32_t        flags;
+    struct segment_rec* segment;
+    struct section_rec* _next;
+} section_t;
+
+typedef struct segment_rec {
+    char            segname[16];
+    uint64_t        vmaddr;
+    uint64_t        vmsize;
+    off_t           fileoff;
+    uint64_t        filesize;
+    vm_prot_t       maxprot;
+    vm_prot_t       initprot;
+    uint32_t        nsects;
+    uint32_t        flags;
+    unsigned char*  mapped;
+    struct segment_rec* _next;
+} segment_t;
+
+typedef struct symtab_entry_rec {
+    uint32_t        n_strx;
+    uint8_t         n_type;
+    uint8_t         n_sect;
+    int16_t         n_desc;
+    uint64_t        n_value;
+    const char *    n_symbol;
+    section_t*      section;
+    unsigned char*  mapped;     /* pointer to the actual data in mapped file */
+    struct symtab_entry_rec*    _next;
+} symtab_entry_t;
+
+
+typedef struct macho_file_rec
+{
+    const char *    filename;
+    void*           mapped;
+    size_t          size;       /* number of valid bytes at 'mapped' */
+    uint32_t        align;      /* byte alignment for this arch */
+    int             isBigEndian;/* 1 if everything is byte swapped */
+    
+    cpu_type_t      cpu_type;
+    cpu_subtype_t   cpu_subtype;
+    section_t*      sec_head;
+    section_t*      sec_tail;
+    
+    segment_t*      seg_head;
+    segment_t*      seg_tail;
+    
+    symtab_entry_t* sym_head;
+    symtab_entry_t* sym_tail;
+    struct macho_file_rec   *next;
+    char *          fingerprint_computed;
+    char *          fingerprint_original;
+    
+} macho_file_t;
+
+static const char *cputype(cpu_type_t cputype, cpu_subtype_t subtype)
+{
+    const char *rval = "unknown";
+    switch( cputype )
+    {
+        case CPU_TYPE_I386: rval = "i386"; break;
+        case CPU_TYPE_X86_64: rval = "x86_64"; break;
+        case CPU_TYPE_ARM64: rval = "aarch64"; break;
+        case CPU_TYPE_ARM:
+        {
+            switch( subtype )
+            {
+                case CPU_SUBTYPE_ARM_V6:    rval = "armv6"; break;
+                case CPU_SUBTYPE_ARM_V7:    rval = "armv7"; break;
+                case CPU_SUBTYPE_ARM_V7S:   rval = "armv7s"; break;
+                case CPU_SUBTYPE_ARM_V7K:   rval = "armv7k"; break;
+                case CPU_SUBTYPE_ARM_V8:    rval = "armv8"; break;
+                default: rval = "arm"; break;
+            }
+        }
+    }
+    return rval;
+}
+
+static void *add_section( macho_file_t *macho, void *pCommand,
+                         uint8_t is64bit, struct segment_rec *segment )
+{
+    void* rval = 0;
+    uint32_t flags;
+    
+    section_t* sec = (section_t*)calloc(1, sizeof(section_t));
+    if(!sec) return NULL;
+    
+    if(is64bit)
+    {
+        struct section_64* pSec = (struct section_64*)pCommand;
+        flags = pSec->flags;
+        memcpy( sec->sectname, pSec->sectname, 16 );
+        memcpy( sec->segname, pSec->segname, 16 );
+        sec->addr = pSec->addr;
+        sec->size = pSec->size;
+        sec->offset = pSec->offset;
+        sec->align = pSec->align;
+        sec->reloff = pSec->reloff;
+        sec->nreloc = pSec->nreloc;
+        sec->flags = pSec->flags;
+        rval = pCommand + sizeof(struct section_64);
+    }
+    else
+    {
+        struct section* pSec = (struct section*)pCommand;
+        flags = pSec->flags;
+        memcpy( sec->sectname, pSec->sectname, 16 );
+        memcpy( sec->segname, pSec->segname, 16 );
+        sec->addr = pSec->addr;
+        sec->size = pSec->size;
+        sec->offset = pSec->offset;
+        sec->align = pSec->align;
+        sec->reloff = pSec->reloff;
+        sec->nreloc = pSec->nreloc;
+        sec->flags = pSec->flags;
+        rval = pCommand + sizeof(struct section);
+    }
+    if( gVerbosity > 2 )
+        fprintf(stderr, "  flags=%x\n", flags);
+    sec->segment = segment;
+    sec->_next = NULL;
+    if( macho->sec_head )
+        macho->sec_tail->_next = sec;
+    else
+        macho->sec_head = sec;
+    macho->sec_tail = sec;
+    return rval;
+}
+
+static section_t *lookup_section(macho_file_t* macho, uint32_t nsect)
+{
+    section_t *rval = macho->sec_head;
+    
+    if(nsect == 0) return NULL;
+    
+    while( rval != NULL && --nsect > 0 )
+        rval = rval->_next;
+    return rval;
+}
+
+static void *add_segment( macho_file_t *macho, void *pCommand, uint8_t is64bit )
+{
+    void *rval = 0;
+    segment_t *seg = (segment_t *)calloc(1, sizeof(segment_t));
+    
+    if(!seg)
+        return 0;
+    if(is64bit)
+    {
+        struct segment_command_64 *pSeg = (struct segment_command_64*)pCommand;
+        
+        memcpy( seg->segname, pSeg->segname, 16 );
+        seg->vmaddr = pSeg->vmaddr;
+        seg->vmsize = pSeg->vmsize;
+        seg->fileoff = pSeg->fileoff;
+        seg->filesize = pSeg->filesize;
+        seg->maxprot = pSeg->maxprot;
+        seg->initprot = pSeg->initprot;
+        seg->nsects = pSeg->nsects;
+        seg->flags = pSeg->flags;
+        rval = pCommand + sizeof(struct segment_command_64);
+    } else {
+        struct segment_command *pSeg = (struct segment_command*)pCommand;
+        
+        memcpy( seg->segname, pSeg->segname, 16 );
+        seg->vmaddr = pSeg->vmaddr;
+        seg->vmsize = pSeg->vmsize;
+        seg->fileoff = pSeg->fileoff;
+        seg->filesize = pSeg->filesize;
+        seg->maxprot = pSeg->maxprot;
+        seg->initprot = pSeg->initprot;
+        seg->nsects = pSeg->nsects;
+        seg->flags = pSeg->flags;
+        rval = pCommand + sizeof(struct segment_command);
+    }
+    seg->_next = NULL;
+    seg->mapped = macho->mapped + seg->fileoff;
+    
+    if( macho->seg_head )
+        macho->seg_tail->_next = seg;
+    else
+        macho->seg_head = seg;
+    macho->seg_tail = seg;
+    
+    if( gVerbosity > 2 )
+        fprintf(stderr, "Segment %s: flags=%x\n", seg->segname, seg->flags );
+    
+    unsigned int ii;
+    for( ii=0; ii<seg->nsects; ii++ )
+    {
+        rval = add_section(macho, rval, is64bit, seg);
+    }
+    return rval;
+}
+
+static const char *type_str(uint8_t n_type)
+{
+    static char result[16] = {};
+    int idx = 0;
+    uint8_t stab;
+    
+    memset(result, 0, sizeof(result));
+    if( n_type & N_PEXT )
+        result[idx++] = 'P';
+    if( n_type & N_EXT )
+        result[idx++] = 'E';
+    if( idx > 0 )
+        result[idx++] = ':';
+    switch( n_type & N_TYPE )
+    {
+        case N_UNDF: result[idx++] = 'U'; break;
+        case N_ABS: result[idx++] = 'A'; break;
+        case N_PBUD: result[idx++] = 'P'; break;
+        case N_SECT: result[idx++] = 'S'; break;
+        case N_INDR: result[idx++] = 'I'; break;
+        default: result[idx++] = '*'; break;
+    }
+    stab = n_type & N_STAB;
+    if( stab )
+    {
+        result[idx++] = ':';
+        result[idx++] = '0'+(stab >> 5);
+    }
+    result[idx++] = 0;
+    return result;
+}
+
+static symtab_entry_t *lookup_entry_by_name( macho_file_t *macho,
+                                            const char *name)
+{
+    symtab_entry_t *entry;
+    
+    for( entry = macho->sym_head; entry; entry = entry->_next )
+    {
+        if(strcmp(entry->n_symbol,name)==0 && (entry->n_type & N_STAB)==0 )
+        {
+            if( entry->section == NULL )
+            {
+                entry->section = lookup_section( macho, entry->n_sect );
+                if( entry->section )
+                {
+                    section_t* sec = entry->section;
+                    segment_t* seg = sec->segment;
+                    uint64_t offset = entry->n_value - seg->vmaddr;
+                    
+                    entry->mapped = seg->mapped+offset;
+                }
+                else
+                    entry = 0;
+            }
+            break;
+        }
+    }
+    return entry;
+}
+
+static void check_symtab(macho_file_t *macho,void *pCommand,uint8_t is64bit )
+{
+    
+    struct symtab_command *pSym = (struct symtab_command *)pCommand;
+    void *pS = macho->mapped + pSym->symoff;
+    unsigned int ii = 0;
+    
+    /* collect symbols */
+    for( ii=0; ii<pSym->nsyms; ii++ )
+    {
+        struct nlist *pnlist=(struct nlist*)pS;
+        symtab_entry_t *entry=(symtab_entry_t*)calloc(1,sizeof(symtab_entry_t));
+        
+        if(!entry)
+        {
+            fprintf(stderr, "out of memory!\n");
+            _exit(1);
+        }
+        entry->n_strx = pnlist->n_un.n_strx;
+        entry->n_type = pnlist->n_type;
+        entry->n_sect = pnlist->n_sect;
+        entry->n_desc = pnlist->n_desc;
+        entry->section = NULL;
+        if(is64bit)
+        {
+            struct nlist_64 *pnlist64 = (struct nlist_64*)pS;
+            
+            entry->n_value = pnlist64->n_value;
+            pS += sizeof(struct nlist_64);
+        }
+        else
+        {
+            entry->n_value = pnlist->n_value;
+            pS += sizeof(struct nlist);
+        }
+        entry->n_symbol=(const char *)macho->mapped+pSym->stroff+entry->n_strx;
+        entry->_next = NULL;
+        if( macho->sym_head )
+            macho->sym_tail->_next = entry;
+        else
+            macho->sym_head = entry;
+        macho->sym_tail = entry;
+    }
+    if( gVerbosity > 2 )
+    {
+        /* dump info */
+        symtab_entry_t* entry;
+        
+        for( entry = macho->sym_head; entry; entry=entry->_next )
+        {
+            /* only do non-debug symbols */
+            if( (entry->n_type & N_STAB) == 0 )
+                fprintf(stderr, "%32.32s %18llx type=%s, sect=%d\n",
+                        entry->n_symbol, entry->n_value,
+                        type_str(entry->n_type), entry->n_sect);
+        }
+    }
+}
+
+static int load_architecture( macho_file_t* inFile )
+{
+    /* check the header */
+    unsigned int ii;
+    void * pCurrent = inFile->mapped;
+    struct mach_header* header = (struct mach_header*)pCurrent;
+    
+    if( header->magic != MH_MAGIC && header->magic != MH_MAGIC_64 )
+    {
+        fprintf(stderr, "%s is not a mach-o file\n", inFile->filename);
+        return -1;
+    }
+    else if( header->filetype == MH_BUNDLE )
+    {
+        fprintf(stderr, "%s is not a mach-o executable file (filetype MH_BUNDLE, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename);
+        return -1;
+    }
+    else if( header->filetype == MH_DYLINKER )
+    {
+        fprintf(stderr, "%s is not a mach-o executable file (filetype MH_DYLINKER, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename);
+        return -1;
+    }
+    else if( !(header->filetype == MH_EXECUTE || header->filetype == MH_DYLIB) )
+    {
+        fprintf(stderr, "%s is not a mach-o executable file (filetype %d, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename, header->filetype);
+        return -1;
+    }
+    
+    if( gVerbosity > 1 )
+        fprintf(stderr, "loading %s(%s)\n", inFile->filename, cputype(header->cputype, header->cpusubtype));
+    
+    inFile->cpu_type = header->cputype;
+    inFile->cpu_subtype = header->cpusubtype;
+    
+    if( header->magic == MH_MAGIC )
+        pCurrent += sizeof( struct mach_header );
+    else if( header->magic == MH_MAGIC_64 )
+        pCurrent += sizeof( struct mach_header_64 );
+    for( ii=0; ii<header->ncmds; ii++ )
+    {
+        struct load_command* command = (struct load_command*)pCurrent;
+        const char * lc_name;
+        
+        switch( command->cmd )
+        {
+            case LC_SEGMENT:
+            {
+                lc_name = "LC_SEGMENT";
+                add_segment(inFile, pCurrent, header->magic == MH_MAGIC_64);
+                break;
+            }
+            case LC_SYMTAB:
+            {
+                lc_name = "LC_SYMTAB";
+                check_symtab(inFile, pCurrent, header->magic == MH_MAGIC_64 );
+                break;
+            }
+            case LC_SYMSEG: lc_name = "LC_SYMSEG"; break;
+            case LC_THREAD: lc_name = "LC_THREAD"; break;
+            case LC_UNIXTHREAD: lc_name = "LC_UNIXTHREAD"; break;
+            case LC_LOADFVMLIB: lc_name = "LC_LOADFVMLIB"; break;
+            case LC_IDFVMLIB: lc_name = "LC_IDFVMLIB"; break;
+            case LC_IDENT: lc_name = "LC_IDENT"; break;
+            case LC_FVMFILE: lc_name = "LC_FVMFILE"; break;
+            case LC_PREPAGE: lc_name = "LC_PREPAGE"; break;
+            case LC_DYSYMTAB: lc_name = "LC_DYSYMTAB"; break;
+            case LC_LOAD_DYLIB: lc_name = "LC_LOAD_DYLIB"; break;
+            case LC_ID_DYLIB: lc_name = "LC_ID_DYLIB"; break;
+            case LC_LOAD_DYLINKER: lc_name = "LC_LOAD_DYLINKER"; break;
+            case LC_ID_DYLINKER: lc_name = "LC_ID_DYLINKER"; break;
+            case LC_PREBOUND_DYLIB: lc_name = "LC_PREBOUND_DYLIB"; break;
+            case LC_ROUTINES: lc_name = "LC_ROUTINES"; break;
+            case LC_SUB_FRAMEWORK: lc_name = "LC_SUB_FRAMEWORK"; break;
+            case LC_SUB_UMBRELLA: lc_name = "LC_SUB_UMBRELLA"; break;
+            case LC_SUB_CLIENT: lc_name = "LC_SUB_CLIENT"; break;
+            case LC_SUB_LIBRARY: lc_name = "LC_SUB_LIBRARY"; break;
+            case LC_TWOLEVEL_HINTS: lc_name = "LC_TWOLEVEL_HINTS"; break;
+            case LC_PREBIND_CKSUM: lc_name = "LC_PREBIND_CKSUM"; break;
+            case LC_LOAD_WEAK_DYLIB: lc_name = "LC_LOAD_WEAK_DYLIB"; break;
+            case LC_SEGMENT_64:
+            {
+                lc_name = "LC_SEGMENT_64";
+                add_segment(inFile, pCurrent, TRUE);
+                break;
+            }
+            case LC_ROUTINES_64: lc_name = "LC_ROUTINES_64"; break;
+            case LC_UUID: lc_name = "LC_UUID"; break;
+            case LC_RPATH: lc_name = "LC_RPATH"; break;
+            case LC_CODE_SIGNATURE: lc_name = "LC_CODE_SIGNATURE"; break;
+            case LC_SEGMENT_SPLIT_INFO:
+                lc_name = "LC_SEGMENT_SPLIT_INFO"; break;
+            case LC_REEXPORT_DYLIB: lc_name = "LC_REEXPORT_DYLIB"; break;
+            case LC_LAZY_LOAD_DYLIB: lc_name = "LC_LAZY_LOAD_DYLIB"; break;
+            case LC_ENCRYPTION_INFO: lc_name = "LC_ENCRYPTION_INFO"; break;
+            case LC_DYLD_INFO: lc_name = "LC_DYLD_INFO"; break;
+            case LC_DYLD_INFO_ONLY: lc_name = "LC_DYLD_INFO_ONLY"; break;
+            case LC_LOAD_UPWARD_DYLIB: lc_name = "LC_LOAD_UPWARD_DYLIB"; break;
+            case LC_VERSION_MIN_MACOSX:
+                lc_name = "LC_VERSION_MIN_MACOSX"; break;
+            case LC_VERSION_MIN_IPHONEOS:
+                lc_name = "LC_VERSION_MIN_IPHONEOS"; break;
+            case LC_FUNCTION_STARTS: lc_name = "LC_FUNCTION_STARTS"; break;
+            case LC_DYLD_ENVIRONMENT: lc_name = "LC_DYLD_ENVIRONMENT"; break;
+            default: lc_name=NULL; break;
+        }
+        if( gVerbosity > 1 )
+        {
+            if(lc_name)
+                fprintf(stderr,"command %s: size=%d\n",lc_name,
+						command->cmdsize );
+            else
+                fprintf(stderr,"command %x, size=%d\n",command->cmd,
+						command->cmdsize);
+        }
+        pCurrent += command->cmdsize;
+    }
+    return 0;
+}
+
+#define HOSTORDER_VALUE(val) (isBigEndian ? OSSwapBigToHostInt32(val) : (val))
+
+static macho_file_t *load_file(macho_file_t *inFile)
+{
+    macho_file_t *rval = NULL;
+    void *pCurrent = inFile->mapped;
+    struct fat_header *fat = (struct fat_header *)pCurrent;
+    
+    if( fat->magic==FAT_MAGIC || fat->magic==FAT_CIGAM )
+    {
+        int isBigEndian = fat->magic == FAT_CIGAM;
+        unsigned int ii = 0;
+        struct fat_arch *pArch = NULL;
+        uint32_t nfat_arch = 0;
+        
+        pCurrent += sizeof(struct fat_header);
+        pArch = pCurrent;
+        nfat_arch = HOSTORDER_VALUE(fat->nfat_arch);
+        for( ii=0; ii<nfat_arch; ii++)
+        {
+            macho_file_t *archfile=(macho_file_t *)calloc(1,
+                                                          sizeof(macho_file_t));
+            if( archfile )
+            {
+                archfile->filename = strdup(inFile->filename);
+                archfile->mapped = inFile->mapped +
+                HOSTORDER_VALUE(pArch->offset);
+                archfile->size = HOSTORDER_VALUE(pArch->size);
+                archfile->align = HOSTORDER_VALUE(pArch->align);
+                archfile->isBigEndian = isBigEndian;
+                archfile->cpu_type = HOSTORDER_VALUE(pArch->cputype);
+                archfile->cpu_subtype = HOSTORDER_VALUE(pArch->cpusubtype);
+                if( load_architecture(archfile) == 0 )
+                {
+                    archfile->next = rval;
+                    rval = archfile;
+                }
+            }
+            else
+                return NULL;    /* no memory */
+            pArch++;
+        }
+    }
+    else
+    {
+        struct mach_header* header = (struct mach_header*)pCurrent;
+        
+        if( header->magic != MH_MAGIC && header->magic != MH_MAGIC_64 )
+        {
+            fprintf(stderr, "%s is not a mach-o file\n", inFile->filename);
+        }
+        else if( header->filetype == MH_BUNDLE )
+        {
+            fprintf(stderr, "%s is not a mach-o executable file "
+                    "(filetype MH_BUNDLE, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename);
+        }
+        else if( header->filetype == MH_DYLINKER )
+        {
+            fprintf(stderr, "%s is not a mach-o executable file "
+                    "(filetype MH_DYLINKER, should be MH_EXECUTE or MH_DYLIB)\n", inFile->filename);
+        }
+        else if( !(header->filetype == MH_EXECUTE || header->filetype == MH_DYLIB) )
+        {
+            fprintf(stderr, "%s is not a mach-o executable file "
+                    "(filetype %d should be MH_EXECUTE or MH_DYLIB)\n",
+                    inFile->filename, header->filetype );
+        }
+        if( load_architecture(inFile) == 0 )
+        {
+            inFile->next = 0;
+            rval = inFile;
+        }
+    }
+    return rval;
+}
+
+#define FIPS_SIGNATURE_SIZE 20
+#define FIPS_FINGERPRINT_SIZE 40
+
+static void debug_symbol( symtab_entry_t* sym )
+{
+    if( gVerbosity > 1 )
+    {
+        section_t* sec = sym->section;
+        segment_t* seg = sec->segment;
+        fprintf(stderr, "%-40.40s: %llx sect=%s, segment=%s prot=(%x->%x)\n",
+               	sym->n_symbol, sym->n_value, sec->sectname,
+                seg->segname, seg->initprot, seg->maxprot );
+    }
+}
+
+/*
+ * Minimalistic HMAC from fips_standalone_sha1.c
+ */
+static void hmac_init(SHA_CTX *md_ctx,SHA_CTX *o_ctx,
+		      const char *key)
+    {
+    size_t len=strlen(key);
+    int i;
+    unsigned char keymd[HMAC_MAX_MD_CBLOCK];
+    unsigned char pad[HMAC_MAX_MD_CBLOCK];
+
+    if (len > SHA_CBLOCK)
+	{
+	SHA1_Init(md_ctx);
+	SHA1_Update(md_ctx,key,len);
+	SHA1_Final(keymd,md_ctx);
+	len=20;
+	}
+    else
+	memcpy(keymd,key,len);
+    memset(&keymd[len],'\0',HMAC_MAX_MD_CBLOCK-len);
+
+    for(i=0 ; i < HMAC_MAX_MD_CBLOCK ; i++)
+	pad[i]=0x36^keymd[i];
+    SHA1_Init(md_ctx);
+    SHA1_Update(md_ctx,pad,SHA_CBLOCK);
+
+    for(i=0 ; i < HMAC_MAX_MD_CBLOCK ; i++)
+	pad[i]=0x5c^keymd[i];
+    SHA1_Init(o_ctx);
+    SHA1_Update(o_ctx,pad,SHA_CBLOCK);
+    }
+
+static void hmac_final(unsigned char *md,SHA_CTX *md_ctx,SHA_CTX *o_ctx)
+    {
+    unsigned char buf[20];
+
+    SHA1_Final(buf,md_ctx);
+    SHA1_Update(o_ctx,buf,sizeof buf);
+    SHA1_Final(md,o_ctx);
+    }
+
+static int fingerprint(macho_file_t* inFile, int addFingerprint)
+{
+    int rval = 0;
+    unsigned char signature[FIPS_SIGNATURE_SIZE];
+    char signature_string[FIPS_FINGERPRINT_SIZE+1];
+    unsigned int len = sizeof(signature);
+    const char *fingerprint = NULL;
+    int ii = 0;
+    
+#define LOOKUP_SYMBOL( symname, prot ) \
+  symtab_entry_t *symname = \
+  lookup_entry_by_name( inFile, "_" #symname ); \
+  if( ! symname ) { \
+    fprintf(stderr, "%s: Not a FIPS executable (" \
+    #symname " not found)\n", inFile->filename ); \
+    return -1;\
+  } \
+  if( (symname->section->segment->initprot & \
+    (PROT_READ|PROT_WRITE|PROT_EXEC)) != (prot) ) { \
+      fprintf(stderr, #symname \
+      " segment has the wrong protection.\n"); \
+      debug_symbol(symname);return -1;\
+  }
+    
+    LOOKUP_SYMBOL( FIPS_rodata_start, PROT_READ | PROT_EXEC );
+    LOOKUP_SYMBOL( FIPS_rodata_end, PROT_READ | PROT_EXEC );
+    LOOKUP_SYMBOL( FIPS_text_startX, PROT_READ | PROT_EXEC );
+    LOOKUP_SYMBOL( FIPS_text_endX, PROT_READ | PROT_EXEC );
+    LOOKUP_SYMBOL( FIPS_signature, PROT_WRITE | PROT_READ );
+    LOOKUP_SYMBOL( FINGERPRINT_ascii_value, PROT_READ | PROT_EXEC );
+    
+    if( gVerbosity > 1 )
+    {
+        debug_symbol( FIPS_rodata_start );
+        debug_symbol( FIPS_rodata_end );
+        debug_symbol( FIPS_text_startX );
+        debug_symbol( FIPS_text_endX );
+        debug_symbol( FIPS_signature );
+        debug_symbol( FINGERPRINT_ascii_value );
+        
+        fingerprint = (const char *)FINGERPRINT_ascii_value->mapped;
+        fprintf(stderr, "fingerprint: ");
+        for(ii=0; ii<40; ii++ )
+        {
+            if( fingerprint[ii] == 0 )
+                break;
+            putc(fingerprint[ii], stderr);
+        }
+        putc('\n', stderr);
+    }
+    
+    /* check for the prefix ? character */
+    {
+        const unsigned char * p1 = FIPS_text_startX->mapped;
+        const unsigned char * p2 = FIPS_text_endX->mapped;
+        const unsigned char * p3 = FIPS_rodata_start->mapped;
+        const unsigned char * p4 = FIPS_rodata_end->mapped;
+        static const char          FIPS_hmac_key[]="etaonrishdlcupfm";
+        SHA_CTX md_ctx,o_ctx;
+        
+	hmac_init(&md_ctx,&o_ctx,FIPS_hmac_key);
+        
+        if (p1<=p3 && p2>=p3)
+            p3=p1, p4=p2>p4?p2:p4, p1=NULL, p2=NULL;
+        else if (p3<=p1 && p4>=p1)
+            p3=p3, p4=p2>p4?p2:p4, p1=NULL, p2=NULL;
+        
+        if (p1) {
+            
+            SHA1_Update(&md_ctx,p1,(size_t)p2-(size_t)p1);
+        }
+        if (FIPS_signature->mapped>=p3 && FIPS_signature->mapped<p4)
+        {
+            /* "punch" hole */
+            SHA1_Update(&md_ctx,p3,(size_t)FIPS_signature-(size_t)p3);
+            p3 = FIPS_signature->mapped+FIPS_SIGNATURE_SIZE;
+            if (p3<p4) {
+                SHA1_Update(&md_ctx,p3,(size_t)p4-(size_t)p3);
+            }
+        }
+        else {
+            SHA1_Update(&md_ctx,p3,(size_t)p4-(size_t)p3);
+	}
+        
+        hmac_final(signature,&md_ctx,&o_ctx);
+        
+        {
+            char *pString = NULL;
+            unsigned int i = 0;
+            
+            memset( signature_string, 0, sizeof(signature_string));
+            pString = signature_string;
+            for (i=0;i<len;i++)
+            {
+                snprintf(pString, 3, "%02x",signature[i]);
+                pString+=2;
+            }
+            *pString = 0;
+        }
+    }
+    fingerprint = (char *)FINGERPRINT_ascii_value->mapped;
+    inFile->fingerprint_original = strndup(fingerprint,FIPS_FINGERPRINT_SIZE);
+    inFile->fingerprint_computed = strdup(signature_string);
+    
+    if( addFingerprint )
+    {
+        void *fp_page = NULL;
+        void *fp_end = NULL;
+        
+        if(strcmp(fingerprint,"?have to make sure this string is unique")!=0)
+        {
+            if (memcmp((char*)fingerprint, signature_string, FIPS_FINGERPRINT_SIZE)!=0)
+            {
+                fprintf(stderr,
+                        "%s(%s) original fingerprint incorrect: %s\n",
+                        inFile->filename,
+                        cputype(inFile->cpu_type, inFile->cpu_subtype),
+                        fingerprint);
+            }
+        }
+
+        fp_page = (void*)((uintptr_t)fingerprint & ~PAGE_MASK);
+        fp_end = (void*)((uintptr_t)(fingerprint+(PAGE_SIZE*2)) & ~PAGE_MASK);
+        if( mprotect( fp_page, fp_end-fp_page, PROT_READ|PROT_WRITE ) )
+        {
+            perror("Can't write the fingerprint - mprotect failed");
+            fprintf(stderr, "fp_page=%p, fp_end=%p, len=%ld\n",
+                    fp_page, fp_end, (size_t)(fp_end-fp_page));
+            rval = 1;
+        }
+        else
+        {
+            memcpy((char*)fingerprint, signature_string, FIPS_FINGERPRINT_SIZE);
+            if( msync(fp_page, (fp_end-fp_page), 0) )
+                perror("msync failed");
+        }
+        if( gVerbosity > 0 )
+            fprintf(stderr, "%s(%s) fingerprint: %s\n", inFile->filename,
+                    cputype(inFile->cpu_type,inFile->cpu_subtype),
+                    signature_string);
+    }
+    if( *fingerprint == '?' )
+    {
+        printf("%s(%s) has no fingerprint.\n", inFile->filename,
+               cputype(inFile->cpu_type, inFile->cpu_subtype));
+        rval = 2;
+    }
+    else if( strncmp( fingerprint, signature_string, FIPS_FINGERPRINT_SIZE) == 0 )
+    {
+        if( ! addFingerprint )
+            printf("%s(%s) fingerprint is correct: %s\n", inFile->filename,
+                   cputype(inFile->cpu_type, inFile->cpu_subtype),
+                   signature_string);
+    }
+    else
+    {
+        printf("%s(%s) fingerprint %.40s is not correct\n", inFile->filename,
+                   cputype(inFile->cpu_type,inFile->cpu_subtype), fingerprint);
+        printf("calculated: %s\n", signature_string);
+        rval = -1;
+    }
+    return rval;
+}
+
+static int make_fingerprint( const char * inApp, int addFingerprint )
+{
+    int rval = 1;
+    int appfd = -1;
+    if( addFingerprint )
+        appfd = open( inApp, O_RDWR );
+    if( appfd < 0 )
+    {
+        if( addFingerprint )
+            fprintf(stderr, "Can't modify %s. Verifying only.\n", inApp);
+        addFingerprint = 0;
+        appfd = open( inApp, O_RDONLY );
+    }
+    if( appfd >= 0 )
+    {
+        struct stat stbuf;
+        fstat(appfd, &stbuf);
+        void * pApp = mmap(0, (size_t)stbuf.st_size, PROT_READ,
+                           MAP_SHARED, appfd, (off_t)0);
+        if( pApp == MAP_FAILED )
+        {
+            perror(inApp);
+        }
+        else
+        {
+            macho_file_t theFile;
+            macho_file_t* architectures;
+            macho_file_t* pArchitecture;
+            
+            memset( &theFile, 0, sizeof(theFile) );
+            theFile.filename = inApp;
+            theFile.mapped = pApp;
+            architectures = load_file(&theFile);
+            for( pArchitecture = architectures; pArchitecture;
+                pArchitecture = pArchitecture->next )
+            {
+                rval = fingerprint(pArchitecture, addFingerprint);
+                if( rval && addFingerprint )
+                {
+                    printf("Failure\n");
+                    break;
+                }
+            }
+            if((rval==0) && addFingerprint)
+            {
+                printf("Fingerprint Stored\n");
+            }
+            munmap(pApp, (size_t)stbuf.st_size);
+        }
+        close(appfd);
+    }
+    else
+    {
+        fprintf(stderr, "Can't open %s\n", inApp );
+    }
+    return rval;
+}
+
+static void print_usage(const char * prog)
+{
+    fprintf(stderr, "usage:\n\t%s [--debug] [--quiet] [-exe|-dso|-dylib] executable\n", prog);
+    _exit(1);
+}
+
+int main (int argc, const char * argv[])
+{
+    const char * pname = argv[0];
+    const char * filename = NULL;
+    int addFingerprint = 1;
+    const char * verbose_env = getenv("FIPS_SIG_VERBOSE");
+    
+    if( verbose_env )
+        gVerbosity = atoi(verbose_env);
+    
+    if( gVerbosity < 0 )
+        gVerbosity = 1;
+    
+    while( --argc )
+    {
+        ++argv;
+        if( strcmp(*argv,"-exe")==0 || strcmp(*argv,"--exe")==0 ||
+            strcmp(*argv,"-dso")==0 || strcmp(*argv,"--dso")==0 ||
+            strcmp(*argv,"-dylib")==0 || strcmp(*argv,"--dylib")==0 ||
+            strcmp(*argv,"--verify")==0 )
+        {
+            if(strcmp(*argv,"--verify")==0)
+                addFingerprint=0;
+
+            if( argc > 0 )
+            {
+                filename = *++argv;
+                argc--;
+            }
+        }
+        else if(strcmp(*argv,"-d")==0 || strcmp(*argv,"-debug")==0 || strcmp(*argv,"--debug")==0)
+        {
+            if( gVerbosity < 2 )
+                gVerbosity = 2;
+            else
+                gVerbosity++;
+        }
+        else if(strcmp(*argv,"-q")==0 || strcmp(*argv,"-quiet")==0 || strcmp(*argv,"--quiet")==0)
+            gVerbosity = 0;
+        else if(strncmp(*argv,"-",1)!=0) {
+            filename = *argv;
+        }
+    }
+    
+    if( !filename )
+    {
+        print_usage(pname);
+        return 1;
+    }
+
+    if( access(filename, R_OK) )
+    {
+        fprintf(stderr, "Can't access %s\n", filename);
+        return 1;
+    }
+
+    return make_fingerprint( filename, addFingerprint );
+}
+
diff --git a/ms/do_fips.bat b/ms/do_fips.bat
index 73b0a3e8e4..357f8fc76e 100644
--- a/ms/do_fips.bat
+++ b/ms/do_fips.bat
@@ -1,7 +1,10 @@
-@echo off
+rem @echo off
 
 SET ASM=%1
 SET EXARG=
+SET MFILE=ntdll.mak
+
+if NOT X%OSVERSION% == X goto wince
 
 if NOT X%PROCESSOR_ARCHITECTURE% == X goto defined 
 
@@ -42,6 +45,14 @@ SET TARGET=VC-WIN64A
 if x%ASM% == xno-asm goto compile
 SET ASM=nasm
 
+goto compile
+
+:wince
+
+echo Auto Configuring for WinCE
+SET TARGET=VC-CE
+SET MFILE=cedll.mak
+
 :compile
 
 if x%ASM% == xno-asm SET EXARG=no-asm
@@ -52,13 +63,13 @@ echo on
 
 perl util\mkfiles.pl >MINFO
 @if ERRORLEVEL 1 goto error
-perl util\mk1mf.pl dll %ASM% %TARGET% >ms\ntdll.mak
+perl util\mk1mf.pl dll %ASM% %TARGET% >ms\%MFILE%
 @if ERRORLEVEL 1 goto error
 
-nmake -f ms\ntdll.mak clean
-nmake -f ms\ntdll.mak
+nmake -f ms\%MFILE% clean
+nmake -f ms\%MFILE%
 @if ERRORLEVEL 1 goto error
-nmake -f ms\ntdll.mak install
+nmake -f ms\%MFILE% install
 @if ERRORLEVEL 1 goto error
 
 @echo.
diff --git a/test/Makefile b/test/Makefile
index 2577d245b9..3f9770663b 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -12,6 +12,7 @@ PERL=		perl
 # KRB5 stuff
 KRB5_INCLUDES=
 LIBKRB5=
+TEST=		fips_algvs.c
 
 PEX_LIBS=
 EX_LIBS= #-lnsl -lsocket
@@ -81,6 +82,7 @@ FIPS_ECDHVS=	fips_ecdhvs
 FIPS_ECDSAVS=	fips_ecdsavs
 FIPS_TEST_SUITE=fips_test_suite
 FIPS_CMACTEST=	fips_cmactest
+FIPS_ALGVS=	fips_algvs
 
 TESTS=		alltests
 
@@ -119,7 +121,7 @@ OBJ=	$(BNTEST).o $(ECTEST).o  $(ECDSATEST).o $(ECDHTEST).o $(IDEATEST).o \
 	$(FIPS_RSASTEST).o $(FIPS_RSAGTEST).o $(FIPS_GCMTEST).o \
 	$(FIPS_DSSVS).o $(FIPS_DSATEST).o $(FIPS_RNGVS).o $(FIPS_DRBGVS).o \
 	$(FIPS_TEST_SUITE).o $(FIPS_DHVS).o $(FIPS_ECDSAVS).o \
-	$(FIPS_ECDHVS).o $(FIPS_CMACTEST).o \
+	$(FIPS_ECDHVS).o $(FIPS_CMACTEST).o $(FIPS_ALGVS).o \
 	$(EVPTEST).o $(IGETEST).o $(JPAKETEST).o
 SRC=	$(BNTEST).c $(ECTEST).c  $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \
 	$(MD2TEST).c  $(MD4TEST).c $(MD5TEST).c \
@@ -133,7 +135,7 @@ SRC=	$(BNTEST).c $(ECTEST).c  $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \
 	$(FIPS_RSASTEST).c $(FIPS_RSAGTEST).c $(FIPS_GCMTEST).c \
 	$(FIPS_DSSVS).c $(FIPS_DSATEST).c $(FIPS_RNGVS).c $(FIPS_DRBGVS).c \
 	$(FIPS_TEST_SUITE).c $(FIPS_DHVS).c $(FIPS_ECDSAVS).c \
-	$(FIPS_ECDHVS).c $(FIPS_CMACTEST).c \
+	$(FIPS_ECDHVS).c $(FIPS_CMACTEST).c $(FIPS_ALGVS).c \
 	$(EVPTEST).c $(IGETEST).c $(JPAKETEST).c
 
 EXHEADER= 
@@ -150,6 +152,8 @@ exe:	$(EXE) $(FIPSEXE) dummytest$(EXE_EXT)
 
 fipsexe:	$(FIPSEXE)
 
+fipsalgvs:	$(FIPS_ALGVS)
+
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
@@ -504,6 +508,9 @@ $(FIPS_TEST_SUITE)$(EXE_EXT): $(FIPS_TEST_SUITE).o $(DLIBCRYPTO)
 $(FIPS_CMACTEST)$(EXE_EXT): $(FIPS_CMACTEST).o $(DLIBCRYPTO)
 	@target=$(FIPS_CMACTEST); $(FIPS_BUILD_CMD)
 
+$(FIPS_ALGVS)$(EXE_EXT): $(FIPS_ALGVS).o $(DLIBCRYPTO)
+	@target=$(FIPS_ALGVS); $(FIPS_BUILD_CMD)
+
 $(RMDTEST)$(EXE_EXT): $(RMDTEST).o $(DLIBCRYPTO)
 	@target=$(RMDTEST); $(BUILD_CMD)
 
diff --git a/test/fips_algvs.c b/test/fips_algvs.c
new file mode 100644
index 0000000000..2bfd213a0e
--- /dev/null
+++ b/test/fips_algvs.c
@@ -0,0 +1,428 @@
+/* test/fips_algvs.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2011
+ */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include <openssl/crypto.h>
+#include <openssl/opensslconf.h>
+
+#ifndef OPENSSL_FIPS
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+    printf("No FIPS ALGVS support\n");
+    return 0;
+}
+#else
+
+#if defined(__vxworks)
+
+#include <taskLibCommon.h>
+#include <string.h>
+
+int fips_algvs_main(int argc, char **argv);
+#define main fips_algvs_main
+
+static int fips_algvs_argv(char *a0)
+{
+	char *argv[32] = { "fips_algvs" };
+	int argc = 1;
+	int main_ret;
+
+	if (a0) {
+		char *scan = a0, *arg = a0;
+
+		while (*scan) {
+			if (*scan++ == ' ') {
+				scan[-1] = '\0';
+				argv[argc++] = arg;
+				if (argc == (sizeof(argv)/sizeof(argv[0])-1))
+					break;
+
+				while (*scan == ' ') scan++;
+				arg = scan;
+			}
+		}
+		if (*scan == '\0') argv[argc++] = arg;
+	}
+
+	argv[argc] = NULL;
+
+	main_ret = fips_algvs_main(argc, argv);
+
+	if (a0) free(a0);
+
+	return main_ret;
+}
+
+int fips_algvs(int a0)
+{
+	return taskSpawn("fips_algvs", 100, (VX_FP_TASK | VX_SPE_TASK), 100000,
+			(FUNCPTR)fips_algvs_argv,
+			a0 ? strdup(a0) : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static FILE *fips_fopen(const char *path, const char *mode)
+{
+	char fips_path [256];
+
+	if (path[0] != '/' && strlen(path) < (sizeof(fips_path)-8)) {
+		strcpy(fips_path,"/fips0/");
+		strcat(fips_path,path);
+		return fopen(fips_path,mode);
+	}
+	return fopen(path,mode);
+}
+#define fopen fips_fopen
+#endif
+
+#define FIPS_ALGVS
+
+extern int fips_aesavs_main(int argc, char **argv);
+extern int fips_cmactest_main(int argc, char **argv);
+extern int fips_desmovs_main(int argc, char **argv);
+extern int fips_dhvs_main(int argc, char **argv);
+extern int fips_drbgvs_main(int argc,char **argv);
+extern int fips_dssvs_main(int argc, char **argv);
+extern int fips_ecdhvs_main(int argc, char **argv);
+extern int fips_ecdsavs_main(int argc, char **argv);
+extern int fips_gcmtest_main(int argc, char **argv);
+extern int fips_hmactest_main(int argc, char **argv);
+extern int fips_rngvs_main(int argc, char **argv);
+extern int fips_rsagtest_main(int argc, char **argv);
+extern int fips_rsastest_main(int argc, char **argv);
+extern int fips_rsavtest_main(int argc, char **argv);
+extern int fips_shatest_main(int argc, char **argv);
+extern int fips_test_suite_main(int argc, char **argv);
+
+#if !defined(_TMS320C6400_PLUS) && !defined(_TMS320C6400)
+#include "fips_aesavs.c"
+#include "fips_cmactest.c"
+#include "fips_desmovs.c"
+#include "fips_dhvs.c"
+#include "fips_drbgvs.c"
+#include "fips_dssvs.c"
+#include "fips_ecdhvs.c"
+#include "fips_ecdsavs.c"
+#include "fips_gcmtest.c"
+#include "fips_hmactest.c"
+#include "fips_rngvs.c"
+#include "fips_rsagtest.c"
+#include "fips_rsastest.c"
+#include "fips_rsavtest.c"
+#include "fips_shatest.c"
+#include "fips_test_suite.c"
+
+#else
+#include "aes/fips_aesavs.c"
+#include "cmac/fips_cmactest.c"
+#include "des/fips_desmovs.c"
+#include "dh/fips_dhvs.c"
+#include "rand/fips_drbgvs.c"
+#include "dsa/fips_dssvs.c"
+#include "ecdh/fips_ecdhvs.c"
+#include "ecdsa/fips_ecdsavs.c"
+#include "aes/fips_gcmtest.c"
+#include "hmac/fips_hmactest.c"
+#include "rand/fips_rngvs.c"
+#include "rsa/fips_rsagtest.c"
+#include "rsa/fips_rsastest.c"
+#include "rsa/fips_rsavtest.c"
+#include "sha/fips_shatest.c"
+#include "fips_test_suite.c"
+
+#pragma DATA_SECTION(aucCmBootDspLoad, "BootDspSection");
+volatile unsigned char aucCmBootDspLoad[8*1024];
+#endif
+
+typedef struct
+	{
+	const char *name;
+	int (*func)(int argc, char **argv);
+	} ALGVS_FUNCTION;
+
+static ALGVS_FUNCTION algvs[] = {
+	{"fips_aesavs", fips_aesavs_main}, 
+	{"fips_cmactest", fips_cmactest_main}, 
+	{"fips_desmovs", fips_desmovs_main}, 
+	{"fips_dhvs", fips_dhvs_main}, 
+	{"fips_drbgvs", fips_drbgvs_main}, 
+	{"fips_dssvs", fips_dssvs_main}, 
+	{"fips_ecdhvs", fips_ecdhvs_main}, 
+	{"fips_ecdsavs", fips_ecdsavs_main}, 
+	{"fips_gcmtest", fips_gcmtest_main}, 
+	{"fips_hmactest", fips_hmactest_main}, 
+	{"fips_rngvs", fips_rngvs_main}, 
+	{"fips_rsagtest", fips_rsagtest_main}, 
+	{"fips_rsastest", fips_rsastest_main}, 
+	{"fips_rsavtest", fips_rsavtest_main}, 
+	{"fips_shatest", fips_shatest_main}, 
+	{"fips_test_suite", fips_test_suite_main}, 
+	{NULL, 0}
+	};
+
+/* Argument parsing taken from apps/apps.c */
+
+typedef struct args_st
+	{
+	char **data;
+	int count;
+	} ARGS;
+
+static int chopup_args(ARGS *arg, char *buf, int *argc, char **argv[])
+	{
+	int num,i;
+	char *p;
+
+	*argc=0;
+	*argv=NULL;
+
+	i=0;
+	if (arg->count == 0)
+		{
+		arg->count=20;
+		arg->data=(char **)OPENSSL_malloc(sizeof(char *)*arg->count);
+		}
+	for (i=0; i<arg->count; i++)
+		arg->data[i]=NULL;
+
+	num=0;
+	p=buf;
+	for (;;)
+		{
+		/* first scan over white space */
+		if (!*p) break;
+		while (*p && ((*p == ' ') || (*p == '\t') || (*p == '\n')))
+			p++;
+		if (!*p) break;
+
+		/* The start of something good :-) */
+		if (num >= arg->count)
+			{
+			fprintf(stderr, "Too many arguments!!\n");
+			return 0;
+			}
+		arg->data[num++]=p;
+
+		/* now look for the end of this */
+		if ((*p == '\'') || (*p == '\"')) /* scan for closing quote */
+			{
+			i= *(p++);
+			arg->data[num-1]++; /* jump over quote */
+			while (*p && (*p != i))
+				p++;
+			*p='\0';
+			}
+		else
+			{
+			while (*p && ((*p != ' ') &&
+				(*p != '\t') && (*p != '\n')))
+				p++;
+
+			if (*p == '\0')
+				p--;
+			else
+				*p='\0';
+			}
+		p++;
+		}
+	*argc=num;
+	*argv=arg->data;
+	return(1);
+	}
+
+static int run_prg(int argc, char **argv)
+	{
+	ALGVS_FUNCTION *t;
+	const char *prg_name;
+	prg_name = strrchr(argv[0], '/');
+	if (prg_name)
+		prg_name++;
+	else
+		prg_name = argv[0];
+	for (t = algvs; t->name; t++)
+		{
+		if (!strcmp(prg_name, t->name))
+			return t->func(argc, argv);
+		}
+	return -100;
+	}
+
+int main(int argc, char **argv)
+	{
+	static char buf[1024];
+	char **args = argv + 1;
+	const char *sname = "fipstests.sh";
+	ARGS arg;
+	int xargc;
+	char **xargv;
+	int lineno = 0, badarg = 0;
+	int nerr = 0, quiet = 0, verbose = 0;
+	int rv;
+	FILE *in = NULL;
+#ifdef FIPS_ALGVS_MEMCHECK
+	CRYPTO_malloc_debug_init();
+	OPENSSL_init();
+	CRYPTO_set_mem_debug_options(V_CRYPTO_MDEBUG_ALL);
+	CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON);
+#endif
+
+#if defined(_TMS320C6400_PLUS)
+	SysInit();
+#endif
+
+#if (defined(__arm__) || defined(__aarch64__))
+	if (*args && !strcmp(*args, "-noaccel"))
+		{
+		extern unsigned int OPENSSL_armcap_P;
+
+		OPENSSL_armcap_P=0;
+		args++;
+		argc--;
+		}
+#endif
+	if (*args && *args[0] != '-')
+		{
+		rv = run_prg(argc - 1, args);
+#ifdef FIPS_ALGVS_MEMCHECK
+		CRYPTO_mem_leaks_fp(stderr);
+#endif
+		return rv;
+		}
+	while (!badarg && *args && *args[0] == '-')
+		{
+		if (!strcmp(*args, "-script"))
+			{
+			if (args[1])
+				{
+				args++;
+				sname = *args;
+				}
+			else
+				badarg = 1;
+			}
+		else if (!strcmp(*args, "-quiet"))
+			quiet = 1;
+		else if (!strcmp(*args, "-verbose"))
+			verbose = 1;
+		else
+			badarg = 1;
+		args++;
+		}
+
+	if (badarg)
+		{
+		fprintf(stderr, "Error processing arguments\n");
+		return 1;
+		}
+
+	in = fopen(sname, "r");
+	if (!in)
+		{
+		fprintf(stderr, "Error opening script file \"%s\"\n", sname);
+		return 1;
+		}
+
+	arg.data = NULL;
+	arg.count = 0;
+
+	while (fgets(buf, sizeof(buf), in))
+		{
+		lineno++;
+		if (!chopup_args(&arg, buf, &xargc, &xargv))
+			fprintf(stderr, "Error processing line %d\n", lineno);
+		else
+			{
+			if (!quiet)
+				{
+				int i;
+				int narg = verbose ? xargc : xargc - 2;
+				printf("Running command line:");
+				for (i = 0; i < narg; i++)
+					printf(" %s", xargv[i]);
+				printf("\n");
+				}
+			rv = run_prg(xargc, xargv);
+			if (FIPS_module_mode())
+				FIPS_module_mode_set(0, NULL);
+			if (rv != 0)
+				nerr++;
+			if (rv == -100)
+				fprintf(stderr, "ERROR: Command not found\n");
+			else if (rv != 0)
+				fprintf(stderr, "ERROR: returned %d\n", rv);
+			else if (verbose)
+				printf("\tCommand run successfully\n");
+			}
+		}
+
+	if (!quiet)
+		printf("Completed with %d errors\n", nerr);
+
+	if (arg.data)
+		OPENSSL_free(arg.data);
+
+	fclose(in);
+#ifdef FIPS_ALGVS_MEMCHECK
+	CRYPTO_mem_leaks_fp(stderr);
+#endif
+	if (nerr == 0)
+		return 0;
+	return 1;
+	}
+#endif
diff --git a/util/fips_standalone_sha1 b/util/fips_standalone_sha1
new file mode 100644
index 0000000000..ea2268cb4e
--- /dev/null
+++ b/util/fips_standalone_sha1
@@ -0,0 +1,32 @@
+#!/usr/bin/env perl
+#
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+unshift(@INC,$dir);
+require "hmac_sha1.pl";
+
+(!@ARV[0] && -f @ARGV[$#ARGV]) || die "usage: $0 [-verify] file";
+
+$verify=shift	if (@ARGV[0] eq "-verify");
+
+sysopen(FD,@ARGV[0],0) || die "$!";
+binmode(FD);
+
+my $ctx = HMAC->Init("etaonrishdlcupfm");
+
+while (read(FD,$blob,4*1024)) { $ctx->Update($blob); }
+
+close(FD);
+
+my $signature = unpack("H*",$ctx->Final());
+
+print "HMAC-SHA1(@ARGV[0])= $signature\n";
+
+if ($verify) {
+	open(FD,"<@ARGV[0].sha1") || die "$!";
+	$line = <FD>;
+	close(FD);
+	exit(0)	if ($line =~ /HMAC\-SHA1\([^\)]*\)=\s*([0-9a-f]+)/i &&
+				$1 eq $signature);
+	die "signature mismatch";
+}
diff --git a/util/fipsas.pl b/util/fipsas.pl
index fc2a759308..9dfe0d895c 100644
--- a/util/fipsas.pl
+++ b/util/fipsas.pl
@@ -37,32 +37,31 @@ while (<IN>)
 	last if (/assembler/)
 	}
 
-# Store all renames.
+# Store all renames [noting minimal length].
+my $minlen=0x10000;
 while (<IN>)
 	{
-	if (/^#define\s+(\w+)\s+(\w+)\b/)
+	if (/^#define\s+_?(\w+)\s+_?(\w+)\b/)
 		{
 		$edits{$1} = $2;
+		my $len = length($1);
+		$minlen = $len if ($len<$minlen);
 		}
 	}
 
-my ($from, $to);
+open(IN,"$target") || die "Can't open $target for reading";
 
-#rename target temporarily
-rename($target, "tmptarg.s") || die "Can't rename $target";
+@code = <IN>;	# suck in whole file
 
-#edit target
-open(IN,"tmptarg.s") || die "Can't open temporary file";
-open(OUT, ">$target") || die "Can't open output file $target";
+close IN;
 
-while (<IN>)
-{
-	while (($from, $to) = each %edits)
-		{
-		s/(\b_*)$from(\b)/$1$to$2/g;
-		}
-	print OUT $_;
-}
+open(OUT,">$target") || die "Can't open $target for writing";
+
+foreach $line (@code)
+	{
+	$line =~ s/\b(_?)(\w{$minlen,})\b/$1.($edits{$2} or $2)/geo;
+	print OUT $line;
+	}
 
 close OUT;
 
@@ -73,18 +72,5 @@ if ($runasm)
 
 	my $rv = $?;
 
-	# restore target
-	unlink $target;
-	rename "tmptarg.s", $target;
-
 	die "Error executing assembler!" if $rv != 0;
 	}
-else
-	{
-	# Don't care about target
-	unlink "tmptarg.s";
-	}
-
-
-
-
diff --git a/util/fipsdist.pl b/util/fipsdist.pl
index b191fbe41e..53f9d3e18a 100644
--- a/util/fipsdist.pl
+++ b/util/fipsdist.pl
@@ -58,7 +58,7 @@ while (<STDIN>)
 		}
 	else
 		{
-		next unless (/^(fips\/|crypto|util|test|include|ms)/);
+		next unless (/^(fips\/|crypto|util|test|include|ms|c6x)/);
 		}
 	if (/^crypto\/([^\/]+)/)
 		{
@@ -76,7 +76,7 @@ while (<STDIN>)
 		}
 	if (/^test\//)
 		{
-		next unless /Makefile/ || /dummytest.c/;
+		next unless /Makefile/ || /dummytest.c/ || /fips_algvs.c/ ;
 		}
 	print "$_\n";
 	}
diff --git a/util/fipslink.pl b/util/fipslink.pl
index 8b6fbad7d8..0f87f7dbc9 100644
--- a/util/fipslink.pl
+++ b/util/fipslink.pl
@@ -33,14 +33,24 @@ check_hash($sha1_exe, "fipscanister.lib");
 
 print "Integrity check OK\n";
 
-print "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c\n";
-system "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c";
-die "First stage Compile failure" if $? != 0;
+if (is_premain_linked(@ARGV)) {
+	print "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c\n";
+	system "$fips_cc $fips_cc_args $fips_libdir/fips_premain.c";
+	die "First stage Compile failure" if $? != 0;
+} elsif (!defined($ENV{FIPS_SIG})) {
+	die "no fips_premain.obj linked";
+}
 
 print "$fips_link @ARGV\n";
 system "$fips_link @ARGV";
 die "First stage Link failure" if $? != 0;
 
+if (defined($ENV{FIPS_SIG})) {
+	print "$ENV{FIPS_SIG} $fips_target\n";
+	system "$ENV{FIPS_SIG} $fips_target";
+	die "$ENV{FIPS_SIG} $fips_target failed" if $? != 0;
+	exit;
+}
 
 print "$fips_premain_dso $fips_target\n";
 system("$fips_premain_dso $fips_target >$fips_target.sha1");
@@ -57,11 +67,26 @@ print "$fips_cc -DHMAC_SHA1_SIG=\\\"$fips_hash\\\" $fips_cc_args $fips_libdir/fi
 system "$fips_cc -DHMAC_SHA1_SIG=\\\"$fips_hash\\\" $fips_cc_args $fips_libdir/fips_premain.c";
 die "Second stage Compile failure" if $? != 0;
 
-
 print "$fips_link @ARGV\n";
 system "$fips_link @ARGV";
 die "Second stage Link failure" if $? != 0;
 
+sub is_premain_linked
+	{
+	return 1 if (grep /fips_premain\.obj/,@_);
+	foreach (@_)
+		{
+		if (/^@(.*)/ && -f $1)
+			{
+			open FD,$1 or die "can't open $1";
+			my $ret = (grep /fips_premain\.obj/,<FD>)?1:0;
+			close FD;
+			return $ret;
+			}
+		}
+	return 0;
+	}
+
 sub check_hash
 	{
 	my ($sha1_exe, $filename) = @_;
diff --git a/util/hmac_sha1.pl b/util/hmac_sha1.pl
new file mode 100755
index 0000000000..494f7e8569
--- /dev/null
+++ b/util/hmac_sha1.pl
@@ -0,0 +1,196 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2011 The OpenSSL Project.
+#
+######################################################################
+#
+# SHA1 and HMAC in Perl by <appro@openssl.org>.
+#
+{ package SHA1;
+  use integer;
+
+    {
+    ################################### SHA1 block code generator
+    my @V = ('$A','$B','$C','$D','$E');
+    my $i;
+
+    sub XUpdate {
+      my $ret;
+	$ret="(\$T=\$W[($i-16)%16]^\$W[($i-14)%16]^\$W[($i-8)%16]^\$W[($i-3)%16],\n\t";
+	if ((1<<31)<<1) {
+	    $ret.="    \$W[$i%16]=((\$T<<1)|(\$T>>31))&0xffffffff)\n\t  ";
+	} else {
+	    $ret.="    \$W[$i%16]=(\$T<<1)|((\$T>>31)&1))\n\t  ";
+	}
+    }
+    sub tail {
+      my ($a,$b,$c,$d,$e)=@V;
+      my $ret;
+	if ((1<<31)<<1) {
+	    $ret.="(($a<<5)|($a>>27));\n\t";
+	    $ret.="$b=($b<<30)|($b>>2);	$e&=0xffffffff;	#$b&=0xffffffff;\n\t";
+	} else {
+	    $ret.="(($a<<5)|($a>>27)&0x1f);\n\t";
+	    $ret.="$b=($b<<30)|($b>>2)&0x3fffffff;\n\t";
+	}
+      $ret;
+    }
+    sub BODY_00_15 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=\$W[$i]+0x5a827999+((($c^$d)&$b)^$d)+".tail();
+    }
+    sub BODY_16_19 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x5a827999+((($c^$d)&$b)^$d)+".tail();
+    }
+    sub BODY_20_39 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x6ed9eba1+($b^$c^$d)+".tail();
+    }
+    sub BODY_40_59 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x8f1bbcdc+(($b&$c)|(($b|$c)&$d))+".tail();
+    }
+    sub BODY_60_79 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0xca62c1d6+($b^$c^$d)+".tail();
+    }
+
+    my $sha1_impl =
+    'sub block {
+	my $self = @_[0];
+	my @W    = unpack("N16",@_[1]);
+	my ($A,$B,$C,$D,$E,$T) = @{$self->{H}};
+	';
+
+	$sha1_impl.='
+	$A &= 0xffffffff;
+	$B &= 0xffffffff;
+	' if ((1<<31)<<1);
+
+	for($i=0;$i<16;$i++){ $sha1_impl.=BODY_00_15(); unshift(@V,pop(@V)); }
+	for(;$i<20;$i++)    { $sha1_impl.=BODY_16_19(); unshift(@V,pop(@V)); }
+	for(;$i<40;$i++)    { $sha1_impl.=BODY_20_39(); unshift(@V,pop(@V)); }
+	for(;$i<60;$i++)    { $sha1_impl.=BODY_40_59(); unshift(@V,pop(@V)); }
+	for(;$i<80;$i++)    { $sha1_impl.=BODY_60_79(); unshift(@V,pop(@V)); }
+
+	$sha1_impl.='
+	$self->{H}[0]+=$A;	$self->{H}[1]+=$B;	$self->{H}[2]+=$C;
+	$self->{H}[3]+=$D;	$self->{H}[4]+=$E;	}';
+
+    #print $sha1_impl,"\n";
+    eval($sha1_impl);		# generate code
+    }
+
+    sub Init {
+	my $class = shift;	# multiple instances...
+	my $self  = {};
+
+	bless $self,$class;
+	$self->{H} = [0x67452301,0xefcdab89,0x98badcfe,0x10325476,0xc3d2e1f0];
+	$self->{N} = 0;
+	return $self;
+    }
+
+    sub Update {
+	my $self = shift;
+	my $msg;
+
+	foreach $msg (@_) {
+	    my $len  = length($msg);
+	    my $num  = length($self->{buf});
+	    my $off  = 0;
+
+	    $self->{N} += $len;
+
+	    if (($num+$len)<64)
+	    {	$self->{buf} .= $msg; next;	}
+	    elsif ($num)
+	    {	$self->{buf} .= substr($msg,0,($off=64-$num));
+		$self->block($self->{buf});
+	    }
+
+	    while(($off+64) <= $len)
+	    {	$self->block(substr($msg,$off,64));
+		$off += 64;
+	    }
+
+	    $self->{buf} = substr($msg,$off);
+	}
+	return $self;
+    }
+
+    sub Final {
+	my $self = shift;
+	my $num  = length($self->{buf});
+
+	$self->{buf} .= chr(0x80); $num++;
+	if ($num>56)
+	{   $self->{buf} .= chr(0)x(64-$num);
+	    $self->block($self->{buf});
+	    $self->{buf}=undef;
+	    $num=0;
+	}
+	$self->{buf} .= chr(0)x(56-$num);
+	$self->{buf} .= pack("N2",($self->{N}>>29)&0x7,$self->{N}<<3);
+	$self->block($self->{buf});
+
+	return pack("N*",@{$self->{H}});
+    }
+
+    sub Selftest {
+	my $hash;
+
+	$hash=SHA1->Init()->Update('abc')->Final();
+	die "SHA1 test#1" if (unpack("H*",$hash) ne 'a9993e364706816aba3e25717850c26c9cd0d89d');
+
+	$hash=SHA1->Init()->Update('abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq')->Final();
+	die "SHA1 test#2" if (unpack("H*",$hash) ne '84983e441c3bd26ebaae4aa1f95129e5e54670f1');
+
+	#$hash=SHA1->Init()->Update('a'x1000000)->Final();
+	#die "SHA1 test#3" if (unpack("H*",$hash) ne '34aa973cd4c4daa4f61eeb2bdbad27316534016f');
+    }
+}
+
+{ package HMAC;
+
+    sub Init {
+	my $class = shift;
+	my $key   = shift;
+	my $self  = {};
+
+	bless $self,$class;
+
+	if (length($key)>64) {
+	    $key = SHA1->Init()->Update($key)->Final();
+	}
+	$key .= chr(0x00)x(64-length($key));
+
+	my @ikey = map($_^=0x36,unpack("C*",$key));
+	($self->{hash} = SHA1->Init())->Update(pack("C*",@ikey));
+	 $self->{okey} = pack("C*",map($_^=0x36^0x5c,@ikey));
+
+	return $self;
+    }
+
+    sub Update {
+	my $self = shift;
+	$self->{hash}->Update(@_);
+	return $self;
+    }
+
+    sub Final {
+	my $self  = shift;
+	my $ihash = $self->{hash}->Final();
+	return SHA1->Init()->Update($self->{okey},$ihash)->Final();
+    }
+
+    sub Selftest {
+	my $hmac;
+
+	$hmac = HMAC->Init('0123456789:;<=>?@ABC')->Update('Sample #2')->Final();
+	die "HMAC test" if (unpack("H*",$hmac) ne '0922d3405faa3d194f82a45830737d5cc6c75d24');
+    }
+}
+
+1;
diff --git a/util/incore b/util/incore
index 883f63ff56..bb765b1966 100755
--- a/util/incore
+++ b/util/incore
@@ -34,6 +34,7 @@
 	@e_ident{magic,class,data,version,osabi,abiver,pad}=
 		unpack("a4C*",$elf);
 
+	$!=42;		# signal fipsld to revert to two-step link
 	die "not ELF file" if ($e_ident{magic} ne chr(0177)."ELF");
 
 	my $elf_bits   = $e_ident{class}*32;	# 32 or 64
@@ -377,11 +378,11 @@ $FIPS_text_endX		= $exe->Lookup("FIPS_text_endX");
 if (!$legacy_mode) {
     if (!$FIPS_text_startX || !$FIPS_text_endX) {
 	print STDERR "@ARGV[$#ARGV] is not cross-compiler aware.\n";
-	exit(1);
+	exit(42);	# signal fipsld to revert to two-step link
     }
 
     $FINGERPRINT_ascii_value
-			= $exe->Lookup("FINGERPRINT_ascii_value")	or die;
+			= $exe->Lookup("FINGERPRINT_ascii_value");
 
 }
 if ($FIPS_text_startX && $FIPS_text_endX) {
@@ -438,9 +439,12 @@ $fingerprint = FIPS_incore_fingerprint();
 
 if ($legacy_mode) {
     print unpack("H*",$fingerprint);
-} else {
+} elsif (defined($FINGERPRINT_ascii_value)) {
     seek(FD,$FINGERPRINT_ascii_value->{st_offset},0)	or die "$!";
     print FD unpack("H*",$fingerprint)			or die "$!";
+} else {
+    seek(FD,$FIPS_signature->{st_offset},0)		or die "$!";
+    print FD $fingerprint				or die "$!";
 }
 
 close (FD);
diff --git a/util/mk1mf.pl b/util/mk1mf.pl
index af039c78ac..5c4c50ab35 100755
--- a/util/mk1mf.pl
+++ b/util/mk1mf.pl
@@ -23,6 +23,7 @@ local $fips_canister_path = "";
 my $fips_premain_dso_exe_path = "";
 my $fips_premain_c_path = "";
 my $fips_sha1_exe_path = "";
+my $fips_sha1_exe_build = 1;
 
 local $fipscanisterbuild = 0;
 
@@ -248,6 +249,10 @@ elsif (($platform eq "netware-clib") || ($platform eq "netware-libc") ||
 	$BSDSOCK=1 if ($platform eq "netware-libc-bsdsock") || ($platform eq "netware-clib-bsdsock");
 	require 'netware.pl';
 	}
+elsif ($platform =~ /^c64x/)
+	{
+	require "TI_CGTOOLS.pl";
+	}
 else
 	{
 	require "unix.pl";
@@ -500,8 +505,16 @@ if ($fips)
 	{
 	if ($fips_sha1_exe_path eq "")
 		{
-		$fips_sha1_exe_path =
-			"\$(BIN_D)${o}fips_standalone_sha1$exep";
+		$fips_sha1_exe_path = $ENV{"FIPS_SHA1_PATH"};
+		if (defined $fips_sha1_exe_path)
+			{
+			$fips_sha1_exe_build = 0;
+			}
+		else
+			{
+			$fips_sha1_exe_path =
+				"\$(BIN_D)${o}fips_standalone_sha1$exep";
+			}
 		}
 	}
 	else
@@ -545,7 +558,7 @@ if ($fips)
 
 if ($fipscanisteronly)
 	{
-	$build_targets = "\$(O_FIPSCANISTER) \$(T_EXE)";
+	$build_targets = "\$(O_FIPSCANISTER)";
 	$libs_dep = "";
 	}
 
@@ -567,9 +580,14 @@ if ($fipscanisteronly)
 	\$(CP) \"fips${o}fips_premain.c.sha1\" \"\$(INSTALLTOP)${o}lib\"
 	\$(CP) \"\$(INCO_D)${o}fips.h\" \"\$(INSTALLTOP)${o}include${o}openssl\"
 	\$(CP) \"\$(INCO_D)${o}fips_rand.h\" \"\$(INSTALLTOP)${o}include${o}openssl\"
-	\$(CP) "\$(BIN_D)${o}fips_standalone_sha1$exep" \"\$(INSTALLTOP)${o}bin\"
 	\$(CP) \"util${o}fipslink.pl\" \"\$(INSTALLTOP)${o}bin\"
 EOF
+	if ($fips_sha1_exe_build)
+		{
+		$extra_install .= <<"EOF";
+	\$(CP) "\$(BIN_D)${o}fips_standalone_sha1$exep" \"\$(INSTALLTOP)${o}bin\"
+EOF
+		}
 	}
 elsif ($shlib)
 	{
@@ -716,7 +734,7 @@ LIBS_DEP=$libs_dep
 EOF
 
 $rules=<<"EOF";
-all: banner \$(TMP_D) \$(BIN_D) \$(TEST_D) \$(LIB_D) \$(INCO_D) headers \$(FIPS_SHA1_EXE) $build_targets
+all: banner \$(TMP_D) \$(BIN_D) \$(TEST_D) \$(LIB_D) \$(INCO_D) headers $build_targets
 
 banner:
 $banner
@@ -744,7 +762,11 @@ headers: \$(HEADER) \$(EXHEADER)
 
 lib: \$(LIBS_DEP) \$(E_SHLIB)
 
-exe: \$(T_EXE) \$(BIN_D)$o\$(E_EXE)$exep
+exe: \$(BIN_D)$o\$(E_EXE)$exep
+
+build_tests: \$(T_EXE)
+
+build_algvs: \$(T_SRC) \$(BIN_D)${o}fips_algvs$exep
 
 install: all
 	\$(MKDIR) \"\$(INSTALLTOP)\"
@@ -842,10 +864,13 @@ if ($fips)
 		}
 	$rules.=&cc_compile_target("\$(OBJ_D)${o}fips_standalone_sha1$obj",
 		"fips${o}sha${o}fips_standalone_sha1.c",
-		"\$(SHLIB_CFLAGS)");
+		"\$(APP_CFLAGS)");
 	$rules.=&cc_compile_target("\$(OBJ_D)${o}\$(E_PREMAIN_DSO)$obj",
 		"fips${o}fips_premain.c",
-		"-DFINGERPRINT_PREMAIN_DSO_LOAD \$(SHLIB_CFLAGS)");
+		"-DFINGERPRINT_PREMAIN_DSO_LOAD \$(APP_CFLAGS)");
+	$rules.=&cc_compile_target("\$(OBJ_D)${o}fips_algvs$obj",
+		"test${o}fips_algvs.c",
+		"\$(APP_CFLAGS)");
 	}
 
 foreach (values %lib_nam)
@@ -878,6 +903,7 @@ EOF
 }
 
 $defs.=&do_defs("T_EXE",$test,"\$(TEST_D)",$exep);
+$defs.=&do_defs("T_SRC",$test,"\$(TMP_D)",".c");
 foreach (split(/\s+/,$test))
 	{
 	my $t_libs;
@@ -899,8 +925,11 @@ foreach (split(/\s+/,$test))
 
 	$tt="\$(OBJ_D)${o}$t${obj}";
 	$rules.=&do_link_rule("\$(TEST_D)$o$t$exep",$tt,"\$(LIBS_DEP)","$t_libs \$(EX_LIBS)", $ltype);
+	$rules.=&do_copy_rule("\$(TMP_D)",$_,".c");
 	}
 
+	$rules.=&do_link_rule("\$(TEST_D)${o}fips_algvs$exep","\$(OBJ_D)${o}fips_algvs$obj","\$(LIBS_DEP)","\$(O_FIPSCANISTER) \$(EX_LIBS)", 2) if $fips;
+
 $defs.=&do_defs("E_SHLIB",$engines . $otherlibs,"\$(ENG_D)",$shlibp);
 
 foreach (split(/\s+/,$engines))
@@ -955,20 +984,20 @@ if ($fips)
 					"\$(OBJ_D)${o}fips_start$obj",
 					"\$(FIPSOBJ)",
 					"\$(OBJ_D)${o}fips_end$obj",
-					"\$(FIPS_SHA1_EXE)", "");
+					"");
 		# FIXME
 		$rules.=&do_link_rule("\$(FIPS_SHA1_EXE)",
 					"\$(OBJ_D)${o}fips_standalone_sha1$obj \$(OBJ_D)${o}sha1dgst$obj $sha1_asm_obj",
-					"","\$(EX_LIBS)", 1);
+					"","\$(EX_LIBS)", 1) if $fips_sha1_exe_build;
 		}
 	else
 		{
 		$rules.=&do_link_rule("\$(FIPS_SHA1_EXE)",
 					"\$(OBJ_D)${o}fips_standalone_sha1$obj \$(O_FIPSCANISTER)",
-					"","", 1);
+					"","", 1) if $fips_sha1_exe_build;
 
 		}
-	$rules.=&do_link_rule("\$(PREMAIN_DSO_EXE)","\$(OBJ_D)${o}\$(E_PREMAIN_DSO)$obj \$(CRYPTOOBJ) \$(O_FIPSCANISTER)","","\$(EX_LIBS)", 1);
+	$rules.=&do_link_rule("\$(PREMAIN_DSO_EXE)","\$(OBJ_D)${o}\$(E_PREMAIN_DSO)$obj \$(CRYPTOOBJ) \$(O_FIPSCANISTER)","","\$(EX_LIBS)", 1) unless defined $ENV{"FIPS_SIG"};
 	
 	}
 
@@ -1192,6 +1221,10 @@ sub do_compile_rule
 			{
 			$ret.=&Sasm_compile_target("$to${o}$n$obj",$s,$n);
 			}
+		elsif (-f ($s="${d}${o}asm${o}${n}.asm"))
+			{
+			$ret.=&cc_compile_target("$to${o}$n$obj","$s",$ex);
+			}
 		else	{ die "no rule for $_"; }
 		}
 	return($ret);
diff --git a/util/mklink.pl b/util/mklink.pl
index 61db12c68f..72a562ecaf 100755
--- a/util/mklink.pl
+++ b/util/mklink.pl
@@ -52,6 +52,7 @@ my $to = join('/', @to_path);
 my $file;
 $symlink_exists=eval {symlink("",""); 1};
 if ($^O eq "msys") { $symlink_exists=0 };
+if ($^O eq "MSWin32") { $symlink_exists=0 };
 foreach $file (@files) {
     my $err = "";
     if ($symlink_exists) {
diff --git a/util/msincore b/util/msincore
new file mode 100755
index 0000000000..08f81be8d5
--- /dev/null
+++ b/util/msincore
@@ -0,0 +1,169 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2012 The OpenSSL Project.
+#
+# The script embeds fingerprint into Microsoft PE-COFF executable object.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+unshift(@INC,$dir);
+require "hmac_sha1.pl";
+
+######################################################################
+#
+# PE-COFF segment table parser by <appro@openssl.org>.
+#
+{ package PECOFF;
+  use FileHandle;
+
+    sub dup  { my %copy=map {$_} @_; return \%copy; }
+
+    sub Load {
+	my $class = shift;
+	my $self  = {};
+	my $FD    = FileHandle->new();	# autoclose
+	my $file  = shift;
+
+	bless $self,$class;
+
+	sysopen($FD,$file,0)		or die "$!";
+	binmode($FD);
+
+	#################################################
+	# read IMAGE_DOS_HEADER
+	#
+	read($FD,my $mz,64) or die "$!";
+	my @dos_header=unpack("a2C58V",$mz);
+
+	$!=42;		# signal fipsld to revert to two-step link
+	die "$file is not PE-COFF image" if (@dos_header[0] ne "MZ");
+
+	my $e_lfanew=pop(@dos_header);
+	seek($FD,$e_lfanew,0)		or die "$!";
+	read($FD,my $magic,4)		or die "$!";
+
+	$!=42;		# signal fipsld to revert to two-step link
+	die "$file is not PE-COFF image" if (unpack("V",$magic)!=0x4550);
+
+	#################################################
+	# read and parse COFF header...
+	#
+	read($FD,my $coff,20) or die "$!";
+
+	my %coff_header;
+	@coff_header{machine,nsects,date,syms_off,nsyms,opt,flags}=
+		unpack("v2V3v2",$coff);
+
+	my $strings;
+	my $symsize;
+
+	#################################################
+	# load strings table
+	#
+	if ($coff_header{syms_off}) {
+	    seek($FD,$coff_header{syms_off}+18*$coff_header{nsyms},0) or die "$!";
+	    read($FD,$strings,4) or die "$!";
+	    $symsize = unpack("V",$strings);
+	    read($FD,$strings,$symsize,4) or die "$!";
+	}
+
+	#################################################
+	# read sections
+	#
+	my $i;
+
+	# seek to section headers
+	seek($FD,$e_lfanew+24+@coff_header{opt},0) or die "$!";
+
+	for ($i=0;$i<$coff_header{nsects};$i++) {
+	    my %coff_shdr;
+	    my $name;
+
+	    read($FD,my $section,40) or die "$!";
+
+	    @coff_shdr{sh_name,sh_vsize,sh_vaddr,
+	    		sh_rawsize,sh_offset,sh_relocs,sh_lines,
+			sh_nrelocls,sh_nlines,sh_flags} =
+		unpack("a8V6v2V",$section);
+
+	    $name = $coff_shdr{sh_name};
+	    # see if sh_name is an offset in $strings
+	    my ($hi,$lo) = unpack("V2",$name);
+	    if ($hi==0 && $lo<$symsize) {
+		$name = substr($strings,$lo,64);
+	    }
+	    $name = (split(chr(0),$name))[0];
+	    $coff_shdr{sh_name} = $name;
+
+	    $self->{sections}{$name} = dup(%coff_shdr);
+	}
+
+	return $self;
+    }
+
+    sub Lookup {
+	my $self = shift;
+	my $name = shift;
+	return $self->{sections}{$name};
+    }
+}
+
+######################################################################
+#
+# main()
+#
+my $legacy_mode;
+
+if ($#ARGV<0 || ($#ARGV>0 && !($legacy_mode=(@ARGV[0] =~ /^\-(dso|exe)$/)))) {
+	print STDERR "usage: $0 [-dso|-exe] pe-coff-binary\n";
+	exit(1);
+}
+
+$exe = PECOFF->Load(@ARGV[$#ARGV]);
+
+sysopen(FD,@ARGV[$#ARGV],$legacy_mode?0:2) or die "$!";	# 2 is read/write
+binmode(FD);
+
+sub FIPS_incore_fingerprint {
+  my $ctx = HMAC->Init("etaonrishdlcupfm");
+  my ($beg,$end);
+  my $sect;
+
+    $sect = $exe->Lookup("fipstx")	or die "no fipstx section";
+
+    seek(FD,$sect->{sh_offset},0)	or die "$!";
+    read(FD,$blob,$sect->{sh_vsize})	or die "$!";
+
+    ($beg = index($blob,"SPIFxet_ts_tXtra")) >= 0
+					or die "no FIPS_text_startX";
+    ($end = rindex($blob,"SPIFxet_ne_t][Xd")) >= 0
+					or die "no FIPS_text_endX";
+
+    $ctx->Update(substr($blob,$beg,$end-$beg));
+
+    $sect = $exe->Lookup("fipsro")	or die "no fipsro section";
+
+    seek(FD,$sect->{sh_offset},0)	or die "$!";
+    read(FD,$blob,$sect->{sh_vsize})	or die "$!";
+
+    ($beg = index($blob,"SPIFdor__atarats",40)) >= 0
+					or die "no FIPS_rodata_start";
+    ($end = rindex($blob,"SPIFdor__ata[dne")) >= 0
+					or die "no FIPS_rodata_end";
+
+    $ctx->Update(substr($blob,$beg,$end-$beg));
+
+    return $ctx->Final();
+}
+
+$fingerprint = FIPS_incore_fingerprint();
+
+if ($legacy_mode) {
+    print unpack("H*",$fingerprint);
+} else {
+    my $sect = $exe->Lookup("fipsro");
+    seek(FD,$sect->{sh_offset},0)		or die "$!";
+    print FD unpack("H*",$fingerprint)		or die "$!";
+}
+
+close (FD);
diff --git a/util/pl/TI_CGTOOLS.pl b/util/pl/TI_CGTOOLS.pl
new file mode 100644
index 0000000000..d12d318062
--- /dev/null
+++ b/util/pl/TI_CGTOOLS.pl
@@ -0,0 +1,274 @@
+#!/usr/local/bin/perl
+#
+# TI_CGTOOLS.pl, Texas Instruments CGTOOLS under Unix or MSYS.
+#
+
+$ssl=	"ssl";
+$crypto="crypto";
+
+if ($fips && !$shlib)
+	{
+	$crypto="fips";
+	$crypto_compat = "cryptocompat.lib";
+	}
+else
+	{
+	$crypto="crypto";
+	}
+
+if ($fipscanisterbuild)
+	{
+	$fips_canister_path = "\$(LIB_D)/fipscanister.obj";
+	}
+
+$o='/';
+$cp='cp';
+$cp2='$(PERL) util/copy.pl -stripcr';
+$mkdir='$(PERL) util/mkdir-p.pl';
+$rm='rm -f';
+
+$zlib_lib="zlib1.lib";
+
+# Santize -L options for ms link
+$l_flags =~ s/-L("\[^"]+")/\/libpath:$1/g;
+$l_flags =~ s/-L(\S+)/\/libpath:$1/g;
+
+# C compiler stuff
+$cc='cl6x';
+$base_cflags= " $mf_cflag";
+my $f;
+$opt_cflags='';
+$dbg_cflags=$f.' -g -DDEBUG -D_DEBUG';
+$lflags='';
+
+*::cc_compile_target = sub {
+	my ($target,$source,$ex_flags)=@_;
+	my $ret;
+
+	$ex_flags.=" -DMK1MF_BUILD" if ($source =~/cversion/);
+	$ret ="$target: \$(SRC_D)$o$source\n\t";
+	if ($fipscanisterbuild && $source=~/\.asm$/) {
+		$ret.="\$(PERL) util${o}fipsas.pl . \$< norunasm \$(CFLAG)\n\t";
+	}
+	$ret.="\$(CC) --obj_directory=\$(OBJ_D) $ex_flags -c \$(SRC_D)$o$source\n";
+	$target =~ s/.*${o}([^${o}]+)/$1/;
+	$source =~ s/.*${o}([^${o}\.]+)\..*/$1${obj}/;
+	$ret.="\tmv \$(OBJ_D)${o}$source \$(OBJ_D)${o}$target\n" if ($target ne $source);
+	$ret.="\n";
+	return($ret);
+};
+*::perlasm_compile_target = sub {
+	my ($target,$source,$bname)=@_;
+	my $ret;
+
+	$bname =~ s/(.*)\.[^\.]$/$1/;
+	$ret=<<___;
+\$(TMP_D)$o$bname.asm: $source
+	\$(PERL) $source \$\@
+___
+	$ret .= "\t\$(PERL) util${o}fipsas.pl . \$@ norunasm \$(CFLAG)\n" if $fipscanisterbuild;
+
+	$ret.=<<___;
+
+$target: \$(TMP_D)$o$bname.asm
+	\$(ASM) --obj_directory=\$(OBJ_D) \$(TMP_D)$o$bname.asm
+
+___
+};
+
+$mlflags='';
+
+$out_def ="c6x";
+$tmp_def ="$out_def/tmp";
+$inc_def="$out_def/inc";
+
+if ($debug)
+	{
+	$cflags=$dbg_cflags.$base_cflags;
+	}
+else
+	{
+	$cflags=$opt_cflags.$base_cflags;
+	}
+
+$obj='.obj';
+$asm_suffix='.asm';
+$ofile="";
+
+# EXE linking stuff
+$link='$(CC) -z';
+$efile="-o ";
+$exep='.out';
+$ex_libs='';
+
+# static library stuff
+$mklib='ar6x';
+$ranlib='';
+$plib="";
+$libp=".lib";
+$shlibp=($shlib)?".dll":".lib";
+$lfile='-o ';
+
+$shlib_ex_obj="";
+$asm='$(CC) $(CFLAG) -c';
+
+$bn_asm_obj='';
+$bn_asm_src='';
+$des_enc_obj='';
+$des_enc_src='';
+$bf_enc_obj='';
+$bf_enc_src='';
+
+if (!$no_asm)
+	{
+	import_asm($mf_bn_asm, "bn", \$bn_asm_obj, \$bn_asm_src);
+	import_asm($mf_aes_asm, "aes", \$aes_asm_obj, \$aes_asm_src);
+	import_asm($mf_des_asm, "des", \$des_enc_obj, \$des_enc_src);
+	import_asm($mf_bf_asm, "bf", \$bf_enc_obj, \$bf_enc_src);
+	import_asm($mf_cast_asm, "cast", \$cast_enc_obj, \$cast_enc_src);
+	import_asm($mf_rc4_asm, "rc4", \$rc4_enc_obj, \$rc4_enc_src);
+	import_asm($mf_rc5_asm, "rc5", \$rc5_enc_obj, \$rc5_enc_src);
+	import_asm($mf_md5_asm, "md5", \$md5_asm_obj, \$md5_asm_src);
+	import_asm($mf_sha_asm, "sha", \$sha1_asm_obj, \$sha1_asm_src);
+	import_asm($mf_rmd_asm, "ripemd", \$rmd160_asm_obj, \$rmd160_asm_src);
+	import_asm($mf_wp_asm, "whrlpool", \$whirlpool_asm_obj, \$whirlpool_asm_src);
+	import_asm($mf_modes_asm, "modes", \$modes_asm_obj, \$modes_asm_src);
+	import_asm($mf_cpuid_asm, "", \$cpuid_asm_obj, \$cpuid_asm_src);
+	$perl_asm = 1;
+	}
+
+sub do_lib_rule
+	{
+	my($objs,$target,$name,$shlib,$ign,$base_addr) = @_;
+	local($ret);
+
+	$taget =~ s/\//$o/g if $o ne '/';
+	my $base_arg;
+	if ($base_addr ne "")
+		{
+		$base_arg= " /base:$base_addr";
+		}
+	else
+		{
+		$base_arg = "";
+		}
+	if ($name ne "")
+		{
+		$name =~ tr/a-z/A-Z/;
+		$name = "/def:ms/${name}.def";
+		}
+
+#	$target="\$(LIB_D)$o$target";
+#	$ret.="$target: $objs\n";
+	if (!$shlib)
+		{
+#		$ret.="\t\$(RM) \$(O_$Name)\n";
+		$ret.="$target: $objs\n";
+		$ret.="\t\$(MKLIB) $lfile$target $objs\n";
+		}
+	else
+		{
+		local($ex)=($target =~ /O_CRYPTO/)?'':' $(L_CRYPTO)';
+		$ex.=" $zlib_lib" if $zlib_opt == 1 && $target =~ /O_CRYPTO/;
+
+ 		if ($fips && $target =~ /O_CRYPTO/)
+			{
+			$ret.="$target: $objs \$(PREMAIN_DSO_EXE)";
+			$ret.="\n\tFIPS_LINK=\"\$(LINK)\" \\\n";
+			$ret.="\tFIPS_CC=\$(CC)\\\n";
+			$ret.="\tFIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\\\n";
+			$ret.="\tPREMAIN_DSO_EXE=\$(PREMAIN_DSO_EXE)\\\n";
+			$ret.="\tFIPS_SHA1_EXE=\$(FIPS_SHA1_EXE)\\\n";
+			$ret.="\tFIPS_TARGET=$target\\\n";
+			$ret.="\tFIPSLIB_D=\$(FIPSLIB_D)\\\n";
+			$ret.="\t\$(FIPSLINK) \$(MLFLAGS) /map $base_arg $efile$target ";
+			$ret.="$name \$(SHLIB_EX_OBJ) $objs \$(EX_LIBS) ";
+			$ret.="\$(OBJ_D)${o}fips_premain.obj $ex\n";
+			}
+		else
+			{
+			$ret.="$target: $objs";
+			$ret.="\n\t\$(LINK) \$(MLFLAGS) $efile$target $name \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n";
+			}
+
+		$ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;2\n\n";
+		}
+	$ret.="\n";
+	return($ret);
+	}
+
+sub do_link_rule
+	{
+	my($target,$files,$dep_libs,$libs,$standalone)=@_;
+	local($ret,$_);
+	$file =~ s/\//$o/g if $o ne '/';
+	$n=&bname($targer);
+	$ret.="$target: $files $dep_libs\n";
+	if ($standalone == 1)
+		{
+		$ret.="	\$(LINK) \$(LFLAGS) $efile$target ";
+		$ret.= "\$(EX_LIBS) " if ($files =~ /O_FIPSCANISTER/ && !$fipscanisterbuild);
+		$ret.="$files $libs\n";
+		}
+	elsif ($standalone == 2)
+		{
+		$ret.="\t\$(LINK) \$(LFLAGS) $efile$target $files \$(O_FIPSCANISTER) $out_def/application.cmd\n";
+		$ret.="\t$out_def/incore6x $target\n\n";
+		}
+	else
+		{
+		$ret.="\t\$(LINK) \$(LFLAGS) $efile$target ";
+		$ret.="\t\$(APP_EX_OBJ) $files $libs\n";
+		}
+	return($ret);
+	}
+
+sub do_rlink_rule
+	{
+	local($target,$rl_start, $rl_mid, $rl_end,$dep_libs,$libs)=@_;
+	local($ret,$_);
+	my $files = "$rl_start $rl_mid $rl_end";
+
+	$file =~ s/\//$o/g if $o ne '/';
+	$n=&bname($target);
+	$ret.="$target: $files $dep_libs\n";
+	$ret.="\t\$(LINK) -r $lfile$target $files $out_def/fipscanister.cmd\n";
+	$ret.="\t\$(PERL) $out_def${o}fips_standalone_sha1 $target > ${target}.sha1\n";
+	$ret.="\t\$(PERL) util${o}copy.pl -stripcr fips${o}fips_premain.c \$(LIB_D)${o}fips_premain.c\n";
+	$ret.="\t\$(CP) fips${o}fips_premain.c.sha1 \$(LIB_D)${o}fips_premain.c.sha1\n";
+	$ret.="\n";
+	return($ret);
+	}
+
+sub import_asm
+	{
+	my ($mf_var, $asm_name, $oref, $sref) = @_;
+	my $asm_dir;
+	if ($asm_name eq "")
+		{
+		$asm_dir = "crypto$o";
+		}
+	else
+		{
+		$asm_dir = "crypto$o$asm_name$oasm$o";
+		}
+
+	$$oref = "";
+	$$sref = "";
+	$mf_var =~ s/\.o//g;
+
+	foreach (split(/ /, $mf_var))
+		{
+		$$sref .= $asm_dir . $_ . ".asm ";
+		}
+	foreach (split(/ /, $mf_var))
+		{
+		$$oref .= "\$(TMP_D)\\" . $_ . ".obj ";
+		}
+	$$oref =~ s/ $//;
+	$$sref =~ s/ $//;
+
+	}
+
+
+1;
diff --git a/util/pl/VC-32.pl b/util/pl/VC-32.pl
index e98eb1e1b9..db9113fcbb 100644
--- a/util/pl/VC-32.pl
+++ b/util/pl/VC-32.pl
@@ -49,8 +49,7 @@ if ($FLAVOR =~ /WIN64/)
     # considered safe to ignore.
     # 
     $base_cflags= " $mf_cflag";
-    my $f = $shlib?' /MD':' /MT';
-    $lib_cflag='/Zl' if (!$shlib);	# remove /DEFAULTLIBs from static lib
+    my $f = ($shlib and !$fipscanisterbuild)?' /MD':' /MT';
     $opt_cflags=$f.' /Ox';
     $dbg_cflags=$f.'d /Od -DDEBUG -D_DEBUG';
     $lflags="/nologo /subsystem:console /opt:ref";
@@ -123,23 +122,28 @@ elsif ($FLAVOR =~ /CE/)
     }
 
     $cc='$(CC)';
-    $base_cflags=' /W3 /WX /GF /Gy /nologo -DUNICODE -D_UNICODE -DOPENSSL_SYSNAME_WINCE -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DDSO_WIN32 -DNO_CHMOD -DOPENSSL_SMALL_FOOTPRINT';
+    $base_cflags=' /W3 /GF /Gy /nologo -DUNICODE -D_UNICODE -DOPENSSL_SYSNAME_WINCE -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DDSO_WIN32 -DNO_CHMOD -DOPENSSL_SMALL_FOOTPRINT';
     $base_cflags.=" $wcecdefs";
     $base_cflags.=' -I$(WCECOMPAT)/include'		if (defined($ENV{'WCECOMPAT'}));
     $base_cflags.=' -I$(PORTSDK_LIBPATH)/../../include'	if (defined($ENV{'PORTSDK_LIBPATH'}));
-    $opt_cflags=' /MC /O1i';	# optimize for space, but with intrinsics...
-    $dbg_clfags=' /MC /Od -DDEBUG -D_DEBUG';
+    if (`cl 2>&1` =~ /Version 1[4-9]\./) {
+	$base_cflags.=($shlib and !$fipscanisterbuild)?' /MD':' /MT';
+    } else {
+	$base_cflags.=' /MC';
+    }
+    $opt_cflags=' /O1i';	# optimize for space, but with intrinsics...
+    $dbg_cflags=' /Od -DDEBUG -D_DEBUG';
     $lflags="/nologo /opt:ref $wcelflag";
     }
 else	# Win32
     {
     $base_cflags= " $mf_cflag";
-    my $f = $shlib?' /MD':' /MT';
-    $lib_cflag='/Zl' if (!$shlib);	# remove /DEFAULTLIBs from static lib
+    my $f = ($shlib and !$fipscanisterbuild)?' /MD':' /MT';
     $opt_cflags=$f.' /Ox /O2 /Ob2';
     $dbg_cflags=$f.'d /Od -DDEBUG -D_DEBUG';
     $lflags="/nologo /subsystem:console /opt:ref";
     }
+$lib_cflag='/Zl' if (!$shlib or $fipscanisterbuild);	# remove /DEFAULTLIBs
 $mlflags='';
 
 $out_def ="out32";	$out_def.="dll"			if ($shlib);
@@ -174,12 +178,12 @@ $rsc="rc";
 $efile="/out:";
 $exep='.exe';
 if ($no_sock)		{ $ex_libs=''; }
-elsif ($FLAVOR =~ /CE/)	{ $ex_libs='winsock.lib'; }
+elsif ($FLAVOR =~ /CE/)	{ $ex_libs='ws2.lib'; }
 else			{ $ex_libs='ws2_32.lib'; }
 
 if ($FLAVOR =~ /CE/)
 	{
-	$ex_libs.=' $(WCECOMPAT)/lib/wcecompatex.lib'	if (defined($ENV{'WCECOMPAT'}));
+	$ex_libs.=' $(WCECOMPAT)/lib/wcecompatex.lib crypt32.lib coredll.lib corelibc.lib'	if (defined($ENV{'WCECOMPAT'}));
 	$ex_libs.=' $(PORTSDK_LIBPATH)/portlib.lib'	if (defined($ENV{'PORTSDK_LIBPATH'}));
 	$ex_libs.=' /nodefaultlib:oldnames.lib coredll.lib corelibc.lib' if ($ENV{'TARGETCPU'} eq "X86");
 	}
@@ -284,7 +288,8 @@ elsif ($shlib && $FLAVOR =~ /CE/)
 	{
 	$mlflags.=" $lflags /dll";
 	$lflags.=' /entry:mainCRTstartup' if(defined($ENV{'PORTSDK_LIBPATH'}));
-	$lib_cflag.=" -D_WINDLL -D_DLL";
+	$lib_cflag.=" -D_WINDLL";
+	$lib_cflag.=" -D_DLL" if (!$fipscanisterbuild);
 	}
 
 sub do_lib_rule
@@ -389,8 +394,9 @@ sub do_rlink_rule
 
 	$file =~ s/\//$o/g if $o ne '/';
 	$n=&bname($targer);
-	$ret.="$target: $files $dep_libs \$(FIPS_SHA1_EXE)\n";
-	$ret.="\t\$(PERL) ms\\segrenam.pl \$\$a $rl_start\n";
+	$ret.="$target: $files $dep_libs";
+	$ret.=" \$(FIPS_SHA1_EXE)" unless defined $ENV{"FIPS_SHA1_PATH"};
+  	$ret.="\n\t\$(PERL) ms\\segrenam.pl \$\$a $rl_start\n";
 	$ret.="\t\$(PERL) ms\\segrenam.pl \$\$b $rl_mid\n";
 	$ret.="\t\$(PERL) ms\\segrenam.pl \$\$c $rl_end\n";
 	$ret.="\t\$(MKLIB) $lfile$target @<<\n\t$files\n<<\n";
diff --git a/util/point.sh b/util/point.sh
index da39899cb1..22daf0e8c5 100755
--- a/util/point.sh
+++ b/util/point.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 rm -f "$2"
-if test "$OSTYPE" = msdosdjgpp || test "x$PLATFORM" = xmingw ; then
+if test "$OSTYPE" = msdosdjgpp || test "x$PLATFORM" = xmingw || test "x$OS" = xWindows_NT ; then
     cp "$1" "$2"
 else
     ln -s "$1" "$2"