openssl/crypto/modes/asm/ghash-armv4.pl
Andy Polyakov 874faf2ffb Adapt ARM assembly pack for iOS.
This is achieved by filtering perlasm output through arm-xlate.pl. But note
that it's done only if "flavour" argument is not 'void'. As 'void' is
default value for other ARM targets, permasm output is not actually
filtered on previously validated platforms.

Reviewed-by: Dr. Stephen Henson <steve@openssl.org>
2015-05-13 16:46:58 +02:00

446 lines
11 KiB
Prolog
Raw Blame History

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "538B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$Zll="r4"; # variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter
$rem_4bit=$inp; # used in gcm_gmult_4bit
$cnt=$len;
sub Zsmash() {
my $i=12;
my @args=@_;
for ($Zll,$Zlh,$Zhl,$Zhh) {
$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $_,$_
str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
str $_,[$Xi,#$i]
#else
mov $Tlh,$_,lsr#8
strb $_,[$Xi,#$i+3]
mov $Thl,$_,lsr#16
strb $Tlh,[$Xi,#$i+2]
mov $Thh,$_,lsr#24
strb $Thl,[$Xi,#$i+1]
strb $Thh,[$Xi,#$i]
#endif
___
$code.="\t".shift(@args)."\n";
$i-=4;
}
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#ifdef __APPLE__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
sub $rem_4bit,pc,#8
sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
b .Lrem_4bit_got
nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
gcm_ghash_4bit:
sub r12,pc,#8
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb $nlo,[$inp,#15]
ldrb $nhi,[$Xi,#15]
.Louter:
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
ldrb $nhi,[$Xi,#14]
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Linner:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16
mov $nhi,$Zll
___
&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
$code.=<<___;
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb $nlo,[$Xi,#15]
b rem_4bit_get
.Lrem_4bit_got:
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
ldrb $nlo,[$Xi,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
and $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my $cnt=$Htbl; # $Htbl is used once in the very beginning
my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
# in Zo. Or should I say "top bit", because GHASH is specified in
# reverse bit order? Otherwise straightforward 128-bt H by one input
# byte multiplication and modulo-reduction, times 16.
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
sub $Htbl,#16 @ point at H in GCM128_CTX
vld1.64 `&Dhi("$IN")`,[$Xi]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$IN")`,[$Xi]!
vshr.u64 $mod,#32
vldmia $Htbl,{$Hhi-$Hlo} @ load H
veor $zero,$zero
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Qpost,$Qpost
veor $R,$R
mov $cnt,#16
veor $Z,$Z
mov $len,#16
veor $Zo,$Zo
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
b .Linner_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 `&Dhi("$Z")`,[$Xi]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$Z")`,[$Xi]!
vshr.u64 $mod,#32
vldmia $Xi,{$Hhi-$Hlo} @ load H
veor $zero,$zero
nop
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
.Louter_neon:
vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
veor $Qpost,$Qpost
vld1.64 `&Dlo($IN)`,[$inp]!
veor $R,$R
mov $cnt,#16
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Zo,$Zo
veor $IN,$Z @ inp^=Xi
veor $Z,$Z
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
.Linner_neon:
subs $cnt,$cnt,#1
vmull.p8 $Qlo,$Hlo,$xi @ H.lo<EFBFBD>Xi[i]
vmull.p8 $Qhi,$Hhi,$xi @ H.hi<EFBFBD>Xi[i]
vext.8 $IN,$zero,#1 @ IN>>=8
veor $Z,$Qpost @ modulo-scheduled part
vshl.i64 `&Dlo("$R")`,#48
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
veor `&Dhi("$Z")`,`&Dlo("$R")`
vuzp.8 $Qlo,$Qhi
vsli.8 $Zo,$T,#1 @ compose the "carry" byte
vext.8 $Z,$zero,#1 @ Z>>=8
vmull.p8 $R,$Zo,$mod @ "carry"<EFBFBD>0xe1
vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
veor $Z,$Qhi
bne .Linner_neon
veor $Z,$Qpost @ modulo-scheduled artefact
vshl.i64 `&Dlo("$R")`,#48
veor `&Dhi("$Z")`,`&Dlo("$R")`
@ finalization, normalize Z:Zo
vand $Zo,$mod @ suffices to mask the bit
vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
vshl.i64 $Z,#1
subs $len,#16
vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
bne .Louter_neon
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
sub $Xi,#16
vst1.64 `&Dhi("$Z")`,[$Xi]! @ write out Xi
vst1.64 `&Dlo("$Z")`,[$Xi]
bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush