openssl/crypto/aes/asm/aest4-sparcv9.pl
Richard Levitte 1aa89a7a3a Unify all assembler file generators
They now generally conform to the following argument sequence:

    script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \
              $(PROCESSOR) <output file>

However, in the spirit of being able to use these scripts manually,
they also allow for no argument, or for only the flavour, or for only
the output file.  This is done by only using the last argument as
output file if it's a file (it has an extension), and only using the
first argument as flavour if it isn't a file (it doesn't have an
extension).

While we're at it, we make all $xlate calls the same, i.e. the $output
argument is always quoted, and we always die on error when trying to
start $xlate.

There's a perl lesson in this, regarding operator priority...

This will always succeed, even when it fails:

    open FOO, "something" || die "ERR: $!";

The reason is that '||' has higher priority than list operators (a
function is essentially a list operator and gobbles up everything
following it that isn't lower priority), and since a non-empty string
is always true, so that ends up being exactly the same as:

    open FOO, "something";

This, however, will fail if "something" can't be opened:

    open FOO, "something" or die "ERR: $!";

The reason is that 'or' has lower priority that list operators,
i.e. it's performed after the 'open' call.

Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9884)
2019-09-16 16:29:57 +02:00

928 lines
23 KiB
Raku

#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license. October 2012.
# All rights reserved.
# ====================================================================
######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
# |0 |1 |2 |3 |4
# |01|01|01|
# |23|23|23|
# |01|01|...
# |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
# 128-bit key 192- 256-
# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
# (*) numbers after slash are for
# misaligned data.
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattaintable:
#
# 128-bit key 192- 256-
# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
# (*) numbers after slash are for
# misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
$output = pop and open STDOUT,">$output";
$::evp=1; # if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.text
.globl aes_t4_encrypt
.align 32
aes_t4_encrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Lenc:
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_eround01 %f16, %f4, %f2, %f0
aes_eround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Lenc
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
aes_eround01_l %f16, %f4, %f2, %f0
aes_eround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_encrypt,#function
.size aes_t4_encrypt,.-aes_t4_encrypt
.globl aes_t4_decrypt
.align 32
aes_t4_decrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Ldec:
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_dround01 %f16, %f4, %f2, %f0
aes_dround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Ldec
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
aes_dround01_l %f16, %f4, %f2, %f0
aes_dround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_decrypt,#function
.size aes_t4_decrypt,.-aes_t4_decrypt
___
}
######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
$code.=<<___;
.globl aes_t4_set_encrypt_key
.align 32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
and $inp, 7, $tmp
alignaddr $inp, %g0, $inp
cmp $bits, 192
ldd [$inp + 0], %f0
bl,pt %icc,.L128
ldd [$inp + 8], %f2
be,pt %icc,.L192
ldd [$inp + 16], %f4
brz,pt $tmp, .L256aligned
ldd [$inp + 24], %f6
ldd [$inp + 32], %f8
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
faligndata %f6, %f8, %f6
.L256aligned:
___
for ($i=0; $i<6; $i++) {
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
aes_kexpand0 %f4, %f2, %f4
std %f6, [$out + `32*$i+24`]
aes_kexpand2 %f6, %f4, %f6
___
}
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
std %f6, [$out + `32*$i+24`]
std %f0, [$out + `32*$i+32`]
std %f2, [$out + `32*$i+40`]
mov 14, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L192:
brz,pt $tmp, .L192aligned
nop
ldd [$inp + 24], %f6
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
.L192aligned:
___
for ($i=0; $i<7; $i++) {
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
aes_kexpand2 %f4, %f2, %f4
___
}
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
std %f0, [$out + `24*$i+24`]
std %f2, [$out + `24*$i+32`]
mov 12, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L128:
brz,pt $tmp, .L128aligned
nop
ldd [$inp + 16], %f4
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
.L128aligned:
___
for ($i=0; $i<10; $i++) {
$code.=<<___;
std %f0, [$out + `16*$i+0`]
aes_kexpand1 %f0, %f2, $i, %f0
std %f2, [$out + `16*$i+8`]
aes_kexpand2 %f2, %f0, %f2
___
}
$code.=<<___;
std %f0, [$out + `16*$i+0`]
std %f2, [$out + `16*$i+8`]
mov 10, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.type aes_t4_set_encrypt_key,#function
.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
.globl aes_t4_set_decrypt_key
.align 32
aes_t4_set_decrypt_key:
mov %o7, %o5
call .Lset_encrypt_key
nop
mov %o5, %o7
sll $tmp, 4, $inp ! $tmp is number of rounds
add $tmp, 2, $tmp
add $out, $inp, $inp ! $inp=$out+16*rounds
srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
.Lkey_flip:
ldd [$out + 0], %f0
ldd [$out + 8], %f2
ldd [$out + 16], %f4
ldd [$out + 24], %f6
ldd [$inp + 0], %f8
ldd [$inp + 8], %f10
ldd [$inp - 16], %f12
ldd [$inp - 8], %f14
sub $tmp, 1, $tmp
std %f0, [$inp + 0]
std %f2, [$inp + 8]
std %f4, [$inp - 16]
std %f6, [$inp - 8]
std %f8, [$out + 0]
std %f10, [$out + 8]
std %f12, [$out + 16]
std %f14, [$out + 24]
add $out, 32, $out
brnz $tmp, .Lkey_flip
sub $inp, 32, $inp
retl
xor %o0, %o0, %o0
.type aes_t4_set_decrypt_key,#function
.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
$code.=<<___;
.align 32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f4
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01_l %f52, %f4, %f2, %f0
retl
aes_eround23_l %f54, %f4, %f2, %f2
.type _aes128_encrypt_1x,#function
.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
.align 32
_aes128_encrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f8
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01 %f48, %f4, %f6, %f10
aes_eround23 %f50, %f4, %f6, %f6
aes_eround01_l %f52, %f8, %f2, %f0
aes_eround23_l %f54, %f8, %f2, %f2
aes_eround01_l %f52, %f10, %f6, %f4
retl
aes_eround23_l %f54, %f10, %f6, %f6
.type _aes128_encrypt_2x,#function
.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_xts_implement("aes",128,"en");
&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align 32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f4
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01_l %f52, %f4, %f2, %f0
retl
aes_dround23_l %f54, %f4, %f2, %f2
.type _aes128_decrypt_1x,#function
.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
.align 32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f8
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01 %f48, %f4, %f6, %f10
aes_dround23 %f50, %f4, %f6, %f6
aes_dround01_l %f52, %f8, %f2, %f0
aes_dround23_l %f54, %f8, %f2, %f2
aes_dround01_l %f52, %f10, %f6, %f4
retl
aes_dround23_l %f54, %f10, %f6, %f6
.type _aes128_decrypt_2x,#function
.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
___
$code.=<<___;
.align 32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f4
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01_l %f60, %f4, %f2, %f0
retl
aes_eround23_l %f62, %f4, %f2, %f2
.type _aes192_encrypt_1x,#function
.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
.align 32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f8
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01 %f56, %f4, %f6, %f10
aes_eround23 %f58, %f4, %f6, %f6
aes_eround01_l %f60, %f8, %f2, %f0
aes_eround23_l %f62, %f8, %f2, %f2
aes_eround01_l %f60, %f10, %f6, %f4
retl
aes_eround23_l %f62, %f10, %f6, %f6
.type _aes192_encrypt_2x,#function
.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
.align 32
_aes256_encrypt_1x:
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f4, %f2, %f0
aes_eround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f4, %f2, %f0
aes_eround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_1x,#function
.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
.align 32
_aes256_encrypt_2x:
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f8, %f2, %f0
aes_eround23 %f22, %f8, %f2, %f2
aes_eround01 %f20, %f10, %f6, %f4
aes_eround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f8, %f2, %f0
aes_eround23_l %f22, %f8, %f2, %f2
aes_eround01_l %f20, %f10, %f6, %f4
aes_eround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_2x,#function
.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
.align 32
_aes192_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
&alg_ctr32_implement("aes",256);
&alg_xts_implement("aes",256,"en");
&alg_xts_implement("aes",256,"de");
&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align 32
_aes256_decrypt_1x:
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f4, %f2, %f0
aes_dround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f4, %f2, %f0
aes_dround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_1x,#function
.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
.align 32
_aes256_decrypt_2x:
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f8, %f2, %f0
aes_dround23 %f22, %f8, %f2, %f2
aes_dround01 %f20, %f10, %f6, %f4
aes_dround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f8, %f2, %f0
aes_dround23_l %f22, %f8, %f2, %f2
aes_dround01_l %f20, %f10, %f6, %f4
aes_dround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_2x,#function
.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {
$code.=<<___;
.global AES_encrypt
AES_encrypt=aes_t4_encrypt
.global AES_decrypt
AES_decrypt=aes_t4_decrypt
.global AES_set_encrypt_key
.align 32
AES_set_encrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_encrypt_key
nop
1: retl
nop
.type AES_set_encrypt_key,#function
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global AES_set_decrypt_key
.align 32
AES_set_decrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_decrypt_key
nop
1: retl
nop
.type AES_set_decrypt_key,#function
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
$code.=<<___;
.globl AES_cbc_encrypt
.align 32
AES_cbc_encrypt:
ld [$key + 240], %g1
nop
brz $enc, .Lcbc_decrypt
cmp %g1, 12
bl,pt %icc, aes128_t4_cbc_encrypt
nop
be,pn %icc, aes192_t4_cbc_encrypt
nop
ba aes256_t4_cbc_encrypt
nop
.Lcbc_decrypt:
bl,pt %icc, aes128_t4_cbc_decrypt
nop
be,pn %icc, aes192_t4_cbc_decrypt
nop
ba aes256_t4_cbc_decrypt
nop
.type AES_cbc_encrypt,#function
.size AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
$code.=<<___;
.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
close STDOUT;