openssl/crypto/c64xcpuid.pl
Andy Polyakov 7d91d9ea6b Add some C64x assembly modules [by minor adjustments of C64x+ modules].
AES, SHA256 and SHA512 modules can actually replace corresponding
C64x+ modules. This is because C64x+ instructions don't actually
provide "killer-argument" advantage in these modules. As for SHA1,
even though its performance is exactly the same, the C64x+ module is
more responsive to interrupts, i.e. it doesn't inhibit them for as
long periods as the C64x module does.

Reviewed-by: Rich Salz <rsalz@openssl.org>
Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Stephen Henson <steve@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4265)

(cherry picked from commit 5526e5791f)
2017-08-30 21:26:43 +01:00

326 lines
6.5 KiB
Perl

#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# Consume command-line arguments until one looks like a file name with an
# extension (word characters, a dot, word characters at the end); that
# argument names the output file. $output is left false if none is found.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually found, and stop with
# a diagnostic if the file cannot be created rather than continuing with
# STDOUT pointing somewhere unintended. The three-argument form keeps
# characters in the name from being interpreted as part of the open mode.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Everything up to the closing ___ line is the assembly text to be emitted.
$code.=<<___;
; All routines below are placed in the code section.
.text
; For assembler versions below 7000000 the symbol __TI_EABI__ is defined
; here as 0 (presumably because those versions do not predefine it), so
; that the .if tests on it below and at the end of the file are valid.
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
; Under EABI every underscore-prefixed name used in this file becomes a
; substitution symbol for the same name without the underscore, so the
; labels and .global directives below resolve to the plain names.
.if __TI_EABI__
.asg OPENSSL_rdtsc,_OPENSSL_rdtsc
.asg OPENSSL_cleanse,_OPENSSL_cleanse
.asg CRYPTO_memcmp,_CRYPTO_memcmp
.asg OPENSSL_atomic_add,_OPENSSL_atomic_add
.asg OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
.asg OPENSSL_instrument_bus,_OPENSSL_instrument_bus
.asg OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
.endif
; RA names B3, the register every routine in this file branches to in
; order to return.
.asg B3,RA
; Base address of the timer block read by _OPENSSL_rdtsc and by the
; bus instrumentation routines.
.asg 0x01AC0000,TIMER_BASE ; Timer 2
; _OPENSSL_rdtsc: return the current count of the timer at TIMER_BASE.
;   Out:     A4 = word read from TIMER_BASE word index 2 (CTN)
;   Scratch: A2, A5, A6, A7
; If the control word (CTL, word index 0) reads back as zero, the timer is
; additionally started with control value 0x2c0 and an all-ones period.
.global _OPENSSL_rdtsc
_OPENSSL_rdtsc:
.asmfunc
; A5 = TIMER_BASE, loaded as two 16-bit halves
MVKL TIMER_BASE,A5
MVKH TIMER_BASE,A5
LDW *A5[0],A2 ; load CTL
LDW *A5[2],A4 ; load CTN
NOP 2
; Prepare the register pair for the double-word store below. The two halves
; are swapped between endiannesses, presumably so that 0x2c0 always lands
; in the first word at TIMER_BASE and -1 in the word after it.
.if .BIG_ENDIAN
MVK 0x2c0,A7 ; internal clock source, don't hold, go
|| MVK -1,A6 ; maximum period
.else
MVK 0x2c0,A6 ; internal clock source, don't hold, go
|| MVK -1,A7 ; maximum period
.endif
; The store is predicated on CTL having read as zero, so a timer that is
; already configured is left untouched. The return branch issues in the
; same packet and fills its own delay slots with NOPs, which also gives
; the CTN load time to complete before the caller sees A4.
[!A2] STDW A7:A6,*A5[0] ; fire it up
|| BNOP RA,5
.endasmfunc
; _OPENSSL_cleanse: overwrite a buffer with zero bytes.
;   In:      A4 = buffer address, B4 = length in bytes (as used below;
;            presumably the two arguments of the C prototype - confirm)
;   Scratch: A1-A3, B0-B2, B5, B6; A4 and B4 are modified
; Buffers with fewer than 8 bytes are cleared by predicated byte stores
; placed in the delay slots of the return branch. Longer buffers are
; cleared 8 bytes per pass by a non-aligned double-word store, followed by
; the same kind of byte-store tail for the remaining 0-7 bytes.
.global _OPENSSL_cleanse
_OPENSSL_cleanse:
.asmfunc
; A3:A2 = 0 is the double-word store data, B0 = number of whole 8-byte
; chunks, B6 = address + 1 so that A4 and B6 can clear even and odd bytes
; side by side.
ZERO A3:A2
|| ZERO B2
|| SHRU B4,3,B0 ; is length >= 8
|| ADD 1,A4,B6
; No whole chunk: start returning at once; the five packets that follow
; are the delay slots and perform the byte stores. Otherwise B2 becomes
; chunks - 1, the number of loop branches still to be issued. A1 and B1
; (the byte-store predicates) start out cleared.
[!B0] BNOP RA
|| [B0] SUB B0,1,B2
|| ZERO A1
|| ZERO B1
; Each of the next five packets does two independent things:
;  - long path: while B2 is non-zero, BDEC issues one more branch to
;    cleanse_loop? and decrements B2; every branch issued accounts for one
;    additional pass through the loop packet;
;  - short path (B0 is zero): A1/B1 are set when byte index k is below the
;    length, and gate the byte stores in the following packet.
; B5 = 0 is the store data for the B-side byte stores.
[B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 0,B4,A1
||[!B0] CMPLT 1,B4,B1
|| ZERO B5
; bytes 0 and 1 (if present); both pointers advance by 2
[A1] STB A2,*A4++[2]
|| [B1] STB B5,*B6++[2]
|| [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 2,B4,A1
||[!B0] CMPLT 3,B4,B1
; bytes 2 and 3
[A1] STB A2,*A4++[2]
|| [B1] STB B5,*B6++[2]
|| [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 4,B4,A1
||[!B0] CMPLT 5,B4,B1
; bytes 4 and 5
[A1] STB A2,*A4++[2]
|| [B1] STB B5,*B6++[2]
|| [B2] BDEC cleanse_loop?,B2
||[!B0] CMPLT 6,B4,A1
; byte 6; on the short path the return branch takes effect after this packet
[A1] STB A2,*A4++[2]
|| [B2] BDEC cleanse_loop?,B2
; Single-packet loop body: store 8 zero bytes at A4 (no alignment required)
; and advance A4 past them, take 8 off the remaining length, and keep
; issuing branches until B2 has run down to zero.
cleanse_loop?:
STNDW A3:A2,*A4++
|| SUB B4,8,B4
|| [B2] BDEC cleanse_loop?,B2
; Tail: B0 = bytes left after the last whole chunk, B6 = A4 + 1 again.
; The return branch is issued here; the five packets below are its delay
; slots.
MV B4,B0 ; remaining bytes
|| ADD 1,A4,B6
|| BNOP RA
; With nothing left (B0 is zero) the compares are skipped, A1/B1 keep the
; zero they were given at entry and no byte store below executes.
[B0] CMPLT 0,B0,A1
|| [B0] CMPLT 1,B0,B1
; remaining bytes 0 and 1
[A1] STB A2,*A4++[2]
|| [B1] STB B5,*B6++[2]
|| [B0] CMPLT 2,B0,A1
|| [B0] CMPLT 3,B0,B1
; remaining bytes 2 and 3
[A1] STB A2,*A4++[2]
|| [B1] STB B5,*B6++[2]
|| [B0] CMPLT 4,B0,A1
|| [B0] CMPLT 5,B0,B1
; remaining bytes 4 and 5
[A1] STB A2,*A4++[2]
|| [B1] STB B5,*B6++[2]
|| [B0] CMPLT 6,B0,A1
; remaining byte 6
[A1] STB A2,*A4++[2]
.endasmfunc
; _CRYPTO_memcmp: byte-wise comparison of two buffers without a
; data-dependent early exit.
;   In:      A4 and B4 = the two buffer addresses, A6 = length in bytes
;            (as used below)
;   Out:     A4 = 0 if no byte pair differed (or the length is zero),
;            1 otherwise
;   Scratch: A0, A1, A5, B0, B5
; NOTE: the enclosing .if 0 keeps this routine out of the assembled output.
.if 0
.global _CRYPTO_memcmp
_CRYPTO_memcmp:
.asmfunc
; B0 = length, used both as predicate and as the BDEC loop counter
MV A6,B0
; Zero length: return 0 immediately. Otherwise clear A1:A0; A0 accumulates
; the OR of all byte differences.
[!B0] BNOP RA
||[!B0] ZERO A4
|| [B0] ZERO A1:A0
; Prologue of the pipelined loop: one byte is loaded from each buffer and
; one loop branch is issued per packet, before the first loaded pair can
; be consumed.
[B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
[B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
[B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
[B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
[B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
; first difference: A1 = byte from one buffer XOR byte from the other
XOR A5,B5,A1
|| [B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
; Steady state: fold the previous difference into A0 while computing the
; next one and issuing further loads and branches.
memcmp_loop?:
OR A1,A0,A0
|| XOR A5,B5,A1
|| [B0] LDBU *A4++,A5
|| [B0] LDBU *B4++,B5
|| [B0] BDEC memcmp_loop?,B0
; Return; the result is formed in the last two delay slots of the branch:
; A4 = 0, then A4 = 1 if any accumulated difference bit is set.
BNOP RA,3
ZERO A4
[A0] MVK 1,A4
.endasmfunc
.endif
; _OPENSSL_atomic_add: add a value to a word in memory and return the sum.
;   In:      A4 = address of the word, B4 = value to add (as used below)
;   Out:     A4 = updated value, which is also stored back to memory
;   Scratch: B5
;
; Atomicity relies on interrupts being held off while a branch is in
; flight, so two overlapping branches cover the whole load-update-store
; sequence. Cycle by cycle (a branch has five delay slots, a load four):
;   1    first branch issued together with the load
;   2    return branch issued
;   3-5  NOP cycles supplied by BNOP RA,3
;   6    ADD: loaded value is available; last delay slot of first branch
;   7    atomic_store?: target of the first branch, reached at the same
;        moment by straight-line execution; last delay slot of the return
;   8    back in the caller
; Nothing may be inserted between the two branches: one extra cycle there
; moves the ADD out of the first branch's delay slots, so the branch would
; land on the store without the addition ever having been executed.
        .global _OPENSSL_atomic_add
_OPENSSL_atomic_add:
        .asmfunc
        BNOP    atomic_store?   ; pre-C64x+ systems are uni-processor, it's
||      LDW     *A4,B5          ; enough to hold interrupts off through
                                ; the load-update-store cycle to achieve
                                ; atomicity
        BNOP    RA,3            ; and this branch stretches even over store
        ADD     B4,B5,B5        ; new value = loaded value + addend
atomic_store?:
        STW     B5,*A4          ; write the new value back ...
||      MV      B5,A4           ; ... and return it
        .endasmfunc
; _OPENSSL_wipe_cpu: clear general-purpose registers.
; Zeroed: A0-A9, A16-A31, B0-B2, B4-B9, B16-B31.
; Not touched: B3 (RA, needed to return), A10-A15 and B10-B15.
;   Out: A4 = 0, as a consequence of the wipe
.global _OPENSSL_wipe_cpu
_OPENSSL_wipe_cpu:
.asmfunc
ZERO A0
|| ZERO B0
|| ZERO A1
|| ZERO B1
; B2 is cleared by copying the B0 that was zeroed in the previous packet.
; The return branch is issued here; the five packets that follow are its
; delay slots.
ZERO A3:A2
|| MVD B0,B2
|| ZERO A4
|| ZERO B4
|| ZERO A5
|| ZERO B5
|| BNOP RA
; A6-A9 and B6-B9
ZERO A7:A6
|| ZERO B7:B6
|| ZERO A8
|| ZERO B8
|| ZERO A9
|| ZERO B9
; A16-A19 and B16-B19
ZERO A17:A16
|| ZERO B17:B16
|| ZERO A18
|| ZERO B18
|| ZERO A19
|| ZERO B19
; A20-A23 and B20-B23
ZERO A21:A20
|| ZERO B21:B20
|| ZERO A22
|| ZERO B22
|| ZERO A23
|| ZERO B23
; A24-A27 and B24-B27
ZERO A25:A24
|| ZERO B25:B24
|| ZERO A26
|| ZERO B26
|| ZERO A27
|| ZERO B27
; A28-A31 and B28-B31
ZERO A29:A28
|| ZERO B29:B28
|| ZERO A30
|| ZERO B30
|| ZERO A31
|| ZERO B31
.endasmfunc
; CLFLUSH CONTROL,ADDR,LEN: start a cache operation and wait for it.
;   CONTROL = register holding the address of a two-word register pair
;   ADDR    = register whose value is written to word 0 of that pair
;   LEN     = register whose value is written to word 1 of that pair
; After both writes, word 1 is read back repeatedly until it reads as
; zero. Clobbers A0. The callers in this file pass the L1DWIBAR or L2WIBAR
; address as CONTROL and describe the effect as write-back and invalidate
; of a cache line.
; The labels end in a question mark so that each expansion of the macro
; gets its own copies.
CLFLUSH .macro CONTROL,ADDR,LEN
; The branch lands on passthrough? in exactly the cycle straight-line
; execution reaches it, so it does not alter the flow; presumably it is
; there to keep interrupts off between the two register writes and the
; first poll, as the atomic add routine in this file does - confirm.
B passthrough?
|| STW ADDR,*CONTROL[0]
STW LEN,*CONTROL[1]
; poll word 1; the NOPs below cover the load latency before A0 is tested
spinlock?:
LDW *CONTROL[1],A0
NOP 3
passthrough?:
NOP
; non-zero means the operation is still in progress: poll again
[A0] BNOP spinlock?,5
.endm
; _OPENSSL_instrument_bus: add timer-tick differences to successive words
; of an array, flushing each word from the caches before touching it.
;   In:  A4 = output array, B4 = number of words (called sizeof(output)
;        in the comments below)
;   Out: A4 = the number of words that was passed in
; Register roles after the setup packets:
;   B4 = address of current output word   B0 = words still to process
;   B16 = TIMER_BASE   B8 = tick   B9 = lasttick   B7 = lastdiff
;   A3 = L1DWIBAR address (0x01844030)   A5 = L2WIBAR address (0x01844010)
;   A1 = 1, the length passed to CLFLUSH   A0 = scratch inside CLFLUSH
;   B5 = value of the output word being updated
.global _OPENSSL_instrument_bus
_OPENSSL_instrument_bus:
.asmfunc
MV B4,B0 ; reassign sizeof(output)
|| MV A4,B4 ; reassign output
|| MVK 0x00004030,A3
|| MVKL TIMER_BASE,B16
MV B0,A4 ; return value
|| MVK 1,A1
|| MVKH 0x01840000,A3 ; L1DWIBAR
|| MVKH TIMER_BASE,B16
LDW *B16[2],B8 ; collect 1st tick
|| MVK 0x00004010,A5
NOP 4
MV B8,B9 ; lasttick = tick
|| MVK 0,B7 ; lastdiff = 0
|| MVKH 0x01840000,A5 ; L2WIBAR
; Warm-up pass on the first word: flush it, then read-modify-write it with
; lastdiff, which is still zero, so the stored value is unchanged.
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
LDW *B4,B5
NOP 4
ADD B7,B5,B5
STW B5,*B4
; One pass per output word: read the timer, derive the difference from
; the previous reading, flush the word, add the difference to it.
; B0 is decremented at the top unless it is already zero.
bus_loop1?:
LDW *B16[2],B8
|| [B0] SUB B0,1,B0
NOP 4
SUB B8,B9,B7 ; lastdiff = tick - lasttick
|| MV B8,B9 ; lasttick = tick
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
LDW *B4,B5
NOP 4
ADD B7,B5,B5
; The store and the pointer increment share one packet, so the store still
; uses the address of the current word. Loop again while words remain.
STW B5,*B4 ; [!B1] is removed to flatten samples
|| ADDK 4,B4
|| [B0] BNOP bus_loop1?,5
BNOP RA,5
.endasmfunc
; _OPENSSL_instrument_bus2: like _OPENSSL_instrument_bus, but the output
; pointer only advances when the measured tick difference changes, and the
; number of probes is bounded.
;   In:  A4 = output array, B4 = number of words (called sizeof(output)
;        in the comments below), A6 = max, the probe budget
;   Out: A4 = number of times the output pointer was advanced
; Register roles after the setup packets:
;   B4 = address of current output word   A6 = number of words
;   B0 = remaining probe budget           A4 = words completed so far
;   B16 = TIMER_BASE   B8 = tick, then new diff   B9 = lasttick
;   B7 = lastdiff      B2 = 1 when the new diff equals lastdiff
;   A3 = L1DWIBAR address (0x01844030)   A5 = L2WIBAR address (0x01844010)
;   A1 = 1, the length passed to CLFLUSH   A0 = scratch inside CLFLUSH
.global _OPENSSL_instrument_bus2
_OPENSSL_instrument_bus2:
.asmfunc
MV A6,B0 ; reassign max
|| MV B4,A6 ; reassign sizeof(output)
|| MVK 0x00004030,A3
|| MVKL TIMER_BASE,B16
; A4 is read (copied to B4) and cleared in the same packet; the copy sees
; the incoming value.
MV A4,B4 ; reassign output
|| MVK 0,A4 ; return value
|| MVK 1,A1
|| MVKH 0x01840000,A3 ; L1DWIBAR
|| MVKH TIMER_BASE,B16
LDW *B16[2],B8 ; collect 1st tick
|| MVK 0x00004010,A5
NOP 4
MV B8,B9 ; lasttick = tick
|| MVK 0,B7 ; lastdiff = 0
|| MVKH 0x01840000,A5 ; L2WIBAR
; Warm-up pass on the first word: flush it, then read-modify-write it with
; lastdiff, which is still zero, so the stored value is unchanged.
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
LDW *B4,B5
NOP 4
ADD B7,B5,B5
STW B5,*B4
LDW *B16[2],B8 ; collect 1st diff
NOP 4
SUB B8,B9,B7 ; lastdiff = tick - lasttick
|| MV B8,B9 ; lasttick = tick
|| SUB B0,1,B0
; Each pass: flush the current word and add lastdiff to it, then take a new
; timer reading and decide whether to move on to the next word.
bus_loop2?:
CLFLUSH A3,B4,A1 ; write-back and invalidate L1D line
CLFLUSH A5,B4,A1 ; write-back and invalidate L2 line
LDW *B4,B5
NOP 4
ADD B7,B5,B5
; Leave the loop once the probe budget is used up. The test and the
; decrement of B0 share one packet, so the test sees the value from before
; this decrement.
STW B5,*B4 ; [!B1] is removed to flatten samples
||[!B0] BNOP bus_loop2_done?,2
|| SUB B0,1,B0
LDW *B16[2],B8
NOP 4
; B8 = new diff = tick - lasttick; lasttick = tick
SUB B8,B9,B8
|| MV B8,B9
; B2 = 1 if the new diff equals lastdiff; lastdiff = new diff
CMPEQ B8,B7,B2
|| MV B8,B7
; a changed diff moves on to the next output word and counts it
[!B2] ADDAW B4,1,B4
||[!B2] ADDK 1,A4
; keep going until as many words were completed as the array holds
CMPEQ A4,A6,A2
[!A2] BNOP bus_loop2?,5
bus_loop2_done?:
BNOP RA,5
.endasmfunc
; Place the address of _OPENSSL_rdtsc in the section named .init_array
; (EABI) or .pinit (otherwise), presumably so that the run-time start-up
; code calls it once before main and the timer is running by the time it
; is first read - confirm against the tool chain's start-up conventions.
.if __TI_EABI__
.sect ".init_array"
.else
.sect ".pinit"
.endif
.align 4
.long _OPENSSL_rdtsc ; auto-start timer
___

# Write out the accumulated assembly text. The close is checked because
# buffered output is only flushed at that point: a failure (for example a
# full disk) would otherwise leave a truncated file behind while the script
# still exits successfully.
print $code;
close STDOUT or die "error closing STDOUT: $!";