openssl/crypto/ec/asm/ecp_nistz256-armv4.pl
Richard Levitte 1aa89a7a3a Unify all assembler file generators
They now generally conform to the following argument sequence:

    script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \
              $(PROCESSOR) <output file>

However, in the spirit of being able to use these scripts manually,
they also allow for no argument, or for only the flavour, or for only
the output file.  This is done by only using the last argument as
output file if it's a file (it has an extension), and only using the
first argument as flavour if it isn't a file (it doesn't have an
extension).

While we're at it, we make all $xlate calls the same, i.e. the $output
argument is always quoted, and we always die on error when trying to
start $xlate.

There's a perl lesson in this, regarding operator priority...

This will always succeed, even when it fails:

    open FOO, "something" || die "ERR: $!";

The reason is that '||' has higher priority than list operators (a
function is essentially a list operator and gobbles up everything
following it that isn't lower priority), and since a non-empty string
is always true, so that ends up being exactly the same as:

    open FOO, "something";

This, however, will fail if "something" can't be opened:

    open FOO, "something" or die "ERR: $!";

The reason is that 'or' has lower priority that list operators,
i.e. it's performed after the 'open' call.

Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9884)
2019-09-16 16:29:57 +02:00

1869 lines
45 KiB
Prolog
Executable file

#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
# with/without -DECP_NISTZ256_ASM
# Cortex-A8 +53-170%
# Cortex-A9 +76-205%
# Cortex-A15 +100-316%
# Snapdragon S4 +66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
} else {
$output and open STDOUT,">$output";
}
$code.=<<___;
#include "arm_arch.h"
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
use integer;
foreach(<TABLE>) {
s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;
# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index or @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
$code.=<<___;
.rodata
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,%object
.align 12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
# 1111222233334444
# 1234123412341234
for(1..37) {
@tbl = splice(@arr,0,64*16);
for($i=0;$i<64;$i++) {
undef @line;
for($j=0;$j<64;$j++) {
push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
}
$code.=".byte\t";
$code.=join(',',map { sprintf "0x%02x",$_} @line);
$code.="\n";
}
}
$code.=<<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.text
.align 5
.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial
.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long 1,0,0,0,0,0,0,0
.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 6
___
########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...
($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);
$code.=<<___;
@ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_to_mont
.type ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
adr $b_ptr,.LRR
b .Lecp_nistz256_mul_mont
.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
@ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_from_mont
.type ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
adr $b_ptr,.Lone
b .Lecp_nistz256_mul_mont
.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
@ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_mul_by_2
.type ecp_nistz256_mul_by_2,%function
.align 4
ecp_nistz256_mul_by_2:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
.type __ecp_nistz256_mul_by_2,%function
.align 4
__ecp_nistz256_mul_by_2:
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
ldr $a3,[$a_ptr,#12]
adcs $a1,$a1,$a1
ldr $a4,[$a_ptr,#16]
adcs $a2,$a2,$a2
ldr $a5,[$a_ptr,#20]
adcs $a3,$a3,$a3
ldr $a6,[$a_ptr,#24]
adcs $a4,$a4,$a4
ldr $a7,[$a_ptr,#28]
adcs $a5,$a5,$a5
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
adc $ff,$ff,#0
b .Lreduce_by_sub
.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
.globl ecp_nistz256_add
.type ecp_nistz256_add,%function
.align 4
ecp_nistz256_add:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_add,.-ecp_nistz256_add
.type __ecp_nistz256_add,%function
.align 4
__ecp_nistz256_add:
str lr,[sp,#-4]! @ push lr
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
ldr $a3,[$a_ptr,#12]
ldr $a4,[$a_ptr,#16]
ldr $t0,[$b_ptr,#0]
ldr $a5,[$a_ptr,#20]
ldr $t1,[$b_ptr,#4]
ldr $a6,[$a_ptr,#24]
ldr $t2,[$b_ptr,#8]
ldr $a7,[$a_ptr,#28]
ldr $t3,[$b_ptr,#12]
adds $a0,$a0,$t0
ldr $t0,[$b_ptr,#16]
adcs $a1,$a1,$t1
ldr $t1,[$b_ptr,#20]
adcs $a2,$a2,$t2
ldr $t2,[$b_ptr,#24]
adcs $a3,$a3,$t3
ldr $t3,[$b_ptr,#28]
adcs $a4,$a4,$t0
adcs $a5,$a5,$t1
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
adc $ff,$ff,#0
ldr lr,[sp],#4 @ pop lr
.Lreduce_by_sub:
@ if a+b >= modulus, subtract modulus.
@
@ But since comparison implies subtraction, we subtract
@ modulus and then add it back if subtraction borrowed.
subs $a0,$a0,#-1
sbcs $a1,$a1,#-1
sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
sbcs $a5,$a5,#0
sbcs $a6,$a6,#1
sbcs $a7,$a7,#-1
sbc $ff,$ff,#0
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ using value of borrow as a whole or extracting single bit.
@ Follow $ff register...
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_add,.-__ecp_nistz256_add
@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_mul_by_3
.type ecp_nistz256_mul_by_3,%function
.align 4
ecp_nistz256_mul_by_3:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
.type __ecp_nistz256_mul_by_3,%function
.align 4
__ecp_nistz256_mul_by_3:
str lr,[sp,#-4]! @ push lr
@ As multiplication by 3 is performed as 2*n+n, below are inline
@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
@ corresponding subroutines for details.
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
ldr $a3,[$a_ptr,#12]
adcs $a1,$a1,$a1
ldr $a4,[$a_ptr,#16]
adcs $a2,$a2,$a2
ldr $a5,[$a_ptr,#20]
adcs $a3,$a3,$a3
ldr $a6,[$a_ptr,#24]
adcs $a4,$a4,$a4
ldr $a7,[$a_ptr,#28]
adcs $a5,$a5,$a5
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
adc $ff,$ff,#0
subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
sbcs $a1,$a1,#-1
sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
sbcs $a5,$a5,#0
sbcs $a6,$a6,#1
sbcs $a7,$a7,#-1
sbc $ff,$ff,#0
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
adcs $a2,$a2,$ff
adcs $a3,$a3,#0
adcs $a4,$a4,#0
ldr $b_ptr,[$a_ptr,#0]
adcs $a5,$a5,#0
ldr $t1,[$a_ptr,#4]
adcs $a6,$a6,$ff,lsr#31
ldr $t2,[$a_ptr,#8]
adc $a7,$a7,$ff
ldr $t0,[$a_ptr,#12]
adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
ldr $b_ptr,[$a_ptr,#16]
adcs $a1,$a1,$t1
ldr $t1,[$a_ptr,#20]
adcs $a2,$a2,$t2
ldr $t2,[$a_ptr,#24]
adcs $a3,$a3,$t0
ldr $t3,[$a_ptr,#28]
adcs $a4,$a4,$b_ptr
adcs $a5,$a5,$t1
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
adc $ff,$ff,#0
ldr lr,[sp],#4 @ pop lr
b .Lreduce_by_sub
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
@ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_div_by_2
.type ecp_nistz256_div_by_2,%function
.align 4
ecp_nistz256_div_by_2:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
.type __ecp_nistz256_div_by_2,%function
.align 4
__ecp_nistz256_div_by_2:
@ ret = (a is odd ? a+mod : a) >> 1
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
mov $ff,$a0,lsl#31 @ place least significant bit to most
@ significant position, now arithmetic
@ right shift by 31 will produce -1 or
@ 0, while logical right shift 1 or 0,
@ this is how modulus is conditionally
@ synthesized in this case...
ldr $a3,[$a_ptr,#12]
adds $a0,$a0,$ff,asr#31
ldr $a4,[$a_ptr,#16]
adcs $a1,$a1,$ff,asr#31
ldr $a5,[$a_ptr,#20]
adcs $a2,$a2,$ff,asr#31
ldr $a6,[$a_ptr,#24]
adcs $a3,$a3,#0
ldr $a7,[$a_ptr,#28]
adcs $a4,$a4,#0
mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
@ because it doesn't affect flags
adcs $a5,$a5,#0
orr $a0,$a0,$a1,lsl#31
adcs $a6,$a6,$ff,lsr#31
mov $b_ptr,#0
adcs $a7,$a7,$ff,asr#31
mov $a1,$a1,lsr#1
adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
orr $a1,$a1,$a2,lsl#31
mov $a2,$a2,lsr#1
str $a0,[$r_ptr,#0]
orr $a2,$a2,$a3,lsl#31
mov $a3,$a3,lsr#1
str $a1,[$r_ptr,#4]
orr $a3,$a3,$a4,lsl#31
mov $a4,$a4,lsr#1
str $a2,[$r_ptr,#8]
orr $a4,$a4,$a5,lsl#31
mov $a5,$a5,lsr#1
str $a3,[$r_ptr,#12]
orr $a5,$a5,$a6,lsl#31
mov $a6,$a6,lsr#1
str $a4,[$r_ptr,#16]
orr $a6,$a6,$a7,lsl#31
mov $a7,$a7,lsr#1
str $a5,[$r_ptr,#20]
orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
.globl ecp_nistz256_sub
.type ecp_nistz256_sub,%function
.align 4
ecp_nistz256_sub:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_sub,.-ecp_nistz256_sub
.type __ecp_nistz256_sub,%function
.align 4
__ecp_nistz256_sub:
str lr,[sp,#-4]! @ push lr
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
ldr $a3,[$a_ptr,#12]
ldr $a4,[$a_ptr,#16]
ldr $t0,[$b_ptr,#0]
ldr $a5,[$a_ptr,#20]
ldr $t1,[$b_ptr,#4]
ldr $a6,[$a_ptr,#24]
ldr $t2,[$b_ptr,#8]
ldr $a7,[$a_ptr,#28]
ldr $t3,[$b_ptr,#12]
subs $a0,$a0,$t0
ldr $t0,[$b_ptr,#16]
sbcs $a1,$a1,$t1
ldr $t1,[$b_ptr,#20]
sbcs $a2,$a2,$t2
ldr $t2,[$b_ptr,#24]
sbcs $a3,$a3,$t3
ldr $t3,[$b_ptr,#28]
sbcs $a4,$a4,$t0
sbcs $a5,$a5,$t1
sbcs $a6,$a6,$t2
sbcs $a7,$a7,$t3
sbc $ff,$ff,$ff @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
.Lreduce_by_add:
@ if a-b borrows, add modulus.
@
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ broadcasting borrow bit to a register, $ff, and using it as
@ a whole or extracting single bit.
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_sub,.-__ecp_nistz256_sub
@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_neg
.type ecp_nistz256_neg,%function
.align 4
ecp_nistz256_neg:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_neg,.-ecp_nistz256_neg
.type __ecp_nistz256_neg,%function
.align 4
__ecp_nistz256_neg:
ldr $a0,[$a_ptr,#0]
eor $ff,$ff,$ff
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
subs $a0,$ff,$a0
ldr $a3,[$a_ptr,#12]
sbcs $a1,$ff,$a1
ldr $a4,[$a_ptr,#16]
sbcs $a2,$ff,$a2
ldr $a5,[$a_ptr,#20]
sbcs $a3,$ff,$a3
ldr $a6,[$a_ptr,#24]
sbcs $a4,$ff,$a4
ldr $a7,[$a_ptr,#28]
sbcs $a5,$ff,$a5
sbcs $a6,$ff,$a6
sbcs $a7,$ff,$a7
sbc $ff,$ff,$ff
b .Lreduce_by_add
.size __ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
$code.=<<___;
@ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_sqr_mont
.type ecp_nistz256_sqr_mont,%function
.align 4
ecp_nistz256_sqr_mont:
mov $b_ptr,$a_ptr
b .Lecp_nistz256_mul_mont
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
@ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
.globl ecp_nistz256_mul_mont
.type ecp_nistz256_mul_mont,%function
.align 4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
stmdb sp!,{r4-r12,lr}
bl __ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
.type __ecp_nistz256_mul_mont,%function
.align 4
__ecp_nistz256_mul_mont:
stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
ldr $bj,[$b_ptr,#0] @ b[0]
ldmia $a_ptr,{@acc[1]-@acc[8]}
umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so
@ that it can be addressed
@ without spending register
@ on address
umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
umull @acc[2],$t1,@acc[3],$bj
adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
umull @acc[3],$t2,@acc[4],$bj
adcs @acc[2],@acc[2],$t0
umull @acc[4],$t3,@acc[5],$bj
adcs @acc[3],@acc[3],$t1
umull @acc[5],$t0,@acc[6],$bj
adcs @acc[4],@acc[4],$t2
umull @acc[6],$t1,@acc[7],$bj
adcs @acc[5],@acc[5],$t3
umull @acc[7],$t2,@acc[8],$bj
adcs @acc[6],@acc[6],$t0
adcs @acc[7],@acc[7],$t1
eor $t3,$t3,$t3 @ first overflow bit is zero
adc @acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];
# Reduction iteration is normally performed by accumulating
# result of multiplication of modulus by "magic" digit [and
# omitting least significant word, which is guaranteed to
# be 0], but thanks to special form of modulus and "magic"
# digit being equal to least significant word, it can be
# performed with additions and subtractions alone. Indeed:
#
# ffff.0001.0000.0000.0000.ffff.ffff.ffff
# * abcd
# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
#
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
# rewrite above as:
#
# xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
# - abcd.0000.0000.0000.0000.0000.0000.abcd
#
# or marking redundant operations:
#
# xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
# + abcd.0000.abcd.0000.0000.abcd.----.----.----
# - abcd.----.----.----.----.----.----.----
$code.=<<___;
@ multiplication-less reduction $i
adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
ldr $bj,[sp,#40] @ restore b_ptr
adcs @acc[4],@acc[4],#0 @ r[4]+=0
adcs @acc[5],@acc[5],#0 @ r[5]+=0
adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
ldr $t1,[sp,#0] @ load a[0]
adcs @acc[7],@acc[7],#0 @ r[7]+=0
ldr $bj,[$bj,#4*$i] @ load b[i]
adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
eor $t0,$t0,$t0
adc $t3,$t3,#0 @ overflow bit
subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
ldr $t2,[sp,#4] @ a[1]
sbcs @acc[8],@acc[8],#0 @ r[8]-=0
umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
eor $t1,$t1,$t1
sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
@ that netto result is
@ addition of a value which
@ makes underflow impossible
ldr $t3,[sp,#8] @ a[2]
umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
str @acc[0],[sp,#36] @ temporarily offload overflow
eor $t2,$t2,$t2
ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
eor $t3,$t3,$t3
adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
ldr $t0,[sp,#16] @ a[4]
umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
eor $t4,$t4,$t4
adcs @acc[3],@acc[3],$t1
ldr $t1,[sp,#20] @ a[5]
umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
eor $t0,$t0,$t0
adcs @acc[4],@acc[4],$t2
ldr $t2,[sp,#24] @ a[6]
umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
eor $t1,$t1,$t1
adcs @acc[5],@acc[5],$t3
ldr $t3,[sp,#28] @ a[7]
umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
eor $t2,$t2,$t2
adcs @acc[6],@acc[6],$t4
ldr @acc[0],[sp,#36] @ restore overflow bit
umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
eor $t3,$t3,$t3
adcs @acc[7],@acc[7],$t0
adcs @acc[8],@acc[8],$t1
adcs @acc[0],$acc[0],$t2
adc $t3,$t3,#0 @ new overflow bit
___
push(@acc,shift(@acc)); # rotate registers, so that
# "r[i]" becomes r[i]
}
$code.=<<___;
@ last multiplication-less reduction
adds @acc[3],@acc[3],@acc[0]
ldr $r_ptr,[sp,#32] @ restore r_ptr
adcs @acc[4],@acc[4],#0
adcs @acc[5],@acc[5],#0
adcs @acc[6],@acc[6],@acc[0]
adcs @acc[7],@acc[7],#0
adcs @acc[8],@acc[8],@acc[0]
adc $t3,$t3,#0
subs @acc[7],@acc[7],@acc[0]
sbcs @acc[8],@acc[8],#0
sbc @acc[0],$t3,#0 @ overflow bit
@ Final step is "if result > mod, subtract mod", but we do it
@ "other way around", namely subtract modulus from result
@ and if it borrowed, add modulus back.
adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
sbcs @acc[4],@acc[4],#0
sbcs @acc[5],@acc[5],#0
sbcs @acc[6],@acc[6],#0
sbcs @acc[7],@acc[7],#1
adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
ldr lr,[sp,#44] @ restore lr
sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
add sp,sp,#48
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ broadcasting borrow bit to a register, @acc[0], and using it as
@ a whole or extracting single bit.
adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
adcs @acc[2],@acc[2],@acc[0]
str @acc[1],[$r_ptr,#0]
adcs @acc[3],@acc[3],@acc[0]
str @acc[2],[$r_ptr,#4]
adcs @acc[4],@acc[4],#0
str @acc[3],[$r_ptr,#8]
adcs @acc[5],@acc[5],#0
str @acc[4],[$r_ptr,#12]
adcs @acc[6],@acc[6],#0
str @acc[5],[$r_ptr,#16]
adcs @acc[7],@acc[7],@acc[0],lsr#31
str @acc[6],[$r_ptr,#20]
adc @acc[8],@acc[8],@acc[0]
str @acc[7],[$r_ptr,#24]
str @acc[8],[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}
{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
@ int r2);
.globl ecp_nistz256_scatter_w5
.type ecp_nistz256_scatter_w5,%function
.align 5
ecp_nistz256_scatter_w5:
stmdb sp!,{r4-r11}
add $out,$out,$index,lsl#2
ldmia $inp!,{r4-r11} @ X
str r4,[$out,#64*0-4]
str r5,[$out,#64*1-4]
str r6,[$out,#64*2-4]
str r7,[$out,#64*3-4]
str r8,[$out,#64*4-4]
str r9,[$out,#64*5-4]
str r10,[$out,#64*6-4]
str r11,[$out,#64*7-4]
add $out,$out,#64*8
ldmia $inp!,{r4-r11} @ Y
str r4,[$out,#64*0-4]
str r5,[$out,#64*1-4]
str r6,[$out,#64*2-4]
str r7,[$out,#64*3-4]
str r8,[$out,#64*4-4]
str r9,[$out,#64*5-4]
str r10,[$out,#64*6-4]
str r11,[$out,#64*7-4]
add $out,$out,#64*8
ldmia $inp,{r4-r11} @ Z
str r4,[$out,#64*0-4]
str r5,[$out,#64*1-4]
str r6,[$out,#64*2-4]
str r7,[$out,#64*3-4]
str r8,[$out,#64*4-4]
str r9,[$out,#64*5-4]
str r10,[$out,#64*6-4]
str r11,[$out,#64*7-4]
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
mov pc,lr
#endif
.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
@ int r2);
.globl ecp_nistz256_gather_w5
.type ecp_nistz256_gather_w5,%function
.align 5
ecp_nistz256_gather_w5:
stmdb sp!,{r4-r11}
cmp $index,#0
mov $mask,#0
#ifdef __thumb2__
itt ne
#endif
subne $index,$index,#1
movne $mask,#-1
add $inp,$inp,$index,lsl#2
ldr r4,[$inp,#64*0]
ldr r5,[$inp,#64*1]
ldr r6,[$inp,#64*2]
and r4,r4,$mask
ldr r7,[$inp,#64*3]
and r5,r5,$mask
ldr r8,[$inp,#64*4]
and r6,r6,$mask
ldr r9,[$inp,#64*5]
and r7,r7,$mask
ldr r10,[$inp,#64*6]
and r8,r8,$mask
ldr r11,[$inp,#64*7]
add $inp,$inp,#64*8
and r9,r9,$mask
and r10,r10,$mask
and r11,r11,$mask
stmia $out!,{r4-r11} @ X
ldr r4,[$inp,#64*0]
ldr r5,[$inp,#64*1]
ldr r6,[$inp,#64*2]
and r4,r4,$mask
ldr r7,[$inp,#64*3]
and r5,r5,$mask
ldr r8,[$inp,#64*4]
and r6,r6,$mask
ldr r9,[$inp,#64*5]
and r7,r7,$mask
ldr r10,[$inp,#64*6]
and r8,r8,$mask
ldr r11,[$inp,#64*7]
add $inp,$inp,#64*8
and r9,r9,$mask
and r10,r10,$mask
and r11,r11,$mask
stmia $out!,{r4-r11} @ Y
ldr r4,[$inp,#64*0]
ldr r5,[$inp,#64*1]
ldr r6,[$inp,#64*2]
and r4,r4,$mask
ldr r7,[$inp,#64*3]
and r5,r5,$mask
ldr r8,[$inp,#64*4]
and r6,r6,$mask
ldr r9,[$inp,#64*5]
and r7,r7,$mask
ldr r10,[$inp,#64*6]
and r8,r8,$mask
ldr r11,[$inp,#64*7]
and r9,r9,$mask
and r10,r10,$mask
and r11,r11,$mask
stmia $out,{r4-r11} @ Z
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
mov pc,lr
#endif
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
@ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
@ int r2);
.globl ecp_nistz256_scatter_w7
.type ecp_nistz256_scatter_w7,%function
.align 5
ecp_nistz256_scatter_w7:
add $out,$out,$index
mov $index,#64/4
.Loop_scatter_w7:
ldr $mask,[$inp],#4
subs $index,$index,#1
strb $mask,[$out,#64*0]
mov $mask,$mask,lsr#8
strb $mask,[$out,#64*1]
mov $mask,$mask,lsr#8
strb $mask,[$out,#64*2]
mov $mask,$mask,lsr#8
strb $mask,[$out,#64*3]
add $out,$out,#64*4
bne .Loop_scatter_w7
#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
mov pc,lr
#endif
.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
@ int r2);
.globl ecp_nistz256_gather_w7
.type ecp_nistz256_gather_w7,%function
.align 5
ecp_nistz256_gather_w7:
stmdb sp!,{r4-r7}
cmp $index,#0
mov $mask,#0
#ifdef __thumb2__
itt ne
#endif
subne $index,$index,#1
movne $mask,#-1
add $inp,$inp,$index
mov $index,#64/4
nop
.Loop_gather_w7:
ldrb r4,[$inp,#64*0]
subs $index,$index,#1
ldrb r5,[$inp,#64*1]
ldrb r6,[$inp,#64*2]
ldrb r7,[$inp,#64*3]
add $inp,$inp,#64*4
orr r4,r4,r5,lsl#8
orr r4,r4,r6,lsl#16
orr r4,r4,r7,lsl#24
and r4,r4,$mask
str r4,[$out],#4
bne .Loop_gather_w7
ldmia sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
bx lr
#else
mov pc,lr
#endif
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8 +10%
# Cortex-A9 -10%
# Snapdragon S4 +5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.
my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));
my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.globl ecp_nistz256_mul_mont_neon
.type ecp_nistz256_mul_mont_neon,%function
.align 5
ecp_nistz256_mul_mont_neon:
mov ip,sp
stmdb sp!,{r4-r9}
vstmdb sp!,{q4-q5} @ ABI specification says so
sub $toutptr,sp,#40
vld1.32 {${Bi}[0]},[$bptr,:32]!
veor $zero,$zero,$zero
vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-(
vzip.16 $Bi,$zero
mov sp,$toutptr @ alloca
vmov.i64 $mask,#0xffff
vmull.u32 @AxB[0],$Bi,${A0}[0]
vmull.u32 @AxB[1],$Bi,${A0}[1]
vmull.u32 @AxB[2],$Bi,${A1}[0]
vmull.u32 @AxB[3],$Bi,${A1}[1]
vshr.u64 $temp,@AxB[0]#lo,#16
vmull.u32 @AxB[4],$Bi,${A2}[0]
vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
vmull.u32 @AxB[5],$Bi,${A2}[1]
vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0]
vmull.u32 @AxB[6],$Bi,${A3}[0]
vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]!
veor $zero,$zero,$zero
vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction
vshl.u64 $mult,@AxB[0],#32
vadd.u64 @AxB[3],@AxB[3],@AxB[0]
vsub.u64 $mult,$mult,@AxB[0]
vzip.16 $Bi,$zero
vadd.u64 @AxB[6],@AxB[6],@AxB[0]
vadd.u64 @AxB[7],@AxB[7],$mult
___
push(@AxB,shift(@AxB));
$code.=<<___;
vmlal.u32 @AxB[0],$Bi,${A0}[0]
vmlal.u32 @AxB[1],$Bi,${A0}[1]
vmlal.u32 @AxB[2],$Bi,${A1}[0]
vmlal.u32 @AxB[3],$Bi,${A1}[1]
vshr.u64 $temp,@AxB[0]#lo,#16
vmlal.u32 @AxB[4],$Bi,${A2}[0]
vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
vmlal.u32 @AxB[5],$Bi,${A2}[1]
vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0]
vmlal.u32 @AxB[6],$Bi,${A3}[0]
vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction
vshl.u64 $mult,@AxB[0],#32
vadd.u64 @AxB[3],@AxB[3],@AxB[0]
vsub.u64 $mult,$mult,@AxB[0]
vadd.u64 @AxB[6],@AxB[6],@AxB[0]
vadd.u64 @AxB[7],@AxB[7],$mult
vshr.u64 $temp,@AxB[1]#lo,#16 @ convert
vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
vshr.u64 $temp,@AxB[1]#hi,#16
vzip.16 @AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]!
vshr.u64 $temp,@AxB[$_]#lo,#16
vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
vshr.u64 $temp,@AxB[$_]#hi,#16
vzip.16 @AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]!
vst1.32 {$temp},[$toutptr] @ upper 33 bits
ldr r1,[sp,#0]
ldr r2,[sp,#4]
ldr r3,[sp,#8]
subs r1,r1,#-1
ldr r4,[sp,#12]
sbcs r2,r2,#-1
ldr r5,[sp,#16]
sbcs r3,r3,#-1
ldr r6,[sp,#20]
sbcs r4,r4,#0
ldr r7,[sp,#24]
sbcs r5,r5,#0
ldr r8,[sp,#28]
sbcs r6,r6,#0
ldr r9,[sp,#32] @ top-most bit
sbcs r7,r7,#1
sub sp,ip,#40+16
sbcs r8,r8,#-1
sbc r9,r9,#0
vldmia sp!,{q4-q5}
adds r1,r1,r9
adcs r2,r2,r9
str r1,[$rptr,#0]
adcs r3,r3,r9
str r2,[$rptr,#4]
adcs r4,r4,#0
str r3,[$rptr,#8]
adcs r5,r5,#0
str r4,[$rptr,#12]
adcs r6,r6,#0
str r5,[$rptr,#16]
adcs r7,r7,r9,lsr#31
str r6,[$rptr,#20]
adcs r8,r8,r9
str r7,[$rptr,#24]
str r8,[$rptr,#28]
ldmia sp!,{r4-r9}
bx lr
.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}
{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
$t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;
$code.=<<___;
.type __ecp_nistz256_sub_from,%function
.align 5
__ecp_nistz256_sub_from:
str lr,[sp,#-4]! @ push lr
ldr $t0,[$b_ptr,#0]
ldr $t1,[$b_ptr,#4]
ldr $t2,[$b_ptr,#8]
ldr $t3,[$b_ptr,#12]
subs $a0,$a0,$t0
ldr $t0,[$b_ptr,#16]
sbcs $a1,$a1,$t1
ldr $t1,[$b_ptr,#20]
sbcs $a2,$a2,$t2
ldr $t2,[$b_ptr,#24]
sbcs $a3,$a3,$t3
ldr $t3,[$b_ptr,#28]
sbcs $a4,$a4,$t0
sbcs $a5,$a5,$t1
sbcs $a6,$a6,$t2
sbcs $a7,$a7,$t3
sbc $ff,$ff,$ff @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
.type __ecp_nistz256_sub_morf,%function
.align 5
__ecp_nistz256_sub_morf:
str lr,[sp,#-4]! @ push lr
ldr $t0,[$b_ptr,#0]
ldr $t1,[$b_ptr,#4]
ldr $t2,[$b_ptr,#8]
ldr $t3,[$b_ptr,#12]
subs $a0,$t0,$a0
ldr $t0,[$b_ptr,#16]
sbcs $a1,$t1,$a1
ldr $t1,[$b_ptr,#20]
sbcs $a2,$t2,$a2
ldr $t2,[$b_ptr,#24]
sbcs $a3,$t3,$a3
ldr $t3,[$b_ptr,#28]
sbcs $a4,$t0,$a4
sbcs $a5,$t1,$a5
sbcs $a6,$t2,$a6
sbcs $a7,$t3,$a7
sbc $ff,$ff,$ff @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
.type __ecp_nistz256_add_self,%function
.align 4
__ecp_nistz256_add_self:
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
adcs $a1,$a1,$a1
adcs $a2,$a2,$a2
adcs $a3,$a3,$a3
adcs $a4,$a4,$a4
adcs $a5,$a5,$a5
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
adc $ff,$ff,#0
@ if a+b >= modulus, subtract modulus.
@
@ But since comparison implies subtraction, we subtract
@ modulus and then add it back if subtraction borrowed.
subs $a0,$a0,#-1
sbcs $a1,$a1,#-1
sbcs $a2,$a2,#-1
sbcs $a3,$a3,#0
sbcs $a4,$a4,#0
sbcs $a5,$a5,#0
sbcs $a6,$a6,#1
sbcs $a7,$a7,#-1
sbc $ff,$ff,#0
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ using value of borrow as a whole or extracting single bit.
@ Follow $ff register...
adds $a0,$a0,$ff @ add synthesized modulus
adcs $a1,$a1,$ff
str $a0,[$r_ptr,#0]
adcs $a2,$a2,$ff
str $a1,[$r_ptr,#4]
adcs $a3,$a3,#0
str $a2,[$r_ptr,#8]
adcs $a4,$a4,#0
str $a3,[$r_ptr,#12]
adcs $a5,$a5,#0
str $a4,[$r_ptr,#16]
adcs $a6,$a6,$ff,lsr#31
str $a5,[$r_ptr,#20]
adcs $a7,$a7,$ff
str $a6,[$r_ptr,#24]
str $a7,[$r_ptr,#28]
mov pc,lr
.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
$code.=<<___;
.globl ecp_nistz256_point_double
.type ecp_nistz256_point_double,%function
.align 5
ecp_nistz256_point_double:
stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
sub sp,sp,#32*5
.Lpoint_double_shortcut:
add r3,sp,#$in_x
ldmia $a_ptr!,{r4-r11} @ copy in_x
stmia r3,{r4-r11}
add $r_ptr,sp,#$S
bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
add $b_ptr,$a_ptr,#32
add $a_ptr,$a_ptr,#32
add $r_ptr,sp,#$Zsqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
add $a_ptr,sp,#$S
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
ldr $b_ptr,[sp,#32*5+4]
add $a_ptr,$b_ptr,#32
add $b_ptr,$b_ptr,#64
add $r_ptr,sp,#$tmp0
bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
ldr $r_ptr,[sp,#32*5]
add $r_ptr,$r_ptr,#64
bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$Zsqr
add $r_ptr,sp,#$M
bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$Zsqr
add $r_ptr,sp,#$Zsqr
bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
add $a_ptr,sp,#$S
add $b_ptr,sp,#$S
add $r_ptr,sp,#$tmp0
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
add $a_ptr,sp,#$Zsqr
add $b_ptr,sp,#$M
add $r_ptr,sp,#$M
bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
ldr $r_ptr,[sp,#32*5]
add $a_ptr,sp,#$tmp0
add $r_ptr,$r_ptr,#32
bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
add $a_ptr,sp,#$M
add $r_ptr,sp,#$M
bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
add $r_ptr,sp,#$tmp0
bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
ldr $r_ptr,[sp,#32*5]
add $a_ptr,sp,#$M
add $b_ptr,sp,#$M
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
add $b_ptr,sp,#$tmp0
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
add $a_ptr,sp,#$M
add $b_ptr,sp,#$S
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
ldr $r_ptr,[sp,#32*5]
add $b_ptr,$r_ptr,#32
add $r_ptr,$r_ptr,#32
bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
# const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
$in1_x,$in1_y,$in1_z,
$in2_x,$in2_y,$in2_z,
$H,$Hsqr,$R,$Rsqr,$Hcub,
$U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use three of them for !in1infty, !in2intfy and
# result of check for zero.
$code.=<<___;
.globl ecp_nistz256_point_add
.type ecp_nistz256_point_add,%function
.align 5
ecp_nistz256_point_add:
stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
sub sp,sp,#32*18+16
ldmia $b_ptr!,{r4-r11} @ copy in2_x
add r3,sp,#$in2_x
stmia r3!,{r4-r11}
ldmia $b_ptr!,{r4-r11} @ copy in2_y
stmia r3!,{r4-r11}
ldmia $b_ptr,{r4-r11} @ copy in2_z
orr r12,r4,r5
orr r12,r12,r6
orr r12,r12,r7
orr r12,r12,r8
orr r12,r12,r9
orr r12,r12,r10
orr r12,r12,r11
cmp r12,#0
#ifdef __thumb2__
it ne
#endif
movne r12,#-1
stmia r3,{r4-r11}
str r12,[sp,#32*18+8] @ !in2infty
ldmia $a_ptr!,{r4-r11} @ copy in1_x
add r3,sp,#$in1_x
stmia r3!,{r4-r11}
ldmia $a_ptr!,{r4-r11} @ copy in1_y
stmia r3!,{r4-r11}
ldmia $a_ptr,{r4-r11} @ copy in1_z
orr r12,r4,r5
orr r12,r12,r6
orr r12,r12,r7
orr r12,r12,r8
orr r12,r12,r9
orr r12,r12,r10
orr r12,r12,r11
cmp r12,#0
#ifdef __thumb2__
it ne
#endif
movne r12,#-1
stmia r3,{r4-r11}
str r12,[sp,#32*18+4] @ !in1infty
add $a_ptr,sp,#$in2_z
add $b_ptr,sp,#$in2_z
add $r_ptr,sp,#$Z2sqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
add $a_ptr,sp,#$in1_z
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$Z1sqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
add $a_ptr,sp,#$in2_z
add $b_ptr,sp,#$Z2sqr
add $r_ptr,sp,#$S1
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
add $a_ptr,sp,#$in1_z
add $b_ptr,sp,#$Z1sqr
add $r_ptr,sp,#$S2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
add $a_ptr,sp,#$in1_y
add $b_ptr,sp,#$S1
add $r_ptr,sp,#$S1
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
add $a_ptr,sp,#$in2_y
add $b_ptr,sp,#$S2
add $r_ptr,sp,#$S2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
add $b_ptr,sp,#$S1
add $r_ptr,sp,#$R
bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1);
orr $a0,$a0,$a1 @ see if result is zero
orr $a2,$a2,$a3
orr $a4,$a4,$a5
orr $a0,$a0,$a2
orr $a4,$a4,$a6
orr $a0,$a0,$a7
add $a_ptr,sp,#$in1_x
orr $a0,$a0,$a4
add $b_ptr,sp,#$Z2sqr
str $a0,[sp,#32*18+12]
add $r_ptr,sp,#$U1
bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
add $a_ptr,sp,#$in2_x
add $b_ptr,sp,#$Z1sqr
add $r_ptr,sp,#$U2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
add $b_ptr,sp,#$U1
add $r_ptr,sp,#$H
bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1);
orr $a0,$a0,$a1 @ see if result is zero
orr $a2,$a2,$a3
orr $a4,$a4,$a5
orr $a0,$a0,$a2
orr $a4,$a4,$a6
orr $a0,$a0,$a7
orrs $a0,$a0,$a4
bne .Ladd_proceed @ is_equal(U1,U2)?
ldr $t0,[sp,#32*18+4]
ldr $t1,[sp,#32*18+8]
ldr $t2,[sp,#32*18+12]
tst $t0,$t1
beq .Ladd_proceed @ (in1infty || in2infty)?
tst $t2,$t2
beq .Ladd_double @ is_equal(S1,S2)?
ldr $r_ptr,[sp,#32*18+16]
eor r4,r4,r4
eor r5,r5,r5
eor r6,r6,r6
eor r7,r7,r7
eor r8,r8,r8
eor r9,r9,r9
eor r10,r10,r10
eor r11,r11,r11
stmia $r_ptr!,{r4-r11}
stmia $r_ptr!,{r4-r11}
stmia $r_ptr!,{r4-r11}
b .Ladd_done
.align 4
.Ladd_double:
ldr $a_ptr,[sp,#32*18+20]
add sp,sp,#32*(18-5)+16 @ difference in frame sizes
b .Lpoint_double_shortcut
.align 4
.Ladd_proceed:
add $a_ptr,sp,#$R
add $b_ptr,sp,#$R
add $r_ptr,sp,#$Rsqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$res_z
bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$H
add $r_ptr,sp,#$Hsqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
add $a_ptr,sp,#$in2_z
add $b_ptr,sp,#$res_z
add $r_ptr,sp,#$res_z
bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$Hsqr
add $r_ptr,sp,#$Hcub
bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
add $a_ptr,sp,#$Hsqr
add $b_ptr,sp,#$U1
add $r_ptr,sp,#$U2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
add $r_ptr,sp,#$Hsqr
bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
add $b_ptr,sp,#$Rsqr
add $r_ptr,sp,#$res_x
bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
add $b_ptr,sp,#$Hcub
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
add $b_ptr,sp,#$U2
add $r_ptr,sp,#$res_y
bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
add $a_ptr,sp,#$Hcub
add $b_ptr,sp,#$S1
add $r_ptr,sp,#$S2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
add $a_ptr,sp,#$R
add $b_ptr,sp,#$res_y
add $r_ptr,sp,#$res_y
bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
add $b_ptr,sp,#$S2
bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
ldr r11,[sp,#32*18+4] @ !in1intfy
ldr r12,[sp,#32*18+8] @ !in2intfy
add r1,sp,#$res_x
add r2,sp,#$in2_x
and r10,r11,r12
mvn r11,r11
add r3,sp,#$in1_x
and r11,r11,r12
mvn r12,r12
ldr $r_ptr,[sp,#32*18+16]
___
for($i=0;$i<96;$i+=8) { # conditional moves
$code.=<<___;
ldmia r1!,{r4-r5} @ res_x
ldmia r2!,{r6-r7} @ in2_x
ldmia r3!,{r8-r9} @ in1_x
and r4,r4,r10
and r5,r5,r10
and r6,r6,r11
and r7,r7,r11
and r8,r8,r12
and r9,r9,r12
orr r4,r4,r6
orr r5,r5,r7
orr r4,r4,r8
orr r5,r5,r9
stmia $r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}
########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
# const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
$in1_x,$in1_y,$in1_z,
$in2_x,$in2_y,
$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use two of them for !in1infty, !in2intfy.
my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
$code.=<<___;
.globl ecp_nistz256_point_add_affine
.type ecp_nistz256_point_add_affine,%function
.align 5
ecp_nistz256_point_add_affine:
stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
sub sp,sp,#32*15
ldmia $a_ptr!,{r4-r11} @ copy in1_x
add r3,sp,#$in1_x
stmia r3!,{r4-r11}
ldmia $a_ptr!,{r4-r11} @ copy in1_y
stmia r3!,{r4-r11}
ldmia $a_ptr,{r4-r11} @ copy in1_z
orr r12,r4,r5
orr r12,r12,r6
orr r12,r12,r7
orr r12,r12,r8
orr r12,r12,r9
orr r12,r12,r10
orr r12,r12,r11
cmp r12,#0
#ifdef __thumb2__
it ne
#endif
movne r12,#-1
stmia r3,{r4-r11}
str r12,[sp,#32*15+4] @ !in1infty
ldmia $b_ptr!,{r4-r11} @ copy in2_x
add r3,sp,#$in2_x
orr r12,r4,r5
orr r12,r12,r6
orr r12,r12,r7
orr r12,r12,r8
orr r12,r12,r9
orr r12,r12,r10
orr r12,r12,r11
stmia r3!,{r4-r11}
ldmia $b_ptr!,{r4-r11} @ copy in2_y
orr r12,r12,r4
orr r12,r12,r5
orr r12,r12,r6
orr r12,r12,r7
orr r12,r12,r8
orr r12,r12,r9
orr r12,r12,r10
orr r12,r12,r11
stmia r3!,{r4-r11}
cmp r12,#0
#ifdef __thumb2__
it ne
#endif
movne r12,#-1
str r12,[sp,#32*15+8] @ !in2infty
add $a_ptr,sp,#$in1_z
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$Z1sqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
add $a_ptr,sp,#$Z1sqr
add $b_ptr,sp,#$in2_x
add $r_ptr,sp,#$U2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
add $b_ptr,sp,#$in1_x
add $r_ptr,sp,#$H
bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x);
add $a_ptr,sp,#$Z1sqr
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$S2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$res_z
bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
add $a_ptr,sp,#$in2_y
add $b_ptr,sp,#$S2
add $r_ptr,sp,#$S2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
add $b_ptr,sp,#$in1_y
add $r_ptr,sp,#$R
bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$H
add $r_ptr,sp,#$Hsqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
add $a_ptr,sp,#$R
add $b_ptr,sp,#$R
add $r_ptr,sp,#$Rsqr
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$Hsqr
add $r_ptr,sp,#$Hcub
bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
add $a_ptr,sp,#$Hsqr
add $b_ptr,sp,#$in1_x
add $r_ptr,sp,#$U2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
add $r_ptr,sp,#$Hsqr
bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
add $b_ptr,sp,#$Rsqr
add $r_ptr,sp,#$res_x
bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
add $b_ptr,sp,#$Hcub
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
add $b_ptr,sp,#$U2
add $r_ptr,sp,#$res_y
bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
add $a_ptr,sp,#$Hcub
add $b_ptr,sp,#$in1_y
add $r_ptr,sp,#$S2
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
add $a_ptr,sp,#$R
add $b_ptr,sp,#$res_y
add $r_ptr,sp,#$res_y
bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
add $b_ptr,sp,#$S2
bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
ldr r11,[sp,#32*15+4] @ !in1intfy
ldr r12,[sp,#32*15+8] @ !in2intfy
add r1,sp,#$res_x
add r2,sp,#$in2_x
and r10,r11,r12
mvn r11,r11
add r3,sp,#$in1_x
and r11,r11,r12
mvn r12,r12
ldr $r_ptr,[sp,#32*15]
___
for($i=0;$i<64;$i+=8) { # conditional moves
$code.=<<___;
ldmia r1!,{r4-r5} @ res_x
ldmia r2!,{r6-r7} @ in2_x
ldmia r3!,{r8-r9} @ in1_x
and r4,r4,r10
and r5,r5,r10
and r6,r6,r11
and r7,r7,r11
and r8,r8,r12
and r9,r9,r12
orr r4,r4,r6
orr r5,r5,r7
orr r4,r4,r8
orr r5,r5,r9
stmia $r_ptr!,{r4-r5}
___
}
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
ldmia r1!,{r4-r5} @ res_z
ldmia r3!,{r8-r9} @ in1_z
and r4,r4,r10
and r5,r5,r10
and r6,r11,#@ONE_mont[$j]
and r7,r11,#@ONE_mont[$j+1]
and r8,r8,r12
and r9,r9,r12
orr r4,r4,r6
orr r5,r5,r7
orr r4,r4,r8
orr r5,r5,r9
stmia $r_ptr!,{r4-r5}
___
}
$code.=<<___;
add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
} }}}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
close STDOUT; # enforce flush