tdfonline.com.ar - c99shell

!C99Shell v. 2.1 [PHP 8 Update] [02.02.2022]!
Software: Apache/2.4.53 (Unix) OpenSSL/1.1.1o PHP/7.4.29 mod_perl/2.0.12 Perl/v5.34.1. PHP/7.4.29 uname -a: Linux vps-2738122-x 4.15.0-213-generic #224-Ubuntu SMP Mon Jun 19 13:30:12 UTC 2023 x86_64 uid=1(daemon) gid=1(daemon) grupos=1(daemon) Safe-mode: OFF (not secure) /opt/apex_tdfonline/proyectos/tdfonline/www/docs/openssl/crypto/ec/asm/ drwxr-xr-x Free 12.04 GB of 61.93 GB (19.43%) Encoder Tools Proc. FTP brute Sec. SQL PHP-code Update Feedback Self remove Logout

#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#            with/without -DECP_NISTZ256_ASM
# POWER7        +260-530%
# POWER8        +220-340%

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp="r1";

{
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);    # used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine    "any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"        or
open TABLE,"<${dir}../ecp_nistz256_table.c"    or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
    s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index or @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type    ecp_nistz256_precomputed,\@object
.globl    ecp_nistz256_precomputed
.align    12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#    1111222233334444
#    1234123412341234
for(1..37) {
    @tbl = splice(@arr,0,64*16);
    for($i=0;$i<64;$i++) {
        undef @line;
        for($j=0;$j<64;$j++) {
            push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
        }
        $code.=".byte\t";
        $code.=join(',',map { sprintf "0x%02x",$_} @line);
        $code.="\n";
    }
}

$code.=<<___;
.size    ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz    "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void    ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#                         const BN_ULONG x2[4]);
.globl    ecp_nistz256_mul_mont
.align    5
ecp_nistz256_mul_mont:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r22,48($sp)
    std    r23,56($sp)
    std    r24,64($sp)
    std    r25,72($sp)
    std    r26,80($sp)
    std    r27,88($sp)
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $a0,0($ap)
    ld    $bi,0($bp)
    ld    $a1,8($ap)
    ld    $a2,16($ap)
    ld    $a3,24($ap)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_mul_mont

    mtlr    r0
    ld    r22,48($sp)
    ld    r23,56($sp)
    ld    r24,64($sp)
    ld    r25,72($sp)
    ld    r26,80($sp)
    ld    r27,88($sp)
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,10,3,0
    .long    0
.size    ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void    ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl    ecp_nistz256_sqr_mont
.align    4
ecp_nistz256_sqr_mont:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r22,48($sp)
    std    r23,56($sp)
    std    r24,64($sp)
    std    r25,72($sp)
    std    r26,80($sp)
    std    r27,88($sp)
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $a0,0($ap)
    ld    $a1,8($ap)
    ld    $a2,16($ap)
    ld    $a3,24($ap)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_sqr_mont

    mtlr    r0
    ld    r22,48($sp)
    ld    r23,56($sp)
    ld    r24,64($sp)
    ld    r25,72($sp)
    ld    r26,80($sp)
    ld    r27,88($sp)
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,10,2,0
    .long    0
.size    ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void    ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#                    const BN_ULONG x2[4]);
.globl    ecp_nistz256_add
.align    4
ecp_nistz256_add:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $acc0,0($ap)
    ld    $t0,  0($bp)
    ld    $acc1,8($ap)
    ld    $t1,  8($bp)
    ld    $acc2,16($ap)
    ld    $t2,  16($bp)
    ld    $acc3,24($ap)
    ld    $t3,  24($bp)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_add

    mtlr    r0
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,4,3,0
    .long    0
.size    ecp_nistz256_add,.-ecp_nistz256_add

# void    ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl    ecp_nistz256_div_by_2
.align    4
ecp_nistz256_div_by_2:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $acc0,0($ap)
    ld    $acc1,8($ap)
    ld    $acc2,16($ap)
    ld    $acc3,24($ap)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_div_by_2

    mtlr    r0
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,4,2,0
    .long    0
.size    ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void    ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl    ecp_nistz256_mul_by_2
.align    4
ecp_nistz256_mul_by_2:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $acc0,0($ap)
    ld    $acc1,8($ap)
    ld    $acc2,16($ap)
    ld    $acc3,24($ap)

    mr    $t0,$acc0
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_add    # ret = a+a    // 2*a

    mtlr    r0
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,4,3,0
    .long    0
.size    ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void    ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl    ecp_nistz256_mul_by_3
.align    4
ecp_nistz256_mul_by_3:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $acc0,0($ap)
    ld    $acc1,8($ap)
    ld    $acc2,16($ap)
    ld    $acc3,24($ap)

    mr    $t0,$acc0
    std    $acc0,64($sp)
    mr    $t1,$acc1
    std    $acc1,72($sp)
    mr    $t2,$acc2
    std    $acc2,80($sp)
    mr    $t3,$acc3
    std    $acc3,88($sp)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_add    # ret = a+a    // 2*a

    ld    $t0,64($sp)
    ld    $t1,72($sp)
    ld    $t2,80($sp)
    ld    $t3,88($sp)

    bl    __ecp_nistz256_add    # ret += a    // 2*a+a=3*a

    mtlr    r0
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,4,2,0
    .long    0
.size    ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void    ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#                        const BN_ULONG x2[4]);
.globl    ecp_nistz256_sub
.align    4
ecp_nistz256_sub:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    ld    $acc0,0($ap)
    ld    $acc1,8($ap)
    ld    $acc2,16($ap)
    ld    $acc3,24($ap)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_sub_from

    mtlr    r0
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,4,3,0
    .long    0
.size    ecp_nistz256_sub,.-ecp_nistz256_sub

# void    ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl    ecp_nistz256_neg
.align    4
ecp_nistz256_neg:
    stdu    $sp,-128($sp)
    mflr    r0
    std    r28,96($sp)
    std    r29,104($sp)
    std    r30,112($sp)
    std    r31,120($sp)

    mr    $bp,$ap
    li    $acc0,0
    li    $acc1,0
    li    $acc2,0
    li    $acc3,0

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    bl    __ecp_nistz256_sub_from

    mtlr    r0
    ld    r28,96($sp)
    ld    r29,104($sp)
    ld    r30,112($sp)
    ld    r31,120($sp)
    addi    $sp,$sp,128
    blr
    .long    0
    .byte    0,12,4,0,0x80,4,2,0
    .long    0
.size    ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
.type    __ecp_nistz256_mul_mont,\@function
.align    4
__ecp_nistz256_mul_mont:
    mulld    $acc0,$a0,$bi        # a[0]*b[0]
    mulhdu    $t0,$a0,$bi

    mulld    $acc1,$a1,$bi        # a[1]*b[0]
    mulhdu    $t1,$a1,$bi

    mulld    $acc2,$a2,$bi        # a[2]*b[0]
    mulhdu    $t2,$a2,$bi

    mulld    $acc3,$a3,$bi        # a[3]*b[0]
    mulhdu    $t3,$a3,$bi
    ld    $bi,8($bp)        # b[1]

    addc    $acc1,$acc1,$t0        # accumulate high parts of multiplication
     sldi    $t0,$acc0,32
    adde    $acc2,$acc2,$t1
     srdi    $t1,$acc0,32
    adde    $acc3,$acc3,$t2
    addze    $acc4,$t3
    li    $acc5,0
___
for($i=1;$i<4;$i++) {
    ################################################################
    # Reduction iteration is normally performed by accumulating
    # result of multiplication of modulus by "magic" digit [and
    # omitting least significant word, which is guaranteed to
    # be 0], but thanks to special form of modulus and "magic"
    # digit being equal to least significant word, it can be
    # performed with additions and subtractions alone. Indeed:
    #
    #            ffff0001.00000000.0000ffff.ffffffff
    # *                                     abcdefgh
    # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
    #
    # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
    # rewrite above as:
    #
    #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
    # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
    # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
    #
    # or marking redundant operations:
    #
    #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
    # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
    # - 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
    subfc    $t2,$t0,$acc0        # "*0xffff0001"
    subfe    $t3,$t1,$acc0
    addc    $acc0,$acc1,$t0        # +=acc[0]<<96 and omit acc[0]
    adde    $acc1,$acc2,$t1
    adde    $acc2,$acc3,$t2        # +=acc[0]*0xffff0001
    adde    $acc3,$acc4,$t3
    addze    $acc4,$acc5

    mulld    $t0,$a0,$bi        # lo(a[0]*b[i])
    mulld    $t1,$a1,$bi        # lo(a[1]*b[i])
    mulld    $t2,$a2,$bi        # lo(a[2]*b[i])
    mulld    $t3,$a3,$bi        # lo(a[3]*b[i])
    addc    $acc0,$acc0,$t0        # accumulate low parts of multiplication
     mulhdu    $t0,$a0,$bi        # hi(a[0]*b[i])
    adde    $acc1,$acc1,$t1
     mulhdu    $t1,$a1,$bi        # hi(a[1]*b[i])
    adde    $acc2,$acc2,$t2
     mulhdu    $t2,$a2,$bi        # hi(a[2]*b[i])
    adde    $acc3,$acc3,$t3
     mulhdu    $t3,$a3,$bi        # hi(a[3]*b[i])
    addze    $acc4,$acc4
___
$code.=<<___    if ($i<3);
    ld    $bi,8*($i+1)($bp)    # b[$i+1]
___
$code.=<<___;
    addc    $acc1,$acc1,$t0        # accumulate high parts of multiplication
     sldi    $t0,$acc0,32
    adde    $acc2,$acc2,$t1
     srdi    $t1,$acc0,32
    adde    $acc3,$acc3,$t2
    adde    $acc4,$acc4,$t3
    li    $acc5,0
    addze    $acc5,$acc5
___
}
$code.=<<___;
    # last reduction
    subfc    $t2,$t0,$acc0        # "*0xffff0001"
    subfe    $t3,$t1,$acc0
    addc    $acc0,$acc1,$t0        # +=acc[0]<<96 and omit acc[0]
    adde    $acc1,$acc2,$t1
    adde    $acc2,$acc3,$t2        # +=acc[0]*0xffff0001
    adde    $acc3,$acc4,$t3
    addze    $acc4,$acc5

    li    $t2,0
    addic    $acc0,$acc0,1        # ret -= modulus
    subfe    $acc1,$poly1,$acc1
    subfe    $acc2,$t2,$acc2
    subfe    $acc3,$poly3,$acc3
    subfe    $acc4,$t2,$acc4

    addc    $acc0,$acc0,$acc4    # ret += modulus if borrow
    and    $t1,$poly1,$acc4
    and    $t3,$poly3,$acc4
    adde    $acc1,$acc1,$t1
    addze    $acc2,$acc2
    adde    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,1,0
    .long    0
.size    __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type    __ecp_nistz256_sqr_mont,\@function
.align    4
__ecp_nistz256_sqr_mont:
    ################################################################
    #  |  |  |  |  |  |a1*a0|  |
    #  |  |  |  |  |a2*a0|  |  |
    #  |  |a3*a2|a3*a0|  |  |  |
    #  |  |  |  |a2*a1|  |  |  |
    #  |  |  |a3*a1|  |  |  |  |
    # *|  |  |  |  |  |  |  | 2|
    # +|a3*a3|a2*a2|a1*a1|a0*a0|
    #  |--+--+--+--+--+--+--+--|
    #  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
    #
    #  "can't overflow" below mark carrying into high part of
    #  multiplication result, which can't overflow, because it
    #  can never be all ones.

    mulld    $acc1,$a1,$a0        # a[1]*a[0]
    mulhdu    $t1,$a1,$a0
    mulld    $acc2,$a2,$a0        # a[2]*a[0]
    mulhdu    $t2,$a2,$a0
    mulld    $acc3,$a3,$a0        # a[3]*a[0]
    mulhdu    $acc4,$a3,$a0

    addc    $acc2,$acc2,$t1        # accumulate high parts of multiplication
     mulld    $t0,$a2,$a1        # a[2]*a[1]
     mulhdu    $t1,$a2,$a1
    adde    $acc3,$acc3,$t2
     mulld    $t2,$a3,$a1        # a[3]*a[1]
     mulhdu    $t3,$a3,$a1
    addze    $acc4,$acc4        # can't overflow

    mulld    $acc5,$a3,$a2        # a[3]*a[2]
    mulhdu    $acc6,$a3,$a2

    addc    $t1,$t1,$t2        # accumulate high parts of multiplication
    addze    $t2,$t3            # can't overflow

    addc    $acc3,$acc3,$t0        # accumulate low parts of multiplication
    adde    $acc4,$acc4,$t1
    adde    $acc5,$acc5,$t2
    addze    $acc6,$acc6        # can't overflow

    addc    $acc1,$acc1,$acc1    # acc[1-6]*=2
    adde    $acc2,$acc2,$acc2
    adde    $acc3,$acc3,$acc3
    adde    $acc4,$acc4,$acc4
    adde    $acc5,$acc5,$acc5
    adde    $acc6,$acc6,$acc6
    li    $acc7,0
    addze    $acc7,$acc7

    mulld    $acc0,$a0,$a0        # a[0]*a[0]
    mulhdu    $a0,$a0,$a0
    mulld    $t1,$a1,$a1        # a[1]*a[1]
    mulhdu    $a1,$a1,$a1
    mulld    $t2,$a2,$a2        # a[2]*a[2]
    mulhdu    $a2,$a2,$a2
    mulld    $t3,$a3,$a3        # a[3]*a[3]
    mulhdu    $a3,$a3,$a3
    addc    $acc1,$acc1,$a0        # +a[i]*a[i]
     sldi    $t0,$acc0,32
    adde    $acc2,$acc2,$t1
     srdi    $t1,$acc0,32
    adde    $acc3,$acc3,$a1
    adde    $acc4,$acc4,$t2
    adde    $acc5,$acc5,$a2
    adde    $acc6,$acc6,$t3
    adde    $acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {            # reductions, see commentary in
                    # multiplication for details
$code.=<<___;
    subfc    $t2,$t0,$acc0        # "*0xffff0001"
    subfe    $t3,$t1,$acc0
    addc    $acc0,$acc1,$t0        # +=acc[0]<<96 and omit acc[0]
     sldi    $t0,$acc0,32
    adde    $acc1,$acc2,$t1
     srdi    $t1,$acc0,32
    adde    $acc2,$acc3,$t2        # +=acc[0]*0xffff0001
    addze    $acc3,$t3        # can't overflow
___
}
$code.=<<___;
    subfc    $t2,$t0,$acc0        # "*0xffff0001"
    subfe    $t3,$t1,$acc0
    addc    $acc0,$acc1,$t0        # +=acc[0]<<96 and omit acc[0]
    adde    $acc1,$acc2,$t1
    adde    $acc2,$acc3,$t2        # +=acc[0]*0xffff0001
    addze    $acc3,$t3        # can't overflow

    addc    $acc0,$acc0,$acc4    # accumulate upper half
    adde    $acc1,$acc1,$acc5
    adde    $acc2,$acc2,$acc6
    adde    $acc3,$acc3,$acc7
    li    $t2,0
    addze    $acc4,$t2

    addic    $acc0,$acc0,1        # ret -= modulus
    subfe    $acc1,$poly1,$acc1
    subfe    $acc2,$t2,$acc2
    subfe    $acc3,$poly3,$acc3
    subfe    $acc4,$t2,$acc4

    addc    $acc0,$acc0,$acc4    # ret += modulus if borrow
    and    $t1,$poly1,$acc4
    and    $t3,$poly3,$acc4
    adde    $acc1,$acc1,$t1
    addze    $acc2,$acc2
    adde    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,1,0
    .long    0
.size    __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $a0-$a3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type    __ecp_nistz256_add,\@function
.align    4
__ecp_nistz256_add:
    addc    $acc0,$acc0,$t0        # ret = a+b
    adde    $acc1,$acc1,$t1
    adde    $acc2,$acc2,$t2
    li    $t2,0
    adde    $acc3,$acc3,$t3
    addze    $t0,$t2

    # if a+b >= modulus, subtract modulus
    #
    # But since comparison implies subtraction, we subtract
    # modulus and then add it back if subtraction borrowed.

    subic    $acc0,$acc0,-1
    subfe    $acc1,$poly1,$acc1
    subfe    $acc2,$t2,$acc2
    subfe    $acc3,$poly3,$acc3
    subfe    $t0,$t2,$t0

    addc    $acc0,$acc0,$t0
    and    $t1,$poly1,$t0
    and    $t3,$poly3,$t0
    adde    $acc1,$acc1,$t1
    addze    $acc2,$acc2
    adde    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    __ecp_nistz256_add,.-__ecp_nistz256_add

.type    __ecp_nistz256_sub_from,\@function
.align    4
__ecp_nistz256_sub_from:
    ld    $t0,0($bp)
    ld    $t1,8($bp)
    ld    $t2,16($bp)
    ld    $t3,24($bp)
    subfc    $acc0,$t0,$acc0        # ret = a-b
    subfe    $acc1,$t1,$acc1
    subfe    $acc2,$t2,$acc2
    subfe    $acc3,$t3,$acc3
    subfe    $t0,$t0,$t0        # t0 = borrow ? -1 : 0

    # if a-b borrowed, add modulus

    addc    $acc0,$acc0,$t0        # ret -= modulus & t0
    and    $t1,$poly1,$t0
    and    $t3,$poly3,$t0
    adde    $acc1,$acc1,$t1
    addze    $acc2,$acc2
    adde    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type    __ecp_nistz256_sub_morf,\@function
.align    4
__ecp_nistz256_sub_morf:
    ld    $t0,0($bp)
    ld    $t1,8($bp)
    ld    $t2,16($bp)
    ld    $t3,24($bp)
    subfc    $acc0,$acc0,$t0     # ret = b-a
    subfe    $acc1,$acc1,$t1
    subfe    $acc2,$acc2,$t2
    subfe    $acc3,$acc3,$t3
    subfe    $t0,$t0,$t0        # t0 = borrow ? -1 : 0

    # if b-a borrowed, add modulus

    addc    $acc0,$acc0,$t0        # ret -= modulus & t0
    and    $t1,$poly1,$t0
    and    $t3,$poly3,$t0
    adde    $acc1,$acc1,$t1
    addze    $acc2,$acc2
    adde    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type    __ecp_nistz256_div_by_2,\@function
.align    4
__ecp_nistz256_div_by_2:
    andi.    $t0,$acc0,1
    addic    $acc0,$acc0,-1        # a += modulus
     neg    $t0,$t0
    adde    $acc1,$acc1,$poly1
     not    $t0,$t0
    addze    $acc2,$acc2
     li    $t2,0
    adde    $acc3,$acc3,$poly3
     and    $t1,$poly1,$t0
    addze    $ap,$t2            # ap = carry
     and    $t3,$poly3,$t0

    subfc    $acc0,$t0,$acc0        # a -= modulus if a was even
    subfe    $acc1,$t1,$acc1
    subfe    $acc2,$t2,$acc2
    subfe    $acc3,$t3,$acc3
    subfe    $ap,  $t2,$ap

    srdi    $acc0,$acc0,1
    sldi    $t0,$acc1,63
    srdi    $acc1,$acc1,1
    sldi    $t1,$acc2,63
    srdi    $acc2,$acc2,1
    sldi    $t2,$acc3,63
    srdi    $acc3,$acc3,1
    sldi    $t3,$ap,63
    or    $acc0,$acc0,$t0
    or    $acc1,$acc1,$t1
    or    $acc2,$acc2,$t2
    or    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,1,0
    .long    0
.size    __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("r$_",(20,21));

$code.=<<___;
.globl    ecp_nistz256_point_double
.align    5
ecp_nistz256_point_double:
    stdu    $sp,-$FRAME($sp)
    mflr    r0
    std    r20,$FRAME-8*12($sp)
    std    r21,$FRAME-8*11($sp)
    std    r22,$FRAME-8*10($sp)
    std    r23,$FRAME-8*9($sp)
    std    r24,$FRAME-8*8($sp)
    std    r25,$FRAME-8*7($sp)
    std    r26,$FRAME-8*6($sp)
    std    r27,$FRAME-8*5($sp)
    std    r28,$FRAME-8*4($sp)
    std    r29,$FRAME-8*3($sp)
    std    r30,$FRAME-8*2($sp)
    std    r31,$FRAME-8*1($sp)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001
.Ldouble_shortcut:
    ld    $acc0,32($ap)
    ld    $acc1,40($ap)
    ld    $acc2,48($ap)
    ld    $acc3,56($ap)
    mr    $t0,$acc0
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3
     ld    $a0,64($ap)        # forward load for p256_sqr_mont
     ld    $a1,72($ap)
     ld    $a2,80($ap)
     ld    $a3,88($ap)
     mr    $rp_real,$rp
     mr    $ap_real,$ap
    addi    $rp,$sp,$S
    bl    __ecp_nistz256_add    # p256_mul_by_2(S, in_y);

    addi    $rp,$sp,$Zsqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Zsqr, in_z);

    ld    $t0,0($ap_real)
    ld    $t1,8($ap_real)
    ld    $t2,16($ap_real)
    ld    $t3,24($ap_real)
    mr    $a0,$acc0        # put Zsqr aside for p256_sub
    mr    $a1,$acc1
    mr    $a2,$acc2
    mr    $a3,$acc3
    addi    $rp,$sp,$M
    bl    __ecp_nistz256_add    # p256_add(M, Zsqr, in_x);

    addi    $bp,$ap_real,0
    mr    $acc0,$a0        # restore Zsqr
    mr    $acc1,$a1
    mr    $acc2,$a2
    mr    $acc3,$a3
     ld    $a0,$S+0($sp)        # forward load for p256_sqr_mont
     ld    $a1,$S+8($sp)
     ld    $a2,$S+16($sp)
     ld    $a3,$S+24($sp)
    addi    $rp,$sp,$Zsqr
    bl    __ecp_nistz256_sub_morf    # p256_sub(Zsqr, in_x, Zsqr);

    addi    $rp,$sp,$S
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(S, S);

    ld    $bi,32($ap_real)
    ld    $a0,64($ap_real)
    ld    $a1,72($ap_real)
    ld    $a2,80($ap_real)
    ld    $a3,88($ap_real)
    addi    $bp,$ap_real,32
    addi    $rp,$sp,$tmp0
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(tmp0, in_z, in_y);

    mr    $t0,$acc0
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3
     ld    $a0,$S+0($sp)        # forward load for p256_sqr_mont
     ld    $a1,$S+8($sp)
     ld    $a2,$S+16($sp)
     ld    $a3,$S+24($sp)
    addi    $rp,$rp_real,64
    bl    __ecp_nistz256_add    # p256_mul_by_2(res_z, tmp0);

    addi    $rp,$sp,$tmp0
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(tmp0, S);

     ld    $bi,$Zsqr($sp)        # forward load for p256_mul_mont
     ld    $a0,$M+0($sp)
     ld    $a1,$M+8($sp)
     ld    $a2,$M+16($sp)
     ld    $a3,$M+24($sp)
    addi    $rp,$rp_real,32
    bl    __ecp_nistz256_div_by_2    # p256_div_by_2(res_y, tmp0);

    addi    $bp,$sp,$Zsqr
    addi    $rp,$sp,$M
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(M, M, Zsqr);

    mr    $t0,$acc0        # duplicate M
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3
    mr    $a0,$acc0        # put M aside
    mr    $a1,$acc1
    mr    $a2,$acc2
    mr    $a3,$acc3
    addi    $rp,$sp,$M
    bl    __ecp_nistz256_add
    mr    $t0,$a0            # restore M
    mr    $t1,$a1
    mr    $t2,$a2
    mr    $t3,$a3
     ld    $bi,0($ap_real)        # forward load for p256_mul_mont
     ld    $a0,$S+0($sp)
     ld    $a1,$S+8($sp)
     ld    $a2,$S+16($sp)
     ld    $a3,$S+24($sp)
    bl    __ecp_nistz256_add    # p256_mul_by_3(M, M);

    addi    $bp,$ap_real,0
    addi    $rp,$sp,$S
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S, S, in_x);

    mr    $t0,$acc0
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3
     ld    $a0,$M+0($sp)        # forward load for p256_sqr_mont
     ld    $a1,$M+8($sp)
     ld    $a2,$M+16($sp)
     ld    $a3,$M+24($sp)
    addi    $rp,$sp,$tmp0
    bl    __ecp_nistz256_add    # p256_mul_by_2(tmp0, S);

    addi    $rp,$rp_real,0
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(res_x, M);

    addi    $bp,$sp,$tmp0
    bl    __ecp_nistz256_sub_from    # p256_sub(res_x, res_x, tmp0);

    addi    $bp,$sp,$S
    addi    $rp,$sp,$S
    bl    __ecp_nistz256_sub_morf    # p256_sub(S, S, res_x);

    ld    $bi,$M($sp)
    mr    $a0,$acc0        # copy S
    mr    $a1,$acc1
    mr    $a2,$acc2
    mr    $a3,$acc3
    addi    $bp,$sp,$M
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S, S, M);

    addi    $bp,$rp_real,32
    addi    $rp,$rp_real,32
    bl    __ecp_nistz256_sub_from    # p256_sub(res_y, S, res_y);

    mtlr    r0
    ld    r20,$FRAME-8*12($sp)
    ld    r21,$FRAME-8*11($sp)
    ld    r22,$FRAME-8*10($sp)
    ld    r23,$FRAME-8*9($sp)
    ld    r24,$FRAME-8*8($sp)
    ld    r25,$FRAME-8*7($sp)
    ld    r26,$FRAME-8*6($sp)
    ld    r27,$FRAME-8*5($sp)
    ld    r28,$FRAME-8*4($sp)
    ld    r29,$FRAME-8*3($sp)
    ld    r30,$FRAME-8*2($sp)
    ld    r31,$FRAME-8*1($sp)
    addi    $sp,$sp,$FRAME
    blr
    .long    0
    .byte    0,12,4,0,0x80,12,2,0
    .long    0
.size    ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#                  const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl    ecp_nistz256_point_add
.align    5
ecp_nistz256_point_add:
    stdu    $sp,-$FRAME($sp)
    mflr    r0
    std    r16,$FRAME-8*16($sp)
    std    r17,$FRAME-8*15($sp)
    std    r18,$FRAME-8*14($sp)
    std    r19,$FRAME-8*13($sp)
    std    r20,$FRAME-8*12($sp)
    std    r21,$FRAME-8*11($sp)
    std    r22,$FRAME-8*10($sp)
    std    r23,$FRAME-8*9($sp)
    std    r24,$FRAME-8*8($sp)
    std    r25,$FRAME-8*7($sp)
    std    r26,$FRAME-8*6($sp)
    std    r27,$FRAME-8*5($sp)
    std    r28,$FRAME-8*4($sp)
    std    r29,$FRAME-8*3($sp)
    std    r30,$FRAME-8*2($sp)
    std    r31,$FRAME-8*1($sp)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    ld    $a0,64($bp)        # in2_z
    ld    $a1,72($bp)
    ld    $a2,80($bp)
    ld    $a3,88($bp)
     mr    $rp_real,$rp
     mr    $ap_real,$ap
     mr    $bp_real,$bp
    or    $t0,$a0,$a1
    or    $t2,$a2,$a3
    or    $in2infty,$t0,$t2
    neg    $t0,$in2infty
    or    $in2infty,$in2infty,$t0
    sradi    $in2infty,$in2infty,63    # !in2infty
    addi    $rp,$sp,$Z2sqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Z2sqr, in2_z);

    ld    $a0,64($ap_real)    # in1_z
    ld    $a1,72($ap_real)
    ld    $a2,80($ap_real)
    ld    $a3,88($ap_real)
    or    $t0,$a0,$a1
    or    $t2,$a2,$a3
    or    $in1infty,$t0,$t2
    neg    $t0,$in1infty
    or    $in1infty,$in1infty,$t0
    sradi    $in1infty,$in1infty,63    # !in1infty
    addi    $rp,$sp,$Z1sqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Z1sqr, in1_z);

    ld    $bi,64($bp_real)
    ld    $a0,$Z2sqr+0($sp)
    ld    $a1,$Z2sqr+8($sp)
    ld    $a2,$Z2sqr+16($sp)
    ld    $a3,$Z2sqr+24($sp)
    addi    $bp,$bp_real,64
    addi    $rp,$sp,$S1
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S1, Z2sqr, in2_z);

    ld    $bi,64($ap_real)
    ld    $a0,$Z1sqr+0($sp)
    ld    $a1,$Z1sqr+8($sp)
    ld    $a2,$Z1sqr+16($sp)
    ld    $a3,$Z1sqr+24($sp)
    addi    $bp,$ap_real,64
    addi    $rp,$sp,$S2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S2, Z1sqr, in1_z);

    ld    $bi,32($ap_real)
    ld    $a0,$S1+0($sp)
    ld    $a1,$S1+8($sp)
    ld    $a2,$S1+16($sp)
    ld    $a3,$S1+24($sp)
    addi    $bp,$ap_real,32
    addi    $rp,$sp,$S1
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S1, S1, in1_y);

    ld    $bi,32($bp_real)
    ld    $a0,$S2+0($sp)
    ld    $a1,$S2+8($sp)
    ld    $a2,$S2+16($sp)
    ld    $a3,$S2+24($sp)
    addi    $bp,$bp_real,32
    addi    $rp,$sp,$S2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S2, S2, in2_y);

    addi    $bp,$sp,$S1
     ld    $bi,$Z2sqr($sp)        # forward load for p256_mul_mont
     ld    $a0,0($ap_real)
     ld    $a1,8($ap_real)
     ld    $a2,16($ap_real)
     ld    $a3,24($ap_real)
    addi    $rp,$sp,$R
    bl    __ecp_nistz256_sub_from    # p256_sub(R, S2, S1);

    or    $acc0,$acc0,$acc1    # see if result is zero
    or    $acc2,$acc2,$acc3
    or    $temp,$acc0,$acc2

    addi    $bp,$sp,$Z2sqr
    addi    $rp,$sp,$U1
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(U1, in1_x, Z2sqr);

    ld    $bi,$Z1sqr($sp)
    ld    $a0,0($bp_real)
    ld    $a1,8($bp_real)
    ld    $a2,16($bp_real)
    ld    $a3,24($bp_real)
    addi    $bp,$sp,$Z1sqr
    addi    $rp,$sp,$U2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(U2, in2_x, Z1sqr);

    addi    $bp,$sp,$U1
     ld    $a0,$R+0($sp)        # forward load for p256_sqr_mont
     ld    $a1,$R+8($sp)
     ld    $a2,$R+16($sp)
     ld    $a3,$R+24($sp)
    addi    $rp,$sp,$H
    bl    __ecp_nistz256_sub_from    # p256_sub(H, U2, U1);

    or    $acc0,$acc0,$acc1    # see if result is zero
    or    $acc2,$acc2,$acc3
    or.    $acc0,$acc0,$acc2
    bne    .Ladd_proceed        # is_equal(U1,U2)?

    and.    $t0,$in1infty,$in2infty
    beq    .Ladd_proceed        # (in1infty || in2infty)?

    cmpldi    $temp,0
    beq    .Ladd_double        # is_equal(S1,S2)?

    xor    $a0,$a0,$a0
    std    $a0,0($rp_real)
    std    $a0,8($rp_real)
    std    $a0,16($rp_real)
    std    $a0,24($rp_real)
    std    $a0,32($rp_real)
    std    $a0,40($rp_real)
    std    $a0,48($rp_real)
    std    $a0,56($rp_real)
    std    $a0,64($rp_real)
    std    $a0,72($rp_real)
    std    $a0,80($rp_real)
    std    $a0,88($rp_real)
    b    .Ladd_done

.align    4
.Ladd_double:
    ld    $bp,0($sp)        # back-link
    mr    $ap,$ap_real
    mr    $rp,$rp_real
    ld    r16,$FRAME-8*16($sp)
    ld    r17,$FRAME-8*15($sp)
    ld    r18,$FRAME-8*14($sp)
    ld    r19,$FRAME-8*13($sp)
    stdu    $bp,$FRAME-288($sp)    # difference in stack frame sizes
    b    .Ldouble_shortcut

.align    4
.Ladd_proceed:
    addi    $rp,$sp,$Rsqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Rsqr, R);

    ld    $bi,64($ap_real)
    ld    $a0,$H+0($sp)
    ld    $a1,$H+8($sp)
    ld    $a2,$H+16($sp)
    ld    $a3,$H+24($sp)
    addi    $bp,$ap_real,64
    addi    $rp,$sp,$res_z
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(res_z, H, in1_z);

    ld    $a0,$H+0($sp)
    ld    $a1,$H+8($sp)
    ld    $a2,$H+16($sp)
    ld    $a3,$H+24($sp)
    addi    $rp,$sp,$Hsqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Hsqr, H);

    ld    $bi,64($bp_real)
    ld    $a0,$res_z+0($sp)
    ld    $a1,$res_z+8($sp)
    ld    $a2,$res_z+16($sp)
    ld    $a3,$res_z+24($sp)
    addi    $bp,$bp_real,64
    addi    $rp,$sp,$res_z
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(res_z, res_z, in2_z);

    ld    $bi,$H($sp)
    ld    $a0,$Hsqr+0($sp)
    ld    $a1,$Hsqr+8($sp)
    ld    $a2,$Hsqr+16($sp)
    ld    $a3,$Hsqr+24($sp)
    addi    $bp,$sp,$H
    addi    $rp,$sp,$Hcub
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(Hcub, Hsqr, H);

    ld    $bi,$Hsqr($sp)
    ld    $a0,$U1+0($sp)
    ld    $a1,$U1+8($sp)
    ld    $a2,$U1+16($sp)
    ld    $a3,$U1+24($sp)
    addi    $bp,$sp,$Hsqr
    addi    $rp,$sp,$U2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(U2, U1, Hsqr);

    mr    $t0,$acc0
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3
    addi    $rp,$sp,$Hsqr
    bl    __ecp_nistz256_add    # p256_mul_by_2(Hsqr, U2);

    addi    $bp,$sp,$Rsqr
    addi    $rp,$sp,$res_x
    bl    __ecp_nistz256_sub_morf    # p256_sub(res_x, Rsqr, Hsqr);

    addi    $bp,$sp,$Hcub
    bl    __ecp_nistz256_sub_from    # p256_sub(res_x, res_x, Hcub);

    addi    $bp,$sp,$U2
     ld    $bi,$Hcub($sp)        # forward load for p256_mul_mont
     ld    $a0,$S1+0($sp)
     ld    $a1,$S1+8($sp)
     ld    $a2,$S1+16($sp)
     ld    $a3,$S1+24($sp)
    addi    $rp,$sp,$res_y
    bl    __ecp_nistz256_sub_morf    # p256_sub(res_y, U2, res_x);

    addi    $bp,$sp,$Hcub
    addi    $rp,$sp,$S2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S2, S1, Hcub);

    ld    $bi,$R($sp)
    ld    $a0,$res_y+0($sp)
    ld    $a1,$res_y+8($sp)
    ld    $a2,$res_y+16($sp)
    ld    $a3,$res_y+24($sp)
    addi    $bp,$sp,$R
    addi    $rp,$sp,$res_y
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(res_y, res_y, R);

    addi    $bp,$sp,$S2
    bl    __ecp_nistz256_sub_from    # p256_sub(res_y, res_y, S2);

    ld    $t0,0($bp_real)        # in2
    ld    $t1,8($bp_real)
    ld    $t2,16($bp_real)
    ld    $t3,24($bp_real)
    ld    $a0,$res_x+0($sp)    # res
    ld    $a1,$res_x+8($sp)
    ld    $a2,$res_x+16($sp)
    ld    $a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {        # conditional moves
$code.=<<___;
    ld    $acc0,$i+0($ap_real)    # in1
    ld    $acc1,$i+8($ap_real)
    ld    $acc2,$i+16($ap_real)
    ld    $acc3,$i+24($ap_real)
    andc    $t0,$t0,$in1infty
    andc    $t1,$t1,$in1infty
    andc    $t2,$t2,$in1infty
    andc    $t3,$t3,$in1infty
    and    $a0,$a0,$in1infty
    and    $a1,$a1,$in1infty
    and    $a2,$a2,$in1infty
    and    $a3,$a3,$in1infty
    or    $t0,$t0,$a0
    or    $t1,$t1,$a1
    or    $t2,$t2,$a2
    or    $t3,$t3,$a3
    andc    $acc0,$acc0,$in2infty
    andc    $acc1,$acc1,$in2infty
    andc    $acc2,$acc2,$in2infty
    andc    $acc3,$acc3,$in2infty
    and    $t0,$t0,$in2infty
    and    $t1,$t1,$in2infty
    and    $t2,$t2,$in2infty
    and    $t3,$t3,$in2infty
    or    $acc0,$acc0,$t0
    or    $acc1,$acc1,$t1
    or    $acc2,$acc2,$t2
    or    $acc3,$acc3,$t3

    ld    $t0,$i+32($bp_real)    # in2
    ld    $t1,$i+40($bp_real)
    ld    $t2,$i+48($bp_real)
    ld    $t3,$i+56($bp_real)
    ld    $a0,$res_x+$i+32($sp)
    ld    $a1,$res_x+$i+40($sp)
    ld    $a2,$res_x+$i+48($sp)
    ld    $a3,$res_x+$i+56($sp)
    std    $acc0,$i+0($rp_real)
    std    $acc1,$i+8($rp_real)
    std    $acc2,$i+16($rp_real)
    std    $acc3,$i+24($rp_real)
___
}
$code.=<<___;
    ld    $acc0,$i+0($ap_real)    # in1
    ld    $acc1,$i+8($ap_real)
    ld    $acc2,$i+16($ap_real)
    ld    $acc3,$i+24($ap_real)
    andc    $t0,$t0,$in1infty
    andc    $t1,$t1,$in1infty
    andc    $t2,$t2,$in1infty
    andc    $t3,$t3,$in1infty
    and    $a0,$a0,$in1infty
    and    $a1,$a1,$in1infty
    and    $a2,$a2,$in1infty
    and    $a3,$a3,$in1infty
    or    $t0,$t0,$a0
    or    $t1,$t1,$a1
    or    $t2,$t2,$a2
    or    $t3,$t3,$a3
    andc    $acc0,$acc0,$in2infty
    andc    $acc1,$acc1,$in2infty
    andc    $acc2,$acc2,$in2infty
    andc    $acc3,$acc3,$in2infty
    and    $t0,$t0,$in2infty
    and    $t1,$t1,$in2infty
    and    $t2,$t2,$in2infty
    and    $t3,$t3,$in2infty
    or    $acc0,$acc0,$t0
    or    $acc1,$acc1,$t1
    or    $acc2,$acc2,$t2
    or    $acc3,$acc3,$t3
    std    $acc0,$i+0($rp_real)
    std    $acc1,$i+8($rp_real)
    std    $acc2,$i+16($rp_real)
    std    $acc3,$i+24($rp_real)

.Ladd_done:
    mtlr    r0
    ld    r16,$FRAME-8*16($sp)
    ld    r17,$FRAME-8*15($sp)
    ld    r18,$FRAME-8*14($sp)
    ld    r19,$FRAME-8*13($sp)
    ld    r20,$FRAME-8*12($sp)
    ld    r21,$FRAME-8*11($sp)
    ld    r22,$FRAME-8*10($sp)
    ld    r23,$FRAME-8*9($sp)
    ld    r24,$FRAME-8*8($sp)
    ld    r25,$FRAME-8*7($sp)
    ld    r26,$FRAME-8*6($sp)
    ld    r27,$FRAME-8*5($sp)
    ld    r28,$FRAME-8*4($sp)
    ld    r29,$FRAME-8*3($sp)
    ld    r30,$FRAME-8*2($sp)
    ld    r31,$FRAME-8*1($sp)
    addi    $sp,$sp,$FRAME
    blr
    .long    0
    .byte    0,12,4,0,0x80,16,3,0
    .long    0
.size    ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#                     const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl    ecp_nistz256_point_add_affine
.align    5
ecp_nistz256_point_add_affine:
    stdu    $sp,-$FRAME($sp)
    mflr    r0
    std    r16,$FRAME-8*16($sp)
    std    r17,$FRAME-8*15($sp)
    std    r18,$FRAME-8*14($sp)
    std    r19,$FRAME-8*13($sp)
    std    r20,$FRAME-8*12($sp)
    std    r21,$FRAME-8*11($sp)
    std    r22,$FRAME-8*10($sp)
    std    r23,$FRAME-8*9($sp)
    std    r24,$FRAME-8*8($sp)
    std    r25,$FRAME-8*7($sp)
    std    r26,$FRAME-8*6($sp)
    std    r27,$FRAME-8*5($sp)
    std    r28,$FRAME-8*4($sp)
    std    r29,$FRAME-8*3($sp)
    std    r30,$FRAME-8*2($sp)
    std    r31,$FRAME-8*1($sp)

    li    $poly1,-1
    srdi    $poly1,$poly1,32    # 0x00000000ffffffff
    li    $poly3,1
    orc    $poly3,$poly3,$poly1    # 0xffffffff00000001

    mr    $rp_real,$rp
    mr    $ap_real,$ap
    mr    $bp_real,$bp

    ld    $a0,64($ap)        # in1_z
    ld    $a1,72($ap)
    ld    $a2,80($ap)
    ld    $a3,88($ap)
    or    $t0,$a0,$a1
    or    $t2,$a2,$a3
    or    $in1infty,$t0,$t2
    neg    $t0,$in1infty
    or    $in1infty,$in1infty,$t0
    sradi    $in1infty,$in1infty,63    # !in1infty

    ld    $acc0,0($bp)        # in2_x
    ld    $acc1,8($bp)
    ld    $acc2,16($bp)
    ld    $acc3,24($bp)
    ld    $t0,32($bp)        # in2_y
    ld    $t1,40($bp)
    ld    $t2,48($bp)
    ld    $t3,56($bp)
    or    $acc0,$acc0,$acc1
    or    $acc2,$acc2,$acc3
    or    $acc0,$acc0,$acc2
    or    $t0,$t0,$t1
    or    $t2,$t2,$t3
    or    $t0,$t0,$t2
    or    $in2infty,$acc0,$t0
    neg    $t0,$in2infty
    or    $in2infty,$in2infty,$t0
    sradi    $in2infty,$in2infty,63    # !in2infty

    addi    $rp,$sp,$Z1sqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Z1sqr, in1_z);

    mr    $a0,$acc0
    mr    $a1,$acc1
    mr    $a2,$acc2
    mr    $a3,$acc3
    ld    $bi,0($bp_real)
    addi    $bp,$bp_real,0
    addi    $rp,$sp,$U2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(U2, Z1sqr, in2_x);

    addi    $bp,$ap_real,0
     ld    $bi,64($ap_real)    # forward load for p256_mul_mont
     ld    $a0,$Z1sqr+0($sp)
     ld    $a1,$Z1sqr+8($sp)
     ld    $a2,$Z1sqr+16($sp)
     ld    $a3,$Z1sqr+24($sp)
    addi    $rp,$sp,$H
    bl    __ecp_nistz256_sub_from    # p256_sub(H, U2, in1_x);

    addi    $bp,$ap_real,64
    addi    $rp,$sp,$S2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S2, Z1sqr, in1_z);

    ld    $bi,64($ap_real)
    ld    $a0,$H+0($sp)
    ld    $a1,$H+8($sp)
    ld    $a2,$H+16($sp)
    ld    $a3,$H+24($sp)
    addi    $bp,$ap_real,64
    addi    $rp,$sp,$res_z
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(res_z, H, in1_z);

    ld    $bi,32($bp_real)
    ld    $a0,$S2+0($sp)
    ld    $a1,$S2+8($sp)
    ld    $a2,$S2+16($sp)
    ld    $a3,$S2+24($sp)
    addi    $bp,$bp_real,32
    addi    $rp,$sp,$S2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S2, S2, in2_y);

    addi    $bp,$ap_real,32
     ld    $a0,$H+0($sp)        # forward load for p256_sqr_mont
     ld    $a1,$H+8($sp)
     ld    $a2,$H+16($sp)
     ld    $a3,$H+24($sp)
    addi    $rp,$sp,$R
    bl    __ecp_nistz256_sub_from    # p256_sub(R, S2, in1_y);

    addi    $rp,$sp,$Hsqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Hsqr, H);

    ld    $a0,$R+0($sp)
    ld    $a1,$R+8($sp)
    ld    $a2,$R+16($sp)
    ld    $a3,$R+24($sp)
    addi    $rp,$sp,$Rsqr
    bl    __ecp_nistz256_sqr_mont    # p256_sqr_mont(Rsqr, R);

    ld    $bi,$H($sp)
    ld    $a0,$Hsqr+0($sp)
    ld    $a1,$Hsqr+8($sp)
    ld    $a2,$Hsqr+16($sp)
    ld    $a3,$Hsqr+24($sp)
    addi    $bp,$sp,$H
    addi    $rp,$sp,$Hcub
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(Hcub, Hsqr, H);

    ld    $bi,0($ap_real)
    ld    $a0,$Hsqr+0($sp)
    ld    $a1,$Hsqr+8($sp)
    ld    $a2,$Hsqr+16($sp)
    ld    $a3,$Hsqr+24($sp)
    addi    $bp,$ap_real,0
    addi    $rp,$sp,$U2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(U2, in1_x, Hsqr);

    mr    $t0,$acc0
    mr    $t1,$acc1
    mr    $t2,$acc2
    mr    $t3,$acc3
    addi    $rp,$sp,$Hsqr
    bl    __ecp_nistz256_add    # p256_mul_by_2(Hsqr, U2);

    addi    $bp,$sp,$Rsqr
    addi    $rp,$sp,$res_x
    bl    __ecp_nistz256_sub_morf    # p256_sub(res_x, Rsqr, Hsqr);

    addi    $bp,$sp,$Hcub
    bl    __ecp_nistz256_sub_from    #  p256_sub(res_x, res_x, Hcub);

    addi    $bp,$sp,$U2
     ld    $bi,32($ap_real)    # forward load for p256_mul_mont
     ld    $a0,$Hcub+0($sp)
     ld    $a1,$Hcub+8($sp)
     ld    $a2,$Hcub+16($sp)
     ld    $a3,$Hcub+24($sp)
    addi    $rp,$sp,$res_y
    bl    __ecp_nistz256_sub_morf    # p256_sub(res_y, U2, res_x);

    addi    $bp,$ap_real,32
    addi    $rp,$sp,$S2
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(S2, in1_y, Hcub);

    ld    $bi,$R($sp)
    ld    $a0,$res_y+0($sp)
    ld    $a1,$res_y+8($sp)
    ld    $a2,$res_y+16($sp)
    ld    $a3,$res_y+24($sp)
    addi    $bp,$sp,$R
    addi    $rp,$sp,$res_y
    bl    __ecp_nistz256_mul_mont    # p256_mul_mont(res_y, res_y, R);

    addi    $bp,$sp,$S2
    bl    __ecp_nistz256_sub_from    # p256_sub(res_y, res_y, S2);

    ld    $t0,0($bp_real)        # in2
    ld    $t1,8($bp_real)
    ld    $t2,16($bp_real)
    ld    $t3,24($bp_real)
    ld    $a0,$res_x+0($sp)    # res
    ld    $a1,$res_x+8($sp)
    ld    $a2,$res_x+16($sp)
    ld    $a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {        # conditional moves
$code.=<<___;
    ld    $acc0,$i+0($ap_real)    # in1
    ld    $acc1,$i+8($ap_real)
    ld    $acc2,$i+16($ap_real)
    ld    $acc3,$i+24($ap_real)
    andc    $t0,$t0,$in1infty
    andc    $t1,$t1,$in1infty
    andc    $t2,$t2,$in1infty
    andc    $t3,$t3,$in1infty
    and    $a0,$a0,$in1infty
    and    $a1,$a1,$in1infty
    and    $a2,$a2,$in1infty
    and    $a3,$a3,$in1infty
    or    $t0,$t0,$a0
    or    $t1,$t1,$a1
    or    $t2,$t2,$a2
    or    $t3,$t3,$a3
    andc    $acc0,$acc0,$in2infty
    andc    $acc1,$acc1,$in2infty
    andc    $acc2,$acc2,$in2infty
    andc    $acc3,$acc3,$in2infty
    and    $t0,$t0,$in2infty
    and    $t1,$t1,$in2infty
    and    $t2,$t2,$in2infty
    and    $t3,$t3,$in2infty
    or    $acc0,$acc0,$t0
    or    $acc1,$acc1,$t1
    or    $acc2,$acc2,$t2
    or    $acc3,$acc3,$t3
___
$code.=<<___    if ($i==0);
    ld    $t0,32($bp_real)    # in2
    ld    $t1,40($bp_real)
    ld    $t2,48($bp_real)
    ld    $t3,56($bp_real)
___
$code.=<<___    if ($i==32);
    li    $t0,1            # Lone_mont
    not    $t1,$poly1
    li    $t2,-1
    not    $t3,$poly3
___
$code.=<<___;
    ld    $a0,$res_x+$i+32($sp)
    ld    $a1,$res_x+$i+40($sp)
    ld    $a2,$res_x+$i+48($sp)
    ld    $a3,$res_x+$i+56($sp)
    std    $acc0,$i+0($rp_real)
    std    $acc1,$i+8($rp_real)
    std    $acc2,$i+16($rp_real)
    std    $acc3,$i+24($rp_real)
___
}
$code.=<<___;
    ld    $acc0,$i+0($ap_real)    # in1
    ld    $acc1,$i+8($ap_real)
    ld    $acc2,$i+16($ap_real)
    ld    $acc3,$i+24($ap_real)
    andc    $t0,$t0,$in1infty
    andc    $t1,$t1,$in1infty
    andc    $t2,$t2,$in1infty
    andc    $t3,$t3,$in1infty
    and    $a0,$a0,$in1infty
    and    $a1,$a1,$in1infty
    and    $a2,$a2,$in1infty
    and    $a3,$a3,$in1infty
    or    $t0,$t0,$a0
    or    $t1,$t1,$a1
    or    $t2,$t2,$a2
    or    $t3,$t3,$a3
    andc    $acc0,$acc0,$in2infty
    andc    $acc1,$acc1,$in2infty
    andc    $acc2,$acc2,$in2infty
    andc    $acc3,$acc3,$in2infty
    and    $t0,$t0,$in2infty
    and    $t1,$t1,$in2infty
    and    $t2,$t2,$in2infty
    and    $t3,$t3,$in2infty
    or    $acc0,$acc0,$t0
    or    $acc1,$acc1,$t1
    or    $acc2,$acc2,$t2
    or    $acc3,$acc3,$t3
    std    $acc0,$i+0($rp_real)
    std    $acc1,$i+8($rp_real)
    std    $acc2,$i+16($rp_real)
    std    $acc3,$i+24($rp_real)

    mtlr    r0
    ld    r16,$FRAME-8*16($sp)
    ld    r17,$FRAME-8*15($sp)
    ld    r18,$FRAME-8*14($sp)
    ld    r19,$FRAME-8*13($sp)
    ld    r20,$FRAME-8*12($sp)
    ld    r21,$FRAME-8*11($sp)
    ld    r22,$FRAME-8*10($sp)
    ld    r23,$FRAME-8*9($sp)
    ld    r24,$FRAME-8*8($sp)
    ld    r25,$FRAME-8*7($sp)
    ld    r26,$FRAME-8*6($sp)
    ld    r27,$FRAME-8*5($sp)
    ld    r28,$FRAME-8*4($sp)
    ld    r29,$FRAME-8*3($sp)
    ld    r30,$FRAME-8*2($sp)
    ld    r31,$FRAME-8*1($sp)
    addi    $sp,$sp,$FRAME
    blr
    .long    0
    .byte    0,12,4,0,0x80,16,3,0
    .long    0
.size    ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t b[4]);
.globl    ecp_nistz256_ord_mul_mont
.align    5
ecp_nistz256_ord_mul_mont:
    stdu    $sp,-160($sp)
    std    r18,48($sp)
    std    r19,56($sp)
    std    r20,64($sp)
    std    r21,72($sp)
    std    r22,80($sp)
    std    r23,88($sp)
    std    r24,96($sp)
    std    r25,104($sp)
    std    r26,112($sp)
    std    r27,120($sp)
    std    r28,128($sp)
    std    r29,136($sp)
    std    r30,144($sp)
    std    r31,152($sp)

    ld    $a0,0($ap)
    ld    $bi,0($bp)
    ld    $a1,8($ap)
    ld    $a2,16($ap)
    ld    $a3,24($ap)

    lis    $ordk,0xccd1
    lis    $ord0,0xf3b9
    lis    $ord1,0xbce6
    ori    $ordk,$ordk,0xc8aa
    ori    $ord0,$ord0,0xcac2
    ori    $ord1,$ord1,0xfaad
    sldi    $ordk,$ordk,32
    sldi    $ord0,$ord0,32
    sldi    $ord1,$ord1,32
    oris    $ordk,$ordk,0xee00
    oris    $ord0,$ord0,0xfc63
    oris    $ord1,$ord1,0xa717
    ori    $ordk,$ordk,0xbc4f    # 0xccd1c8aaee00bc4f
    ori    $ord0,$ord0,0x2551    # 0xf3b9cac2fc632551
    ori    $ord1,$ord1,0x9e84    # 0xbce6faada7179e84
    li    $ord2,-1        # 0xffffffffffffffff
    sldi    $ord3,$ord2,32        # 0xffffffff00000000
    li    $zr,0

    mulld    $acc0,$a0,$bi        # a[0]*b[0]
    mulhdu    $t0,$a0,$bi

    mulld    $acc1,$a1,$bi        # a[1]*b[0]
    mulhdu    $t1,$a1,$bi

    mulld    $acc2,$a2,$bi        # a[2]*b[0]
    mulhdu    $t2,$a2,$bi

    mulld    $acc3,$a3,$bi        # a[3]*b[0]
    mulhdu    $acc4,$a3,$bi

    mulld    $t4,$acc0,$ordk

    addc    $acc1,$acc1,$t0        # accumulate high parts of multiplication
    adde    $acc2,$acc2,$t1
    adde    $acc3,$acc3,$t2
    addze    $acc4,$acc4
    li    $acc5,0
___
for ($i=1;$i<4;$i++) {
    ################################################################
    #            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
    # *                                     abcdefgh
    # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
    #
    # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
    # rewrite above as:
    #
    #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
    # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
    # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
    ld    $bi,8*$i($bp)        # b[i]

    sldi    $t0,$t4,32
    subfc    $acc2,$t4,$acc2
    srdi    $t1,$t4,32
    subfe    $acc3,$t0,$acc3
    subfe    $acc4,$t1,$acc4
    subfe    $acc5,$zr,$acc5

    addic    $t0,$acc0,-1        # discarded
    mulhdu    $t1,$ord0,$t4
    mulld    $t2,$ord1,$t4
    mulhdu    $t3,$ord1,$t4

    adde    $t2,$t2,$t1
     mulld    $t0,$a0,$bi
    addze    $t3,$t3
     mulld    $t1,$a1,$bi

    addc    $acc0,$acc1,$t2
     mulld    $t2,$a2,$bi
    adde    $acc1,$acc2,$t3
     mulld    $t3,$a3,$bi
    adde    $acc2,$acc3,$t4
    adde    $acc3,$acc4,$t4
    addze    $acc4,$acc5

    addc    $acc0,$acc0,$t0        # accumulate low parts
    mulhdu    $t0,$a0,$bi
    adde    $acc1,$acc1,$t1
    mulhdu    $t1,$a1,$bi
    adde    $acc2,$acc2,$t2
    mulhdu    $t2,$a2,$bi
    adde    $acc3,$acc3,$t3
    mulhdu    $t3,$a3,$bi
    addze    $acc4,$acc4
    mulld    $t4,$acc0,$ordk
    addc    $acc1,$acc1,$t0        # accumulate high parts
    adde    $acc2,$acc2,$t1
    adde    $acc3,$acc3,$t2
    adde    $acc4,$acc4,$t3
    addze    $acc5,$zr
___
}
$code.=<<___;
    sldi    $t0,$t4,32        # last reduction
    subfc    $acc2,$t4,$acc2
    srdi    $t1,$t4,32
    subfe    $acc3,$t0,$acc3
    subfe    $acc4,$t1,$acc4
    subfe    $acc5,$zr,$acc5

    addic    $t0,$acc0,-1        # discarded
    mulhdu    $t1,$ord0,$t4
    mulld    $t2,$ord1,$t4
    mulhdu    $t3,$ord1,$t4

    adde    $t2,$t2,$t1
    addze    $t3,$t3

    addc    $acc0,$acc1,$t2
    adde    $acc1,$acc2,$t3
    adde    $acc2,$acc3,$t4
    adde    $acc3,$acc4,$t4
    addze    $acc4,$acc5

    subfc    $acc0,$ord0,$acc0    # ret -= modulus
    subfe    $acc1,$ord1,$acc1
    subfe    $acc2,$ord2,$acc2
    subfe    $acc3,$ord3,$acc3
    subfe    $acc4,$zr,$acc4

    and    $t0,$ord0,$acc4
    and    $t1,$ord1,$acc4
    addc    $acc0,$acc0,$t0        # ret += modulus if borrow
    and    $t3,$ord3,$acc4
    adde    $acc1,$acc1,$t1
    adde    $acc2,$acc2,$acc4
    adde    $acc3,$acc3,$t3

    std    $acc0,0($rp)
    std    $acc1,8($rp)
    std    $acc2,16($rp)
    std    $acc3,24($rp)

    ld    r18,48($sp)
    ld    r19,56($sp)
    ld    r20,64($sp)
    ld    r21,72($sp)
    ld    r22,80($sp)
    ld    r23,88($sp)
    ld    r24,96($sp)
    ld    r25,104($sp)
    ld    r26,112($sp)
    ld    r27,120($sp)
    ld    r28,128($sp)
    ld    r29,136($sp)
    ld    r30,144($sp)
    ld    r31,152($sp)
    addi    $sp,$sp,160
    blr
    .long    0
    .byte    0,12,4,0,0x80,14,3,0
    .long    0
.size    ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t rep);
.globl    ecp_nistz256_ord_sqr_mont
.align    5
ecp_nistz256_ord_sqr_mont:
    stdu    $sp,-160($sp)
    std    r18,48($sp)
    std    r19,56($sp)
    std    r20,64($sp)
    std    r21,72($sp)
    std    r22,80($sp)
    std    r23,88($sp)
    std    r24,96($sp)
    std    r25,104($sp)
    std    r26,112($sp)
    std    r27,120($sp)
    std    r28,128($sp)
    std    r29,136($sp)
    std    r30,144($sp)
    std    r31,152($sp)

    mtctr    $bp

    ld    $a0,0($ap)
    ld    $a1,8($ap)
    ld    $a2,16($ap)
    ld    $a3,24($ap)

    lis    $ordk,0xccd1
    lis    $ord0,0xf3b9
    lis    $ord1,0xbce6
    ori    $ordk,$ordk,0xc8aa
    ori    $ord0,$ord0,0xcac2
    ori    $ord1,$ord1,0xfaad
    sldi    $ordk,$ordk,32
    sldi    $ord0,$ord0,32
    sldi    $ord1,$ord1,32
    oris    $ordk,$ordk,0xee00
    oris    $ord0,$ord0,0xfc63
    oris    $ord1,$ord1,0xa717
    ori    $ordk,$ordk,0xbc4f    # 0xccd1c8aaee00bc4f
    ori    $ord0,$ord0,0x2551    # 0xf3b9cac2fc632551
    ori    $ord1,$ord1,0x9e84    # 0xbce6faada7179e84
    li    $ord2,-1        # 0xffffffffffffffff
    sldi    $ord3,$ord2,32        # 0xffffffff00000000
    li    $zr,0
    b    .Loop_ord_sqr

.align    5
.Loop_ord_sqr:
    ################################################################
    #  |  |  |  |  |  |a1*a0|  |
    #  |  |  |  |  |a2*a0|  |  |
    #  |  |a3*a2|a3*a0|  |  |  |
    #  |  |  |  |a2*a1|  |  |  |
    #  |  |  |a3*a1|  |  |  |  |
    # *|  |  |  |  |  |  |  | 2|
    # +|a3*a3|a2*a2|a1*a1|a0*a0|
    #  |--+--+--+--+--+--+--+--|
    #  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
    #
    #  "can't overflow" below mark carrying into high part of
    #  multiplication result, which can't overflow, because it
    #  can never be all ones.

    mulld    $acc1,$a1,$a0        # a[1]*a[0]
    mulhdu    $t1,$a1,$a0
    mulld    $acc2,$a2,$a0        # a[2]*a[0]
    mulhdu    $t2,$a2,$a0
    mulld    $acc3,$a3,$a0        # a[3]*a[0]
    mulhdu    $acc4,$a3,$a0

    addc    $acc2,$acc2,$t1        # accumulate high parts of multiplication
     mulld    $t0,$a2,$a1        # a[2]*a[1]
     mulhdu    $t1,$a2,$a1
    adde    $acc3,$acc3,$t2
     mulld    $t2,$a3,$a1        # a[3]*a[1]
     mulhdu    $t3,$a3,$a1
    addze    $acc4,$acc4        # can't overflow

    mulld    $acc5,$a3,$a2        # a[3]*a[2]
    mulhdu    $acc6,$a3,$a2

    addc    $t1,$t1,$t2        # accumulate high parts of multiplication
     mulld    $acc0,$a0,$a0        # a[0]*a[0]
    addze    $t2,$t3            # can't overflow

    addc    $acc3,$acc3,$t0        # accumulate low parts of multiplication
     mulhdu    $a0,$a0,$a0
    adde    $acc4,$acc4,$t1
     mulld    $t1,$a1,$a1        # a[1]*a[1]
    adde    $acc5,$acc5,$t2
     mulhdu    $a1,$a1,$a1
    addze    $acc6,$acc6        # can't overflow

    addc    $acc1,$acc1,$acc1    # acc[1-6]*=2
     mulld    $t2,$a2,$a2        # a[2]*a[2]
    adde    $acc2,$acc2,$acc2
     mulhdu    $a2,$a2,$a2
    adde    $acc3,$acc3,$acc3
     mulld    $t3,$a3,$a3        # a[3]*a[3]
    adde    $acc4,$acc4,$acc4
     mulhdu    $a3,$a3,$a3
    adde    $acc5,$acc5,$acc5
    adde    $acc6,$acc6,$acc6
    addze    $acc7,$zr

    addc    $acc1,$acc1,$a0        # +a[i]*a[i]
     mulld    $t4,$acc0,$ordk
    adde    $acc2,$acc2,$t1
    adde    $acc3,$acc3,$a1
    adde    $acc4,$acc4,$t2
    adde    $acc5,$acc5,$a2
    adde    $acc6,$acc6,$t3
    adde    $acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {            # reductions
$code.=<<___;
    addic    $t0,$acc0,-1        # discarded
    mulhdu    $t1,$ord0,$t4
    mulld    $t2,$ord1,$t4
    mulhdu    $t3,$ord1,$t4

    adde    $t2,$t2,$t1
    addze    $t3,$t3

    addc    $acc0,$acc1,$t2
    adde    $acc1,$acc2,$t3
    adde    $acc2,$acc3,$t4
    adde    $acc3,$zr,$t4        # can't overflow
___
$code.=<<___    if ($i<3);
    mulld    $t3,$acc0,$ordk
___
$code.=<<___;
    sldi    $t0,$t4,32
    subfc    $acc1,$t4,$acc1
    srdi    $t1,$t4,32
    subfe    $acc2,$t0,$acc2
    subfe    $acc3,$t1,$acc3        # can't borrow
___
    ($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
    addc    $acc0,$acc0,$acc4    # accumulate upper half
    adde    $acc1,$acc1,$acc5
    adde    $acc2,$acc2,$acc6
    adde    $acc3,$acc3,$acc7
    addze    $acc4,$zr

    subfc    $acc0,$ord0,$acc0    # ret -= modulus
    subfe    $acc1,$ord1,$acc1
    subfe    $acc2,$ord2,$acc2
    subfe    $acc3,$ord3,$acc3
    subfe    $acc4,$zr,$acc4

    and    $t0,$ord0,$acc4
    and    $t1,$ord1,$acc4
    addc    $a0,$acc0,$t0        # ret += modulus if borrow
    and    $t3,$ord3,$acc4
    adde    $a1,$acc1,$t1
    adde    $a2,$acc2,$acc4
    adde    $a3,$acc3,$t3

    bdnz    .Loop_ord_sqr

    std    $a0,0($rp)
    std    $a1,8($rp)
    std    $a2,16($rp)
    std    $a3,24($rp)

    ld    r18,48($sp)
    ld    r19,56($sp)
    ld    r20,64($sp)
    ld    r21,72($sp)
    ld    r22,80($sp)
    ld    r23,88($sp)
    ld    r24,96($sp)
    ld    r25,104($sp)
    ld    r26,112($sp)
    ld    r27,120($sp)
    ld    r28,128($sp)
    ld    r29,136($sp)
    ld    r30,144($sp)
    ld    r31,152($sp)
    addi    $sp,$sp,160
    blr
    .long    0
    .byte    0,12,4,0,0x80,14,3,0
    .long    0
.size    ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}    }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void    ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#                int index);
.globl    ecp_nistz256_scatter_w5
.align    4
ecp_nistz256_scatter_w5:
    slwi    $index,$index,2
    add    $out,$out,$index

    ld    r8, 0($inp)        # X
    ld    r9, 8($inp)
    ld    r10,16($inp)
    ld    r11,24($inp)

    stw    r8, 64*0-4($out)
    srdi    r8, r8, 32
    stw    r9, 64*1-4($out)
    srdi    r9, r9, 32
    stw    r10,64*2-4($out)
    srdi    r10,r10,32
    stw    r11,64*3-4($out)
    srdi    r11,r11,32
    stw    r8, 64*4-4($out)
    stw    r9, 64*5-4($out)
    stw    r10,64*6-4($out)
    stw    r11,64*7-4($out)
    addi    $out,$out,64*8

    ld    r8, 32($inp)        # Y
    ld    r9, 40($inp)
    ld    r10,48($inp)
    ld    r11,56($inp)

    stw    r8, 64*0-4($out)
    srdi    r8, r8, 32
    stw    r9, 64*1-4($out)
    srdi    r9, r9, 32
    stw    r10,64*2-4($out)
    srdi    r10,r10,32
    stw    r11,64*3-4($out)
    srdi    r11,r11,32
    stw    r8, 64*4-4($out)
    stw    r9, 64*5-4($out)
    stw    r10,64*6-4($out)
    stw    r11,64*7-4($out)
    addi    $out,$out,64*8

    ld    r8, 64($inp)        # Z
    ld    r9, 72($inp)
    ld    r10,80($inp)
    ld    r11,88($inp)

    stw    r8, 64*0-4($out)
    srdi    r8, r8, 32
    stw    r9, 64*1-4($out)
    srdi    r9, r9, 32
    stw    r10,64*2-4($out)
    srdi    r10,r10,32
    stw    r11,64*3-4($out)
    srdi    r11,r11,32
    stw    r8, 64*4-4($out)
    stw    r9, 64*5-4($out)
    stw    r10,64*6-4($out)
    stw    r11,64*7-4($out)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void    ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#                int index);
.globl    ecp_nistz256_gather_w5
.align    4
ecp_nistz256_gather_w5:
    neg    r0,$index
    sradi    r0,r0,63

    add    $index,$index,r0
    slwi    $index,$index,2
    add    $inp,$inp,$index

    lwz    r5, 64*0($inp)
    lwz    r6, 64*1($inp)
    lwz    r7, 64*2($inp)
    lwz    r8, 64*3($inp)
    lwz    r9, 64*4($inp)
    lwz    r10,64*5($inp)
    lwz    r11,64*6($inp)
    lwz    r12,64*7($inp)
    addi    $inp,$inp,64*8
    sldi    r9, r9, 32
    sldi    r10,r10,32
    sldi    r11,r11,32
    sldi    r12,r12,32
    or    r5,r5,r9
    or    r6,r6,r10
    or    r7,r7,r11
    or    r8,r8,r12
    and    r5,r5,r0
    and    r6,r6,r0
    and    r7,r7,r0
    and    r8,r8,r0
    std    r5,0($out)        # X
    std    r6,8($out)
    std    r7,16($out)
    std    r8,24($out)

    lwz    r5, 64*0($inp)
    lwz    r6, 64*1($inp)
    lwz    r7, 64*2($inp)
    lwz    r8, 64*3($inp)
    lwz    r9, 64*4($inp)
    lwz    r10,64*5($inp)
    lwz    r11,64*6($inp)
    lwz    r12,64*7($inp)
    addi    $inp,$inp,64*8
    sldi    r9, r9, 32
    sldi    r10,r10,32
    sldi    r11,r11,32
    sldi    r12,r12,32
    or    r5,r5,r9
    or    r6,r6,r10
    or    r7,r7,r11
    or    r8,r8,r12
    and    r5,r5,r0
    and    r6,r6,r0
    and    r7,r7,r0
    and    r8,r8,r0
    std    r5,32($out)        # Y
    std    r6,40($out)
    std    r7,48($out)
    std    r8,56($out)

    lwz    r5, 64*0($inp)
    lwz    r6, 64*1($inp)
    lwz    r7, 64*2($inp)
    lwz    r8, 64*3($inp)
    lwz    r9, 64*4($inp)
    lwz    r10,64*5($inp)
    lwz    r11,64*6($inp)
    lwz    r12,64*7($inp)
    sldi    r9, r9, 32
    sldi    r10,r10,32
    sldi    r11,r11,32
    sldi    r12,r12,32
    or    r5,r5,r9
    or    r6,r6,r10
    or    r7,r7,r11
    or    r8,r8,r12
    and    r5,r5,r0
    and    r6,r6,r0
    and    r7,r7,r0
    and    r8,r8,r0
    std    r5,64($out)        # Z
    std    r6,72($out)
    std    r7,80($out)
    std    r8,88($out)

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void    ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#                int index);
.globl    ecp_nistz256_scatter_w7
.align    4
ecp_nistz256_scatter_w7:
    li    r0,8
    mtctr    r0
    add    $out,$out,$index
    subi    $inp,$inp,8

.Loop_scatter_w7:
    ldu    r0,8($inp)
    stb    r0,64*0($out)
    srdi    r0,r0,8
    stb    r0,64*1($out)
    srdi    r0,r0,8
    stb    r0,64*2($out)
    srdi    r0,r0,8
    stb    r0,64*3($out)
    srdi    r0,r0,8
    stb    r0,64*4($out)
    srdi    r0,r0,8
    stb    r0,64*5($out)
    srdi    r0,r0,8
    stb    r0,64*6($out)
    srdi    r0,r0,8
    stb    r0,64*7($out)
    addi    $out,$out,64*8
    bdnz    .Loop_scatter_w7

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void    ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#                int index);
.globl    ecp_nistz256_gather_w7
.align    4
ecp_nistz256_gather_w7:
    li    r0,8
    mtctr    r0
    neg    r0,$index
    sradi    r0,r0,63

    add    $index,$index,r0
    add    $inp,$inp,$index
    subi    $out,$out,8

.Loop_gather_w7:
    lbz    r5, 64*0($inp)
    lbz    r6, 64*1($inp)
    lbz    r7, 64*2($inp)
    lbz    r8, 64*3($inp)
    lbz    r9, 64*4($inp)
    lbz    r10,64*5($inp)
    lbz    r11,64*6($inp)
    lbz    r12,64*7($inp)
    addi    $inp,$inp,64*8

    sldi    r6, r6, 8
    sldi    r7, r7, 16
    sldi    r8, r8, 24
    sldi    r9, r9, 32
    sldi    r10,r10,40
    sldi    r11,r11,48
    sldi    r12,r12,56

    or    r5,r5,r6
    or    r7,r7,r8
    or    r9,r9,r10
    or    r11,r11,r12
    or    r5,r5,r7
    or    r9,r9,r11
    or    r5,r5,r9
    and    r5,r5,r0
    stdu    r5,8($out)
    bdnz    .Loop_gather_w7

    blr
    .long    0
    .byte    0,12,0x14,0,0,0,3,0
    .long    0
.size    ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";    # enforce flush
:: Command execute ::
Enter:	Select: