#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command line: <script> [flavour [output]]
#
#   flavour - assembler dialect handed to arm-xlate.pl.  When it is
#             missing or "void" the generated text is written as is.
#   output  - file to write.  When it is omitted STDOUT is left
#             untouched so the caller can redirect it.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then in ../../perlasm/.
    # If $0 has no directory component the search is relative to the
    # current directory.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on is filtered through arm-xlate.pl.
    # A failure to start the filter used to be ignored silently.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # Write the raw text to the named file; an unwritable path is now
    # reported instead of silently falling back to the old STDOUT.
    open STDOUT,">",$output
        or die "can't open $output: $!";
}

# Symbolic names for the AArch64 registers used by the templates below.
# The first five are the incoming argument registers x0..x4.
($r_ptr, $a_ptr, $b_ptr, $n_ptr, $n0) = map { 'x' . $_ } (0 .. 4);

@mod = map { 'x' . $_ } (5 .. 10);      # x5..x10
@a   = map { 'x' . $_ } (11 .. 16);     # x11..x16
$bi  = 'x' . 17;
@acc = map { 'x' . $_ } (19 .. 25);     # x19..x25
# The last three scratch names alias x0, x1 and x3, i.e. the registers
# also known as $r_ptr, $a_ptr and $n_ptr.
@tmp = ((map { 'x' . $_ } (26 .. 28)), 'x0', 'x1', 'x3');

# ---------------------------------------------------------------------
# Assembly template, first chunk.  The here-document below is
# interpolated by Perl (so $r_ptr, @mod[0], ... turn into the x-register
# names assigned above) and appended to $code as plain text; Perl does
# not execute any of it.
#
# Operands are arrays of 64-bit limbs with the least significant limb at
# offset 0: "384" routines touch 6 limbs (48 bytes), "384x384" routines
# 12 limbs (96 bytes).
#
# Every .globl entry point brackets its body with paciasp/autiasp, sets
# up a frame through x29 and saves the callee-saved registers before it
# reaches the local "__" helpers with bl.
#
# Routines emitted by this chunk:
#
#   add_mod_384x384, sub_mod_384x384
#       Load the six modulus limbs from [$n_ptr] into @mod and call
#       __add_mod_384x384 / __sub_mod_384x384.
#
#   __add_mod_384x384
#       Adds the 12-limb values at [$a_ptr] and [$b_ptr].  The low six
#       limbs of the sum are stored to [$r_ptr] as they are.  @mod is
#       subtracted from the upper six limbs (carry-out kept in $bi) and
#       "csel ...,lo" keeps the unsubtracted value when that subtraction
#       borrows.  The selected limbs are stored at [$r_ptr,#48].
#
#   __sub_mod_384x384
#       12-limb subtraction [$a_ptr] - [$b_ptr].  "sbc $bi,xzr,xzr"
#       turns the final borrow into a 0 / all-ones mask, and @mod ANDed
#       with that mask is added to the upper six limbs.
#
#   __add_mod_384, __sub_mod_384
#       6-limb versions of the two helpers above; result at [$r_ptr].
#
#   mul_mont_384x
#       Following its inline comments: t0 = a->re*b->re,
#       t1 = a->im*b->im, t2 = (a->re+a->im)*(b->re+b->im) - t0 - t1,
#       t0 = t0 - t1, then ret->re = redc(t0) and ret->im = redc(t2).
#       The "im" half of an operand sits 48 bytes after the "re" half.
#       Uses 288 bytes of stack scratch and copies the incoming
#       r_ptr/a_ptr/b_ptr to @tmp[0..2] (x26-x28).
#       NOTE(review): the inline comment on "mov @tmp[1],$a_ptr" says
#       "save b_ptr" although the register being saved is a_ptr.
#
#   sqr_mont_384x
#       The modulus pointer arrives in $b_ptr and n0 in $n_ptr (copied
#       to $n0).  Builds t0 = a->re + a->im and t1 = a->re - a->im on
#       the stack, computes ret->im as mul_mont(a->re, a->im) doubled
#       with one conditional subtraction of @mod, and
#       ret->re = mul_mont(t0, t1).
#
#   mul_mont_384
#       Stores n0 and r_ptr at [sp,#96], where __mul_mont_384 reloads
#       them, preloads @a, the first limb of b and @mod, calls
#       __mul_mont_384 and stores the six result limbs through $b_ptr
#       (__mul_mont_384 ends with r_ptr in $b_ptr, see its "pull r_ptr"
#       load).
#
#   __mul_mont_384 (first round only; the remaining rounds and the final
#   reduction are appended by the loop and here-document that follow)
#       Expects @a = a[0..5], $bi = b[0], @mod = modulus, $n0 = n0 and
#       $b_ptr pointing at b.  @tmp aliases x0, x1 and x3, so the
#       registers known as $r_ptr, $a_ptr and $n_ptr are overwritten.
#       The low limb of @mod[0]*$n0 is never computed (see the
#       commented-out mul); the code that follows recovers the carry it
#       would have produced.
# ---------------------------------------------------------------------
$code.=<<___;
.text

.globl	add_mod_384x384
.type	add_mod_384x384,%function
.align	5
add_mod_384x384:
	paciasp
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__add_mod_384x384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	autiasp
	ret
.size	add_mod_384x384,.-add_mod_384x384

.type	__add_mod_384x384,%function
.align	5
__add_mod_384x384:
	ldp	@a[0],  @a[1],  [$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],  @a[3],  [$a_ptr,#16]
	adds	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	adcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],  @a[5],  [$a_ptr,#32]
	adcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	adcs	@a[3],@a[3],@acc[3]
	 stp	@a[0],  @a[1],  [$r_ptr]
	adcs	@a[4],@a[4],@acc[4]
	 ldp	@a[0],  @a[1],  [$a_ptr,#48]
	adcs	@a[5],@a[5],@acc[5]

	 ldp	@acc[0],@acc[1],[$b_ptr,#48]
	 stp	@a[2],  @a[3],  [$r_ptr,#16]
	 ldp	@a[2],  @a[3],  [$a_ptr,#64]
	 ldp	@acc[2],@acc[3],[$b_ptr,#64]

	adcs	@a[0],@a[0],@acc[0]
	 stp	@a[4],  @a[5],  [$r_ptr,#32]
	adcs	@a[1],@a[1],@acc[1]
	 ldp	@a[4],  @a[5],  [$a_ptr,#80]
	adcs	@a[2],@a[2],@acc[2]
	 ldp	@acc[4],@acc[5],[$b_ptr,#80]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adcs	@a[5],@a[5],@acc[5]
	adc	$bi,xzr,xzr

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,$bi,xzr

	csel	@a[0],@a[0],@acc[0],lo
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	stp	@a[0],@a[1],[$r_ptr,#48]
	csel	@a[4],@a[4],@acc[4],lo
	stp	@a[2],@a[3],[$r_ptr,#64]
	csel	@a[5],@a[5],@acc[5],lo
	stp	@a[4],@a[5],[$r_ptr,#80]

	ret
.size	__add_mod_384x384,.-__add_mod_384x384

.globl	sub_mod_384x384
.type	sub_mod_384x384,%function
.align	5
sub_mod_384x384:
	paciasp
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__sub_mod_384x384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	autiasp
	ret
.size	sub_mod_384x384,.-sub_mod_384x384

.type	__sub_mod_384x384,%function
.align	5
__sub_mod_384x384:
	ldp	@a[0],  @a[1],  [$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],  @a[3],  [$a_ptr,#16]
	subs	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	sbcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],  @a[5],  [$a_ptr,#32]
	sbcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	sbcs	@a[3],@a[3],@acc[3]
	 stp	@a[0],  @a[1],  [$r_ptr]
	sbcs	@a[4],@a[4],@acc[4]
	 ldp	@a[0],  @a[1],  [$a_ptr,#48]
	sbcs	@a[5],@a[5],@acc[5]

	 ldp	@acc[0],@acc[1],[$b_ptr,#48]
	 stp	@a[2],  @a[3],  [$r_ptr,#16]
	 ldp	@a[2],  @a[3],  [$a_ptr,#64]
	 ldp	@acc[2],@acc[3],[$b_ptr,#64]

	sbcs	@a[0],@a[0],@acc[0]
	 stp	@a[4],  @a[5],  [$r_ptr,#32]
	sbcs	@a[1],@a[1],@acc[1]
	 ldp	@a[4],  @a[5],  [$a_ptr,#80]
	sbcs	@a[2],@a[2],@acc[2]
	 ldp	@acc[4],@acc[5],[$b_ptr,#80]
	sbcs	@a[3],@a[3],@acc[3]
	sbcs	@a[4],@a[4],@acc[4]
	sbcs	@a[5],@a[5],@acc[5]
	sbc	$bi,xzr,xzr

	 and	@acc[0],@mod[0],$bi
	 and	@acc[1],@mod[1],$bi
	adds	@a[0],@a[0],@acc[0]
	 and	@acc[2],@mod[2],$bi
	adcs	@a[1],@a[1],@acc[1]
	 and	@acc[3],@mod[3],$bi
	adcs	@a[2],@a[2],@acc[2]
	 and	@acc[4],@mod[4],$bi
	adcs	@a[3],@a[3],@acc[3]
	 and	@acc[5],@mod[5],$bi
	adcs	@a[4],@a[4],@acc[4]
	stp	@a[0],@a[1],[$r_ptr,#48]
	adc	@a[5],@a[5],@acc[5]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ret
.size	__sub_mod_384x384,.-__sub_mod_384x384

.type	__add_mod_384,%function
.align	5
__add_mod_384:
	ldp	@a[0],  @a[1],  [$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],  @a[3],  [$a_ptr,#16]
	adds	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	adcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],  @a[5],  [$a_ptr,#32]
	adcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adcs	@a[5],@a[5],@acc[5]
	adc	$bi,xzr,xzr

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,$bi,xzr

	csel	@a[0],@a[0],@acc[0],lo
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	csel	@a[4],@a[4],@acc[4],lo
	stp	@a[0],@a[1],[$r_ptr]
	csel	@a[5],@a[5],@acc[5],lo
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ret
.size	__add_mod_384,.-__add_mod_384

.type	__sub_mod_384,%function
.align	5
__sub_mod_384:
	ldp	@a[0],  @a[1],  [$a_ptr]
	ldp	@acc[0],@acc[1],[$b_ptr]
	ldp	@a[2],  @a[3],  [$a_ptr,#16]
	subs	@a[0],@a[0],@acc[0]
	ldp	@acc[2],@acc[3],[$b_ptr,#16]
	sbcs	@a[1],@a[1],@acc[1]
	ldp	@a[4],  @a[5],  [$a_ptr,#32]
	sbcs	@a[2],@a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$b_ptr,#32]
	sbcs	@a[3],@a[3],@acc[3]
	sbcs	@a[4],@a[4],@acc[4]
	sbcs	@a[5],@a[5],@acc[5]
	sbc	$bi,xzr,xzr

	 and	@acc[0],@mod[0],$bi
	 and	@acc[1],@mod[1],$bi
	adds	@a[0],@a[0],@acc[0]
	 and	@acc[2],@mod[2],$bi
	adcs	@a[1],@a[1],@acc[1]
	 and	@acc[3],@mod[3],$bi
	adcs	@a[2],@a[2],@acc[2]
	 and	@acc[4],@mod[4],$bi
	adcs	@a[3],@a[3],@acc[3]
	 and	@acc[5],@mod[5],$bi
	adcs	@a[4],@a[4],@acc[4]
	stp	@a[0],@a[1],[$r_ptr]
	adc	@a[5],@a[5],@acc[5]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ret
.size	__sub_mod_384,.-__sub_mod_384

.globl	mul_mont_384x
.hidden	mul_mont_384x
.type	mul_mont_384x,%function
.align	5
mul_mont_384x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#288		// space for 3 768-bit vectors

	mov	@tmp[0],$r_ptr		// save r_ptr
	mov	@tmp[1],$a_ptr		// save b_ptr
	mov	@tmp[2],$b_ptr		// save b_ptr

	sub	$r_ptr,sp,#0		// mul_384(t0, a->re, b->re)
	bl	__mul_384

	add	$a_ptr,$a_ptr,#48	// mul_384(t1, a->im, b->im)
	add	$b_ptr,$b_ptr,#48
	add	$r_ptr,sp,#96
	bl	__mul_384

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	sub	$b_ptr,$a_ptr,#48
	add	$r_ptr,sp,#240
	bl	__add_mod_384

	add	$a_ptr,@tmp[2],#0
	add	$b_ptr,@tmp[2],#48
	add	$r_ptr,sp,#192		// t2
	bl	__add_mod_384

	add	$a_ptr,$r_ptr,#0
	add	$b_ptr,$r_ptr,#48
	bl	__mul_384		// mul_384(t2, a->re+a->im, b->re+b->im)

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	mov	$a_ptr,$r_ptr
	add	$b_ptr,sp,#0
	bl	__sub_mod_384x384

	add	$b_ptr,sp,#96
	bl	__sub_mod_384x384	// t2 = t2-t0-t1

	add	$a_ptr,sp,#0
	add	$b_ptr,sp,#96
	add	$r_ptr,sp,#0
	bl	__sub_mod_384x384	// t0 = t0-t1

	add	$a_ptr,sp,#0		// ret->re = redc(t0)
	add	$r_ptr,@tmp[0],#0
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384

	add	$a_ptr,sp,#192		// ret->im = redc(t2)
	add	$r_ptr,$r_ptr,#48
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384
	ldr	x30,[x29,#8]

	add	sp,sp,#288
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_mont_384x,.-mul_mont_384x

.globl	sqr_mont_384x
.hidden	sqr_mont_384x
.type	sqr_mont_384x,%function
.align	5
sqr_mont_384x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n_ptr,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
	sub	sp,sp,#96		// space for 2 384-bit vectors
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	add	$b_ptr,$a_ptr,#48
	add	$r_ptr,sp,#0
	bl	__add_mod_384		// t0 = a->re + a->im

	add	$r_ptr,sp,#48
	bl	__sub_mod_384		// t1 = a->re - a->im

	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi,        [$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	bl	__mul_mont_384		// mul_mont_384(ret->im, a->re, a->im)

	adds	@a[0],@a[0],@a[0]	// add with itself
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	@acc[6],xzr,xzr

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,@acc[6],xzr

	csel	@acc[0],@a[0],@acc[0],lo
	csel	@acc[1],@a[1],@acc[1],lo
	csel	@acc[2],@a[2],@acc[2],lo
	 ldp	@a[0],@a[1],[sp]
	csel	@acc[3],@a[3],@acc[3],lo
	 ldr	$bi,        [sp,#48]
	csel	@acc[4],@a[4],@acc[4],lo
	 ldp	@a[2],@a[3],[sp,#16]
	csel	@acc[5],@a[5],@acc[5],lo
	 ldp	@a[4],@a[5],[sp,#32]

	stp	@acc[0],@acc[1],[$b_ptr,#48]
	stp	@acc[2],@acc[3],[$b_ptr,#64]
	stp	@acc[4],@acc[5],[$b_ptr,#80]

	add	$b_ptr,sp,#48
	bl	__mul_mont_384		// mul_mont_384(ret->re, t0, t1)
	ldr	x30,[x29,#8]

	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_mont_384x,.-sqr_mont_384x

.globl	mul_mont_384
.hidden	mul_mont_384
.type	mul_mont_384,%function
.align	5
mul_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n0,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there

	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi,        [$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__mul_mont_384
	ldr	x30,[x29,#8]

	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_mont_384,.-mul_mont_384

.type	__mul_mont_384,%function
.align	5
__mul_mont_384:
	mul	@acc[0],@a[0],$bi
	mul	@acc[1],@a[1],$bi
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi
	mul	@acc[4],@a[4],$bi
	mul	@acc[5],@a[5],$bi
	mul	$n0,$n0,@acc[0]

	 umulh	@tmp[0],@a[0],$bi
	 umulh	@tmp[1],@a[1],$bi
	 umulh	@tmp[2],@a[2],$bi
	 umulh	@tmp[3],@a[3],$bi
	 umulh	@tmp[4],@a[4],$bi
	 umulh	@tmp[5],@a[5],$bi

	 adds	@acc[1],@acc[1],@tmp[0]
	// mul	@tmp[0],@mod[0],$n0
	 adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	 adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	 adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	 adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	 adc	@acc[6],xzr,    @tmp[5]
	mul	@tmp[5],@mod[5],$n0
	 mov	$bi,xzr
___
# Rounds 1..5 of __mul_mont_384, emitted once per value of $i (the
# assembly is fully unrolled; $i only appears in the "8*$i" offset).
# Each emitted round:
#   1. completes the reduction step started by the previous round.
#      "subs xzr,@acc[0],#1" stands in for the commented-out
#      "adds @acc[0],@acc[0],@tmp[0]": it sets the carry flag exactly
#      when @acc[0] is non-zero, so the low product limb is not needed.
#      NOTE(review): this equals the carry of the real addition only if
#      the low limb of that sum is zero, presumably guaranteed by how
#      n0 is chosen - confirm against the callers.
#      The accumulator is then shifted down by one limb while the high
#      halves of @mod[0..5]*$n0 are added.
#   2. loads the next multiplier limb from [$b_ptr,8*$i] into $bi and
#      adds a[0..5]*$bi.
#   3. reloads n0 from [x29,#96] (the $n0 register was reused as a
#      carry word just above), forms $n0 = n0 * @acc[0] and starts the
#      next reduction step with the low halves of @mod[1..5]*$n0.
for ($i=1;$i<6;$i++) {
$code.=<<___;
	subs	xzr,@acc[0],#1		// adds	@acc[0],@acc[0],@tmp[0]
	 umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	 umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	 umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	 umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	 umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	 umulh	@tmp[5],@mod[5],$n0
	adcs	@acc[6],@acc[6],xzr
	adc	$n0,$bi,xzr
	ldr	$bi,[$b_ptr,8*$i]

	 adds	@acc[0],@acc[1],@tmp[0]
	mul	@tmp[0],@a[0],$bi
	 adcs	@acc[1],@acc[2],@tmp[1]
	mul	@tmp[1],@a[1],$bi
	 adcs	@acc[2],@acc[3],@tmp[2]
	mul	@tmp[2],@a[2],$bi
	 adcs	@acc[3],@acc[4],@tmp[3]
	mul	@tmp[3],@a[3],$bi
	 adcs	@acc[4],@acc[5],@tmp[4]
	mul	@tmp[4],@a[4],$bi
	 adcs	@acc[5],@acc[6],@tmp[5]
	mul	@tmp[5],@a[5],$bi
	 adc	@acc[6],$n0,xzr
	ldr	$n0,[x29,#96]

	adds	@acc[0],@acc[0],@tmp[0]
	 umulh	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@tmp[1]
	 umulh	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@tmp[2]
	mul	$n0,$n0,@acc[0]
	 umulh	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@tmp[3]
	 umulh	@tmp[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@tmp[4]
	 umulh	@tmp[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@tmp[5]
	 umulh	@tmp[5],@a[5],$bi
	adcs	@acc[6],@acc[6],xzr
	adc	$bi,xzr,xzr

	 adds	@acc[1],@acc[1],@tmp[0]
	// mul	@tmp[0],@mod[0],$n0
	 adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	 adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	 adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	 adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	 adcs	@acc[6],@acc[6],@tmp[5]
	mul	@tmp[5],@mod[5],$n0
	 adc	$bi,$bi,xzr
___
}
# ---------------------------------------------------------------------
# Assembly template: end of __mul_mont_384, then sqr_mont_384 and
# sqr_n_mul_mont_383.
#
#   __mul_mont_384 (tail)
#       Completes the last reduction step, reloads n0 into $n0 and the
#       saved r_ptr into $b_ptr from [x29,#96] ("pull r_ptr"), then
#       subtracts @mod once and uses "csel ...,lo" to keep the
#       unsubtracted value if that borrows.  The six result limbs are
#       returned in @a; storing them is left to the caller.
#
#   sqr_mont_384
#       The modulus pointer arrives in $b_ptr and n0 in $n_ptr (copied
#       to $n0); $n_ptr is then reused to hold r_ptr.  Squares the six
#       limbs at [$a_ptr] into a 96-byte stack buffer with __sqr_384,
#       loads @mod only afterwards (__sqr_384 uses @mod as scratch) and
#       reduces with __mul_by_1_mont_384 + __redc_tail_mont_384, which
#       stores to the restored $r_ptr.
#
#   sqr_n_mul_mont_383
#       $b_ptr carries a repeat count and x5 the pointer to the final
#       multiplier (copied to $bi before @mod, which starts at x5, is
#       loaded).  Each loop pass squares @a into the stack buffer,
#       reloads @mod, runs __mul_by_1_mont_384 and adds the upper half
#       of the square without a conditional subtraction ("just
#       accumulate upper half").  After the loop the value in @a is
#       multiplied by the operand at the saved pointer through
#       __mul_mont_384 and stored through $b_ptr.
#       NOTE(review): the count is decremented before the cbnz test, so
#       a count of 0 is not handled - presumably callers always pass at
#       least 1; confirm.
# ---------------------------------------------------------------------
$code.=<<___;
	subs	xzr,@acc[0],#1		// adds	@acc[0],@acc[0],@tmp[0]
	 umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	 umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	 umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	 umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	 umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	 umulh	@tmp[5],@mod[5],$n0
	adcs	@acc[6],@acc[6],xzr
	 ldp	$n0,$b_ptr,[x29,#96]	// pull r_ptr
	adc	$bi,$bi,xzr

	 adds	@acc[0],@acc[1],@tmp[0]
	 adcs	@acc[1],@acc[2],@tmp[1]
	 adcs	@acc[2],@acc[3],@tmp[2]
	 adcs	@acc[3],@acc[4],@tmp[3]
	 adcs	@acc[4],@acc[5],@tmp[4]
	 adcs	@acc[5],@acc[6],@tmp[5]
	 adc	@acc[6],$bi,xzr

	subs	@tmp[0],@acc[0],@mod[0]
	sbcs	@tmp[1],@acc[1],@mod[1]
	sbcs	@tmp[2],@acc[2],@mod[2]
	sbcs	@tmp[3],@acc[3],@mod[3]
	sbcs	@tmp[4],@acc[4],@mod[4]
	sbcs	@tmp[5],@acc[5],@mod[5]
	sbcs	xzr,    @acc[6],xzr

	csel	@a[0],@acc[0],@tmp[0],lo
	csel	@a[1],@acc[1],@tmp[1],lo
	csel	@a[2],@acc[2],@tmp[2],lo
	csel	@a[3],@acc[3],@tmp[3],lo
	csel	@a[4],@acc[4],@tmp[4],lo
	csel	@a[5],@acc[5],@tmp[5],lo
	ret
.size	__mul_mont_384,.-__mul_mont_384

.globl	sqr_mont_384
.hidden	sqr_mont_384
.type	sqr_mont_384,%function
.align	5
sqr_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#96		// space for 768-bit vector
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	mov	$n_ptr,$r_ptr		// save r_ptr
	mov	$r_ptr,sp

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	bl	__sqr_384

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	mov	$a_ptr,sp
	mov	$r_ptr,$n_ptr		// restore r_ptr
	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384
	ldr	x30,[x29,#8]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_mont_384,.-sqr_mont_384

.globl	sqr_n_mul_mont_383
.hidden	sqr_n_mul_mont_383
.type	sqr_n_mul_mont_383,%function
.align	5
sqr_n_mul_mont_383:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$n0,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
	sub	sp,sp,#96		// space for 768-bit vector
	mov	$bi,x5			// save b_ptr

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	mov	$r_ptr,sp
.Loop_sqr_383:
	bl	__sqr_384
	sub	$b_ptr,$b_ptr,#1	// counter

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	mov	$a_ptr,sp
	bl	__mul_by_1_mont_384

	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]

	adds	@a[0],@a[0],@acc[0]	// just accumulate upper half
	adcs	@a[1],@a[1],@acc[1]
	adcs	@a[2],@a[2],@acc[2]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adc	@a[5],@a[5],@acc[5]

	cbnz	$b_ptr,.Loop_sqr_383

	mov	$b_ptr,$bi
	ldr	$bi,[$bi]
	bl	__mul_mont_384
	ldr	x30,[x29,#8]

	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_n_mul_mont_383,.-sqr_n_mul_mont_383
___
# ---------------------------------------------------------------------
# __sqr_384: squares the six limbs held in @a and stores the 12-limb
# product at [$r_ptr].
#
# The bare block gives this template a private, longer @acc: the seven
# file-level accumulator registers followed by @tmp[0..2] (x26-x28),
# ten names in total.  The file-level @acc is untouched outside the
# block.
#
# Outline of the emitted code:
#   - the cross products a[i]*a[j] with i < j are summed into
#     @acc[0..9], using @mod[..] as scratch for the partial products
#     (so any modulus held in @mod is destroyed; callers load @mod after
#     the call);
#   - the whole sum is doubled with an adds/adcs chain, the bit shifted
#     out of the top being kept in $a_ptr, which is therefore clobbered;
#   - the squares a[i]*a[i] are added in: low halves come from @mod[i],
#     high halves overwrite @a[i].
# The lowest result limb is @mod[0] (low half of a[0]*a[0]) and the
# highest is left in @a[5].
# ---------------------------------------------------------------------
{
my @acc=(@acc,@tmp[0..2]);

$code.=<<___;
.type	__sqr_384,%function
.align	5
__sqr_384:
	mul	@acc[0],@a[1],@a[0]
	mul	@acc[1],@a[2],@a[0]
	mul	@acc[2],@a[3],@a[0]
	mul	@acc[3],@a[4],@a[0]
	mul	@acc[4],@a[5],@a[0]

	 umulh	@mod[1],@a[1],@a[0]
	 umulh	@mod[2],@a[2],@a[0]
	 umulh	@mod[3],@a[3],@a[0]
	 umulh	@mod[4],@a[4],@a[0]
	 adds	@acc[1],@acc[1],@mod[1]
	 umulh	@mod[5],@a[5],@a[0]
	 adcs	@acc[2],@acc[2],@mod[2]
	mul	@mod[2],@a[2],@a[1]
	 adcs	@acc[3],@acc[3],@mod[3]
	mul	@mod[3],@a[3],@a[1]
	 adcs	@acc[4],@acc[4],@mod[4]
	mul	@mod[4],@a[4],@a[1]
	 adc	@acc[5],xzr,    @mod[5]
	mul	@mod[5],@a[5],@a[1]

	adds	@acc[2],@acc[2],@mod[2]
	 umulh	@mod[2],@a[2],@a[1]
	adcs	@acc[3],@acc[3],@mod[3]
	 umulh	@mod[3],@a[3],@a[1]
	adcs	@acc[4],@acc[4],@mod[4]
	 umulh	@mod[4],@a[4],@a[1]
	adcs	@acc[5],@acc[5],@mod[5]
	 umulh	@mod[5],@a[5],@a[1]
	adc	@acc[6],xzr,xzr

	  mul	@mod[0],@a[0],@a[0]
	 adds	@acc[3],@acc[3],@mod[2]
	  umulh	@a[0],  @a[0],@a[0]
	 adcs	@acc[4],@acc[4],@mod[3]
	mul	@mod[3],@a[3],@a[2]
	 adcs	@acc[5],@acc[5],@mod[4]
	mul	@mod[4],@a[4],@a[2]
	 adc	@acc[6],@acc[6],@mod[5]
	mul	@mod[5],@a[5],@a[2]

	adds	@acc[4],@acc[4],@mod[3]
	 umulh	@mod[3],@a[3],@a[2]
	adcs	@acc[5],@acc[5],@mod[4]
	 umulh	@mod[4],@a[4],@a[2]
	adcs	@acc[6],@acc[6],@mod[5]
	 umulh	@mod[5],@a[5],@a[2]
	adc	@acc[7],xzr,xzr

	  mul	@mod[1],@a[1],@a[1]
	 adds	@acc[5],@acc[5],@mod[3]
	  umulh	@a[1],  @a[1],@a[1]
	 adcs	@acc[6],@acc[6],@mod[4]
	mul	@mod[4],@a[4],@a[3]
	 adc	@acc[7],@acc[7],@mod[5]
	mul	@mod[5],@a[5],@a[3]

	adds	@acc[6],@acc[6],@mod[4]
	 umulh	@mod[4],@a[4],@a[3]
	adcs	@acc[7],@acc[7],@mod[5]
	 umulh	@mod[5],@a[5],@a[3]
	adc	@acc[8],xzr,xzr
	  mul	@mod[2],@a[2],@a[2]
	 adds	@acc[7],@acc[7],@mod[4]
	  umulh	@a[2],  @a[2],@a[2]
	 adc	@acc[8],@acc[8],@mod[5]
	  mul	@mod[3],@a[3],@a[3]

	mul	@mod[5],@a[5],@a[4]
	  umulh	@a[3],  @a[3],@a[3]
	adds	@acc[8],@acc[8],@mod[5]
	 umulh	@mod[5],@a[5],@a[4]
	  mul	@mod[4],@a[4],@a[4]
	adc	@acc[9],@mod[5],xzr

	adds	@acc[0],@acc[0],@acc[0]
	adcs	@acc[1],@acc[1],@acc[1]
	adcs	@acc[2],@acc[2],@acc[2]
	adcs	@acc[3],@acc[3],@acc[3]
	adcs	@acc[4],@acc[4],@acc[4]
	adcs	@acc[5],@acc[5],@acc[5]
	adcs	@acc[6],@acc[6],@acc[6]
	adcs	@acc[7],@acc[7],@acc[7]
	  umulh	@a[4],  @a[4],@a[4]
	adcs	@acc[8],@acc[8],@acc[8]
	  mul	@mod[5],@a[5],@a[5]
	adcs	@acc[9],@acc[9],@acc[9]
	  umulh	@a[5],  @a[5],@a[5]
	adc	$a_ptr,xzr,xzr

	adds	@acc[0],@acc[0],@a[0]
	adcs	@acc[1],@acc[1],@mod[1]
	adcs	@acc[2],@acc[2],@a[1]
	adcs	@acc[3],@acc[3],@mod[2]
	adcs	@acc[4],@acc[4],@a[2]
	adcs	@acc[5],@acc[5],@mod[3]
	adcs	@acc[6],@acc[6],@a[3]
	stp	@mod[0],@acc[0],[$r_ptr]
	adcs	@acc[7],@acc[7],@mod[4]
	stp	@acc[1],@acc[2],[$r_ptr,#16]
	adcs	@acc[8],@acc[8],@a[4]
	stp	@acc[3],@acc[4],[$r_ptr,#32]
	adcs	@acc[9],@acc[9],@mod[5]
	stp	@acc[5],@acc[6],[$r_ptr,#48]
	adc	@a[5],@a[5],$a_ptr
	stp	@acc[7],@acc[8],[$r_ptr,#64]
	stp	@acc[9],@a[5],[$r_ptr,#80]

	ret
.size	__sqr_384,.-__sqr_384
___
}
# ---------------------------------------------------------------------
# Assembly template: sqr_384, redc_mont_384, from_mont_384 and the
# first round of __mul_by_1_mont_384.
#
#   sqr_384
#       Loads six limbs from [$a_ptr] into @a and calls __sqr_384, which
#       writes the 12-limb square to [$r_ptr].
#
#   redc_mont_384
#       The modulus pointer arrives in $b_ptr and n0 in $n_ptr (copied
#       to $n0).  Runs __mul_by_1_mont_384 on the low six limbs at
#       [$a_ptr], then __redc_tail_mont_384, which adds the upper six
#       limbs at [$a_ptr,#48], conditionally subtracts @mod and stores
#       six limbs to [$r_ptr].
#
#   from_mont_384
#       Same argument layout.  Runs __mul_by_1_mont_384 only, then
#       subtracts @mod once (six limbs, no extra carry word) and keeps
#       the unsubtracted value when the subtraction borrows; result at
#       [$r_ptr].
#
#   __mul_by_1_mont_384 (first round; five more rounds and the final
#   accumulation are appended by the loop and here-document that follow)
#       Loads six limbs from [$a_ptr] into @a and uses $n0 and @mod.
#       Each round forms @tmp[0] = n0 * @a[0], adds @mod * @tmp[0] and
#       drops the lowest limb.  As in __mul_mont_384, the low limb of
#       @mod[0]*@tmp[0] is not computed; "subs xzr,@a[0],#1" supplies
#       the carry the commented-out adds would have produced.
#       @tmp[0] is x26 and @acc is x19-x25, so those are clobbered.
# ---------------------------------------------------------------------
$code.=<<___;
.globl	sqr_384
.hidden	sqr_384
.type	sqr_384,%function
.align	5
sqr_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	bl	__sqr_384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_384,.-sqr_384

.globl	redc_mont_384
.hidden	redc_mont_384
.type	redc_mont_384,%function
.align	5
redc_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__mul_by_1_mont_384
	bl	__redc_tail_mont_384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	redc_mont_384,.-redc_mont_384

.globl	from_mont_384
.hidden	from_mont_384
.type	from_mont_384,%function
.align	5
from_mont_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__mul_by_1_mont_384
	ldr	x30,[x29,#8]

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]

	csel	@a[0],@a[0],@acc[0],lo
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	csel	@a[4],@a[4],@acc[4],lo
	csel	@a[5],@a[5],@acc[5],lo

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	from_mont_384,.-from_mont_384

.type	__mul_by_1_mont_384,%function
.align	5
__mul_by_1_mont_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	mul	@tmp[0],$n0,@a[0]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	// mul	@acc[0],@mod[0],@tmp[0]
	mul	@acc[1],@mod[1],@tmp[0]
	mul	@acc[2],@mod[2],@tmp[0]
	mul	@acc[3],@mod[3],@tmp[0]
	mul	@acc[4],@mod[4],@tmp[0]
	mul	@acc[5],@mod[5],@tmp[0]
	subs	xzr,@a[0],#1		// adds	@acc[0],@acc[0],@a[0]
	 umulh	@a[0],@mod[0],@tmp[0]
	adcs	@acc[1],@acc[1],@a[1]
	 umulh	@a[1],@mod[1],@tmp[0]
	adcs	@acc[2],@acc[2],@a[2]
	 umulh	@a[2],@mod[2],@tmp[0]
	adcs	@acc[3],@acc[3],@a[3]
	 umulh	@a[3],@mod[3],@tmp[0]
	adcs	@acc[4],@acc[4],@a[4]
	 umulh	@a[4],@mod[4],@tmp[0]
	adcs	@acc[5],@acc[5],@a[5]
	 umulh	@a[5],@mod[5],@tmp[0]
	adc	@acc[6],xzr,xzr
___
# Rounds 2..6 of __mul_by_1_mont_384: the same text is emitted five
# times (the body never references $i, the loop is only a repeat count).
# Each emitted round first folds the previous round's high product
# halves (now in @a) into the shifted accumulator, then computes the new
# multiplier @tmp[0] = n0 * @a[0] and adds @mod * @tmp[0] exactly as the
# first round does.
for ($i=1;$i<6;$i++) {
$code.=<<___;
	 adds	@a[0],@a[0],@acc[1]
	 adcs	@a[1],@a[1],@acc[2]
	 adcs	@a[2],@a[2],@acc[3]
	mul	@tmp[0],$n0,@a[0]
	 adcs	@a[3],@a[3],@acc[4]
	 adcs	@a[4],@a[4],@acc[5]
	 adc	@a[5],@a[5],@acc[6]

	// mul	@acc[0],@mod[0],@tmp[0]
	mul	@acc[1],@mod[1],@tmp[0]
	mul	@acc[2],@mod[2],@tmp[0]
	mul	@acc[3],@mod[3],@tmp[0]
	mul	@acc[4],@mod[4],@tmp[0]
	mul	@acc[5],@mod[5],@tmp[0]
	subs	xzr,@a[0],#1		// adds	@acc[0],@acc[0],@a[0]
	 umulh	@a[0],@mod[0],@tmp[0]
	adcs	@acc[1],@acc[1],@a[1]
	 umulh	@a[1],@mod[1],@tmp[0]
	adcs	@acc[2],@acc[2],@a[2]
	 umulh	@a[2],@mod[2],@tmp[0]
	adcs	@acc[3],@acc[3],@a[3]
	 umulh	@a[3],@mod[3],@tmp[0]
	adcs	@acc[4],@acc[4],@a[4]
	 umulh	@a[4],@mod[4],@tmp[0]
	adcs	@acc[5],@acc[5],@a[5]
	 umulh	@a[5],@mod[5],@tmp[0]
	adc	@acc[6],xzr,xzr
___
}
# ---------------------------------------------------------------------
# Assembly template: end of __mul_by_1_mont_384, then
# __redc_tail_mont_384, mul_384 and the first round of __mul_384.
#
#   __mul_by_1_mont_384 (tail)
#       Folds the last round's high product halves into the accumulator
#       and returns the six limbs in @a.  Nothing is stored and no
#       final subtraction of @mod is done here.
#
#   __redc_tail_mont_384
#       Expects @a from __mul_by_1_mont_384, @mod loaded and $a_ptr
#       still pointing at the 12-limb input.  Adds the upper six limbs
#       at [$a_ptr,#48] (carry kept in @acc[6]), subtracts @mod once,
#       keeps the unsubtracted value if that borrows, and stores six
#       limbs to [$r_ptr].
#
#   mul_384
#       Frame set-up around __mul_384; arguments are passed through
#       untouched in $r_ptr, $a_ptr and $b_ptr.
#
#   __mul_384 (first round; the remaining rounds come from the loop and
#   here-document that follow)
#       Plain 6x6-limb multiplication of [$a_ptr] by [$b_ptr].  All six
#       limbs of a are loaded up front, b is read one limb per round
#       and one result limb is stored to [$r_ptr] per round.  @mod is
#       used as scratch for the product halves, so a modulus held there
#       is destroyed (callers reload @mod after the call).
# ---------------------------------------------------------------------
$code.=<<___;
	adds	@a[0],@a[0],@acc[1]
	adcs	@a[1],@a[1],@acc[2]
	adcs	@a[2],@a[2],@acc[3]
	adcs	@a[3],@a[3],@acc[4]
	adcs	@a[4],@a[4],@acc[5]
	adc	@a[5],@a[5],@acc[6]

	ret
.size	__mul_by_1_mont_384,.-__mul_by_1_mont_384

.type	__redc_tail_mont_384,%function
.align	5
__redc_tail_mont_384:
	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]

	adds	@a[0],@a[0],@acc[0]	// accumulate upper half
	adcs	@a[1],@a[1],@acc[1]
	adcs	@a[2],@a[2],@acc[2]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adcs	@a[5],@a[5],@acc[5]
	adc	@acc[6],xzr,xzr

	subs	@acc[0],@a[0],@mod[0]
	sbcs	@acc[1],@a[1],@mod[1]
	sbcs	@acc[2],@a[2],@mod[2]
	sbcs	@acc[3],@a[3],@mod[3]
	sbcs	@acc[4],@a[4],@mod[4]
	sbcs	@acc[5],@a[5],@mod[5]
	sbcs	xzr,@acc[6],xzr

	csel	@a[0],@a[0],@acc[0],lo
	csel	@a[1],@a[1],@acc[1],lo
	csel	@a[2],@a[2],@acc[2],lo
	csel	@a[3],@a[3],@acc[3],lo
	csel	@a[4],@a[4],@acc[4],lo
	csel	@a[5],@a[5],@acc[5],lo

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ret
.size	__redc_tail_mont_384,.-__redc_tail_mont_384

.globl	mul_384
.hidden	mul_384
.type	mul_384,%function
.align	5
mul_384:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	bl	__mul_384
	ldr	x30,[x29,#8]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_384,.-mul_384

.type	__mul_384,%function
.align	5
__mul_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldr	$bi,        [$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	mul	@acc[0],@a[0],$bi
	mul	@acc[1],@a[1],$bi
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi
	mul	@acc[4],@a[4],$bi
	mul	@acc[5],@a[5],$bi

	 umulh	@mod[0],@a[0],$bi
	 umulh	@mod[1],@a[1],$bi
	 umulh	@mod[2],@a[2],$bi
	 umulh	@mod[3],@a[3],$bi
	 umulh	@mod[4],@a[4],$bi
	 umulh	@mod[5],@a[5],$bi
	ldr	$bi,[$b_ptr,8*1]

	str	@acc[0],[$r_ptr]
	 adds	@acc[0],@acc[1],@mod[0]
	mul	@mod[0],@a[0],$bi
	 adcs	@acc[1],@acc[2],@mod[1]
	mul	@mod[1],@a[1],$bi
	 adcs	@acc[2],@acc[3],@mod[2]
	mul	@mod[2],@a[2],$bi
	 adcs	@acc[3],@acc[4],@mod[3]
	mul	@mod[3],@a[3],$bi
	 adcs	@acc[4],@acc[5],@mod[4]
	mul	@mod[4],@a[4],$bi
	 adc	@acc[5],xzr,    @mod[5]
	mul	@mod[5],@a[5],$bi
___
# Rounds 1..4 of __mul_384, unrolled.  Each emitted round adds the low
# halves of a[0..5]*b[$i] (already sitting in @mod), computes the high
# halves, preloads the next multiplier limb b[$i+1], stores result limb
# $i to [$r_ptr,8*$i] and shifts the accumulator down one limb while
# starting the low-half products for the next round.
#
# Keep this as a C-style loop over the global $i: the here-document
# that follows uses "8*$i" for the last single-limb store and relies on
# $i being 5 once the loop has finished.
for ($i=1;$i<5;$i++) {
$code.=<<___;
	adds	@acc[0],@acc[0],@mod[0]
	 umulh	@mod[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@mod[1]
	 umulh	@mod[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@mod[2]
	 umulh	@mod[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@mod[3]
	 umulh	@mod[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@mod[4]
	 umulh	@mod[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@mod[5]
	 umulh	@mod[5],@a[5],$bi
	ldr	$bi,[$b_ptr,#8*($i+1)]
	adc	@acc[6],xzr,xzr

	str	@acc[0],[$r_ptr,8*$i]
	 adds	@acc[0],@acc[1],@mod[0]
	mul	@mod[0],@a[0],$bi
	 adcs	@acc[1],@acc[2],@mod[1]
	mul	@mod[1],@a[1],$bi
	 adcs	@acc[2],@acc[3],@mod[2]
	mul	@mod[2],@a[2],$bi
	 adcs	@acc[3],@acc[4],@mod[3]
	mul	@mod[3],@a[3],$bi
	 adcs	@acc[4],@acc[5],@mod[4]
	mul	@mod[4],@a[4],$bi
	 adc	@acc[5],@acc[6],@mod[5]
	mul	@mod[5],@a[5],$bi
___
}
$code.=<<___;
	adds	@acc[0],@acc[0],@mod[0]
	 umulh	@mod[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@mod[1]
	 umulh	@mod[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@mod[2]
	 umulh	@mod[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@mod[3]
	 umulh	@mod[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@mod[4]
	 umulh	@mod[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@mod[5]
	 umulh	@mod[5],@a[5],$bi
	adc	@acc[6],xzr,xzr

	str	@acc[0],[$r_ptr,8*$i]
	 adds	@acc[0],@acc[1],@mod[0]
	 adcs	@acc[1],@acc[2],@mod[1]
	 adcs	@acc[2],@acc[3],@mod[2]
	 adcs	@acc[3],@acc[4],@mod[3]
	 adcs	@acc[4],@acc[5],@mod[4]
	 adc	@acc[5],@acc[6],@mod[5]

	stp	@acc[0],@acc[1],[$r_ptr,#48]
	stp	@acc[2],@acc[3],[$r_ptr,#64]
	stp	@acc[4],@acc[5],[$r_ptr,#80]

	ret
.size	__mul_384,.-__mul_384

.globl	mul_382x
.hidden	mul_382x
.type	mul_382x,%function
.align	5
mul_382x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#96		// space for two 384-bit vectors

	ldp	@a[0],@a[1],[$a_ptr]
	mov	@tmp[0],$r_ptr		// save r_ptr
	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	mov	@tmp[1],$a_ptr		// save a_ptr
	ldp	@a[2],@a[3],[$a_ptr,#16]
	mov	@tmp[2],$b_ptr		// save b_ptr
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	adds	@mod[0],$a[0],@acc[0]	// t0 = a->re + a->im
	ldp	@acc[4],@acc[5],[$a_ptr,#80]
	adcs	@mod[1],$a[1],@acc[1]
	 ldp	@a[0],@a[1],[$b_ptr]
	adcs	@mod[2],$a[2],@acc[2]
	 ldp	@acc[0],@acc[1],[$b_ptr,#48]
	adcs	@mod[3],$a[3],@acc[3]
	 ldp	@a[2],@a[3],[$b_ptr,#16]
	adcs	@mod[4],$a[4],@acc[4]
	 ldp	@acc[2],@acc[3],[$b_ptr,#64]
	adc	@mod[5],$a[5],@acc[5]
	 ldp	@a[4],@a[5],[$b_ptr,#32]

	stp	@mod[0],@mod[1],[sp]
	 adds	@mod[0],$a[0],@acc[0]	// t1 = b->re + b->im
	 ldp	@acc[4],@acc[5],[$b_ptr,#80]
	 adcs	@mod[1],$a[1],@acc[1]
	stp	@mod[2],@mod[3],[sp,#16]
	 adcs	@mod[2],$a[2],@acc[2]
	 adcs	@mod[3],$a[3],@acc[3]
	 stp	@mod[4],@mod[5],[sp,#32]
	 adcs	@mod[4],$a[4],@acc[4]
	 stp	@mod[0],@mod[1],[sp,#48]
	 adc	@mod[5],$a[5],@acc[5]
	 stp	@mod[2],@mod[3],[sp,#64]
	 stp	@mod[4],@mod[5],[sp,#80]

	bl	__mul_384		// mul_384(ret->re, a->re, b->re)

	add	$a_ptr,sp,#0		// mul_384(ret->im, t0, t1)
	add	$b_ptr,sp,#48
	add	$r_ptr,@tmp[0],#96
	bl	__mul_384

	add	$a_ptr,@tmp[1],#48	// mul_384(tx, a->im, b->im)
	add	$b_ptr,@tmp[2],#48
	add	$r_ptr,sp,#0
	bl	__mul_384

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	add	$a_ptr,@tmp[0],#96	// ret->im -= tx
	add	$b_ptr,sp,#0
	add	$r_ptr,@tmp[0],#96
	bl	__sub_mod_384x384

	add	$b_ptr,@tmp[0],#0	// ret->im -= ret->re
	bl	__sub_mod_384x384

	add	$a_ptr,@tmp[0],#0	// ret->re -= tx
	add	$b_ptr,sp,#0
	add	$r_ptr,@tmp[0],#0
	bl	__sub_mod_384x384
	ldr	x30,[x29,#8]

	add	sp,sp,#96
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	mul_382x,.-mul_382x

.globl	sqr_382x
.hidden	sqr_382x
.type	sqr_382x,%function
.align	5
sqr_382x:
	paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@acc[0],@acc[1],[$a_ptr,#48]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	adds	@mod[0],$a[0],@acc[0]	// t0 = a->re + a->im
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	adcs	@mod[1],$a[1],@acc[1]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	adcs	@mod[2],$a[2],@acc[2]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]
	adcs	@mod[3],$a[3],@acc[3]
	stp	@mod[0],@mod[1],[$r_ptr]
	adcs	@mod[4],$a[4],@acc[4]
	 ldp	@mod[0],@mod[1],[$b_ptr]
	adc	@mod[5],$a[5],@acc[5]
	stp	@mod[2],@mod[3],[$r_ptr,#16]

	subs	@a[0],$a[0],@acc[0]	// t1 = a->re - a->im
	 ldp	@mod[2],@mod[3],[$b_ptr,#16]
	sbcs	@a[1],$a[1],@acc[1]
	stp	@mod[4],@mod[5],[$r_ptr,#32]
	sbcs	@a[2],$a[2],@acc[2]
	 ldp	@mod[4],@mod[5],[$b_ptr,#32]
	sbcs	@a[3],$a[3],@acc[3]
	sbcs	@a[4],$a[4],@acc[4]
	sbcs	@a[5],$a[5],@acc[5]
	sbc	@acc[6],xzr,xzr

	 and	@acc[0],@mod[0],@acc[6]
	 and	@acc[1],@mod[1],@acc[6]
	adds	@a[0],@a[0],@acc[0]
	 and	@acc[2],@mod[2],@acc[6]
	adcs	@a[1],@a[1],@acc[1]
	 and	@acc[3],@mod[3],@acc[6]
	adcs	@a[2],@a[2],@acc[2]
	 and	@acc[4],@mod[4],@acc[6]
	adcs	@a[3],@a[3],@acc[3]
	 and	@acc[5],@mod[5],@acc[6]
	adcs	@a[4],@a[4],@acc[4]
	stp	@a[0],@a[1],[$r_ptr,#48]
	adc	@a[5],@a[5],@acc[5]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	mov	$n0,$a_ptr		// save a_ptr
	add	$a_ptr,$r_ptr,#0	// mul_384(ret->re, t0, t1)
	add	$b_ptr,$r_ptr,#48
	bl	__mul_384

	add	$a_ptr,$n0,#0		// mul_384(ret->im, a->re, a->im)
	add	$b_ptr,$n0,#48
	add	$r_ptr,$r_ptr,#96
	bl	__mul_384
	ldr	x30,[x29,#8]

	ldp	@a[0],@a[1],[$r_ptr]
	ldp	@a[2],@a[3],[$r_ptr,#16]
	adds	@a[0],@a[0],@a[0]	// add with itself
	ldp	@a[4],@a[5],[$r_ptr,#32]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adcs	@acc[0],@acc[0],@acc[0]
	adcs	@acc[1],@acc[1],@acc[1]
	stp	@a[0],@a[1],[$r_ptr]
	adcs	@acc[2],@acc[2],@acc[2]
	stp	@a[2],@a[3],[$r_ptr,#16]
	adcs	@acc[3],@acc[3],@acc[3]
	stp	@a[4],@a[5],[$r_ptr,#32]
	adcs	@acc[4],@acc[4],@acc[4]
	stp	@acc[0],@acc[1],[$r_ptr,#48]
	adc	@acc[5],@acc[5],@acc[5]
	stp	@acc[2],@acc[3],[$r_ptr,#64]
	stp	@acc[4],@acc[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_382x,.-sqr_382x

// sqr_mont_382x: Montgomery squaring of a two-component value a = (re, im),
// each component being six 64-bit words.
//   x0 - result pointer: re half at +0, im half at +48
//   x1 - input pointer:  a->re at +0, a->im at +48
//   x2 - pointer to the six modulus words
//   x3 - n0 word used by the reduction steps of the multiplication
// Computes ret->im = 2*(a->re * a->im) and
//          ret->re = (a->re + a->im)*(a->re - a->im),
// both products going through __mul_mont_383_nonred.
// NOTE(review): the sum, the difference and the doubling are not reduced
// modulo the modulus here; presumably the 382-bit input bound leaves the
// needed headroom -- confirm against the callers.
.globl	sqr_mont_382x
.hidden	sqr_mont_382x
.type	sqr_mont_382x,%function
.align	5
sqr_mont_382x:
	paciasp
	// 128-byte frame: x29/x30, x19-x28, then two words at +96 and +104
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	// __mul_mont_383_nonred reads this pair through x29: the word at +96
	// is reloaded as n0, the word at +104 is handed back as result pointer
	stp	$n_ptr,$r_ptr,[sp,#96]	// __mul_mont_384 wants them there
	sub	sp,sp,#112		// space for two 384-bit vectors + word
	mov	$n0,$n_ptr		// adjust for missing b_ptr

	// a->re goes into the multiplicand registers
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	// a->im: word 0 lands directly in the multiplier register, which is
	// where the multiplication below expects it
	ldp	$bi,@acc[1],[$a_ptr,#48]
	ldp	@acc[2],@acc[3],[$a_ptr,#64]
	ldp	@acc[4],@acc[5],[$a_ptr,#80]

	// the modulus registers are still free and serve as scratch for t0
	adds	@mod[0],$a[0],$bi	// t0 = a->re + a->im
	adcs	@mod[1],$a[1],@acc[1]
	adcs	@mod[2],$a[2],@acc[2]
	adcs	@mod[3],$a[3],@acc[3]
	adcs	@mod[4],$a[4],@acc[4]
	adc	@mod[5],$a[5],@acc[5]

	subs	@acc[0],$a[0],$bi	// t1 = a->re - a->im
	sbcs	@acc[1],$a[1],@acc[1]
	sbcs	@acc[2],$a[2],@acc[2]
	sbcs	@acc[3],$a[3],@acc[3]
	sbcs	@acc[4],$a[4],@acc[4]
	sbcs	@acc[5],$a[5],@acc[5]
	sbc	@acc[6],xzr,xzr		// borrow flag as mask

	// scratch layout: t0 at sp+0, t1 at sp+48, borrow mask at sp+96
	stp	@mod[0],@mod[1],[sp]
	stp	@mod[2],@mod[3],[sp,#16]
	stp	@mod[4],@mod[5],[sp,#32]
	stp	@acc[0],@acc[1],[sp,#48]
	stp	@acc[2],@acc[3],[sp,#64]
	stp	@acc[4],@acc[5],[sp,#80]
	str	@acc[6],[sp,#96]

	// now load the actual modulus from the third argument
	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	// multiplier = a->im; the callee fetches words 1..5 from memory
	add	$b_ptr,$a_ptr,#48
	bl	__mul_mont_383_nonred	// mul_mont_384(ret->im, a->re, a->im)

	// the product is in a[0..5]; double it
	adds	@acc[0],@a[0],@a[0]	// add with itself
	adcs	@acc[1],@a[1],@a[1]
	adcs	@acc[2],@a[2],@a[2]
	adcs	@acc[3],@a[3],@a[3]
	adcs	@acc[4],@a[4],@a[4]
	adc	@acc[5],@a[5],@a[5]

	// the callee left the saved result pointer in b_ptr: store ret->im
	stp	@acc[0],@acc[1],[$b_ptr,#48]
	stp	@acc[2],@acc[3],[$b_ptr,#64]
	stp	@acc[4],@acc[5],[$b_ptr,#80]

	// second product: multiplicand = t0, multiplier = t1 (word 0 preloaded)
	ldp	@a[0],@a[1],[sp]
	ldr	$bi,[sp,#48]
	ldp	@a[2],@a[3],[sp,#16]
	ldp	@a[4],@a[5],[sp,#32]

	add	$b_ptr,sp,#48
	bl	__mul_mont_383_nonred	// mul_mont_384(ret->re, t0, t1)
	// restore the return address overwritten by the bl instructions
	ldr	x30,[x29,#8]

	// t1 was multiplied as an unsigned six-word value.  Where the
	// subtraction that produced it borrowed (mask all-ones), t0 is
	// subtracted from the product; presumably this cancels the 2^384
	// wrap-around of t1 -- confirm.
	ldr	@acc[6],[sp,#96]	// account for sign from a->re - a->im
	ldp	@acc[0],@acc[1],[sp]
	ldp	@acc[2],@acc[3],[sp,#16]
	ldp	@acc[4],@acc[5],[sp,#32]

	and	@acc[0],@acc[0],@acc[6]
	and	@acc[1],@acc[1],@acc[6]
	and	@acc[2],@acc[2],@acc[6]
	and	@acc[3],@acc[3],@acc[6]
	and	@acc[4],@acc[4],@acc[6]
	and	@acc[5],@acc[5],@acc[6]

	subs	@a[0],@a[0],@acc[0]
	sbcs	@a[1],@a[1],@acc[1]
	sbcs	@a[2],@a[2],@acc[2]
	sbcs	@a[3],@a[3],@acc[3]
	sbcs	@a[4],@a[4],@acc[4]
	sbcs	@a[5],@a[5],@acc[5]
	sbc	@acc[6],xzr,xzr

	// if that correction itself borrowed, add the modulus back
	// (mask-and-add, no branch)
	and	@acc[0],@mod[0],@acc[6]
	and	@acc[1],@mod[1],@acc[6]
	and	@acc[2],@mod[2],@acc[6]
	and	@acc[3],@mod[3],@acc[6]
	and	@acc[4],@mod[4],@acc[6]
	and	@acc[5],@mod[5],@acc[6]

	adds	@a[0],@a[0],@acc[0]
	adcs	@a[1],@a[1],@acc[1]
	adcs	@a[2],@a[2],@acc[2]
	adcs	@a[3],@a[3],@acc[3]
	adcs	@a[4],@a[4],@acc[4]
	adc	@a[5],@a[5],@acc[5]

	// b_ptr again holds the saved result pointer: store ret->re
	stp	@a[0],@a[1],[$b_ptr]
	stp	@a[2],@a[3],[$b_ptr,#16]
	stp	@a[4],@a[5],[$b_ptr,#32]

	// release the scratch area, restore registers and the frame
	add	sp,sp,#112
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sqr_mont_382x,.-sqr_mont_382x

// __mul_mont_383_nonred: word-by-word Montgomery multiplication of two
// six-word operands with the reduction steps interleaved.
// In:  a[0..5]   - first operand, in registers
//      bi        - word 0 of the second operand, preloaded by the caller
//      b_ptr     - address of the second operand; words 1..5 are read here
//      mod[0..5] - modulus words
//      n0        - reduction factor; re-read from [x29,#96] for each word
// Out: a[0..5]   - result; no final subtraction of the modulus is done
//      b_ptr     - replaced by the word the caller saved at [x29,#104]
// Clobbers acc[0..6], tmp[0..5], bi and n0.  Three of the tmp registers
// are x0, x1 and x3, so those argument registers do not survive the call.
.type	__mul_mont_383_nonred,%function
.align	5
__mul_mont_383_nonred:
	// low halves of a[] * b[0]
	mul	@acc[0],@a[0],$bi
	mul	@acc[1],@a[1],$bi
	mul	@acc[2],@a[2],$bi
	mul	@acc[3],@a[3],$bi
	mul	@acc[4],@a[4],$bi
	mul	@acc[5],@a[5],$bi
	// reduction multiplier for this round: n0 times the lowest word
	mul	$n0,$n0,@acc[0]

	// high halves of a[] * b[0], added one word higher
	 umulh	@tmp[0],@a[0],$bi
	 umulh	@tmp[1],@a[1],$bi
	 umulh	@tmp[2],@a[2],$bi
	 umulh	@tmp[3],@a[3],$bi
	 umulh	@tmp[4],@a[4],$bi
	 umulh	@tmp[5],@a[5],$bi

	// interleaved with the low halves of mod[] * reduction multiplier
	 adds	@acc[1],@acc[1],@tmp[0]
	mul	@tmp[0],@mod[0],$n0
	 adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	 adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	 adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	 adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	 adc	@acc[6],xzr,    @tmp[5]
	mul	@tmp[5],@mod[5],$n0
___
# Emit one multiply-and-reduce round per remaining word (1..5) of the second
# operand.  Each round first completes the reduction that the previous round
# started, then accumulates a[] * b[i] and starts the next reduction.
for ($i=1;$i<6;$i++) {
$code.=<<___;
	// fetch the next word of the second operand
	ldr	$bi,[$b_ptr,8*$i]
	// add the low halves of mod[] * multiplier; the sum written to
	// acc[0] is not read again, only its carry is used
	adds	@acc[0],@acc[0],@tmp[0]
	 umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	 umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	 umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	 umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	 umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	 umulh	@tmp[5],@mod[5],$n0
	adc	@acc[6],@acc[6],xzr

	// add the high halves while moving the accumulator down one word;
	// n0 is reloaded because it was overwritten with the multiplier
	ldr	$n0,[x29,#96]
	 adds	@acc[0],@acc[1],@tmp[0]
	mul	@tmp[0],@a[0],$bi
	 adcs	@acc[1],@acc[2],@tmp[1]
	mul	@tmp[1],@a[1],$bi
	 adcs	@acc[2],@acc[3],@tmp[2]
	mul	@tmp[2],@a[2],$bi
	 adcs	@acc[3],@acc[4],@tmp[3]
	mul	@tmp[3],@a[3],$bi
	 adcs	@acc[4],@acc[5],@tmp[4]
	mul	@tmp[4],@a[4],$bi
	 adcs	@acc[5],@acc[6],@tmp[5]
	mul	@tmp[5],@a[5],$bi
	 adc	@acc[6],xzr,xzr

	// accumulate a[] * b[i] and derive the next reduction multiplier
	adds	@acc[0],@acc[0],@tmp[0]
	 umulh	@tmp[0],@a[0],$bi
	adcs	@acc[1],@acc[1],@tmp[1]
	 umulh	@tmp[1],@a[1],$bi
	adcs	@acc[2],@acc[2],@tmp[2]
	mul	$n0,$n0,@acc[0]
	 umulh	@tmp[2],@a[2],$bi
	adcs	@acc[3],@acc[3],@tmp[3]
	 umulh	@tmp[3],@a[3],$bi
	adcs	@acc[4],@acc[4],@tmp[4]
	 umulh	@tmp[4],@a[4],$bi
	adcs	@acc[5],@acc[5],@tmp[5]
	 umulh	@tmp[5],@a[5],$bi
	adc	@acc[6],@acc[6],xzr

	 adds	@acc[1],@acc[1],@tmp[0]
	mul	@tmp[0],@mod[0],$n0
	 adcs	@acc[2],@acc[2],@tmp[1]
	mul	@tmp[1],@mod[1],$n0
	 adcs	@acc[3],@acc[3],@tmp[2]
	mul	@tmp[2],@mod[2],$n0
	 adcs	@acc[4],@acc[4],@tmp[3]
	mul	@tmp[3],@mod[3],$n0
	 adcs	@acc[5],@acc[5],@tmp[4]
	mul	@tmp[4],@mod[4],$n0
	 adc	@acc[6],@acc[6],@tmp[5]
	mul	@tmp[5],@mod[5],$n0
___
}
$code.=<<___;
	// complete the reduction started by the last round
	adds	@acc[0],@acc[0],@tmp[0]
	 umulh	@tmp[0],@mod[0],$n0
	adcs	@acc[1],@acc[1],@tmp[1]
	 umulh	@tmp[1],@mod[1],$n0
	adcs	@acc[2],@acc[2],@tmp[2]
	 umulh	@tmp[2],@mod[2],$n0
	adcs	@acc[3],@acc[3],@tmp[3]
	 umulh	@tmp[3],@mod[3],$n0
	adcs	@acc[4],@acc[4],@tmp[4]
	 umulh	@tmp[4],@mod[4],$n0
	adcs	@acc[5],@acc[5],@tmp[5]
	 umulh	@tmp[5],@mod[5],$n0
	adc	@acc[6],@acc[6],xzr
	// restore n0 and fetch the word saved next to it into b_ptr
	 ldp	$n0,$b_ptr,[x29,#96]		// pull r_ptr

	// final shift-and-add leaves the result in a[0..5]
	 adds	@a[0],@acc[1],@tmp[0]
	 adcs	@a[1],@acc[2],@tmp[1]
	 adcs	@a[2],@acc[3],@tmp[2]
	 adcs	@a[3],@acc[4],@tmp[3]
	 adcs	@a[4],@acc[5],@tmp[4]
	 adcs	@a[5],@acc[6],@tmp[5]

	ret
.size	__mul_mont_383_nonred,.-__mul_mont_383_nonred

// sgn0_pty_mont_384: returns two flag bits for a six-word value.
//   x0 - handed to __mul_by_1_mont_384 in x1 (presumably the address of
//        the six-word input -- confirm against that helper)
//   x1 - pointer to the six modulus words
//   x2 - n0 word, moved to x4 for the helper
// Result in x0: bit 0 is the lowest bit of the value the helper leaves in
// a[0..5] ("pty"); bit 1 is set when twice that value is not smaller than
// the modulus ("sgn0").
// NOTE(review): __mul_by_1_mont_384 is defined outside this section;
// presumably it converts the input out of Montgomery form -- confirm.
.globl	sgn0_pty_mont_384
.hidden	sgn0_pty_mont_384
.type	sgn0_pty_mont_384,%function
.align	5
sgn0_pty_mont_384:
	paciasp
	// 128-byte frame; the slots above +96 are not written here
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	// shuffle the three arguments into the registers the helper uses
	mov	$n0,$b_ptr
	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]
	mov	$a_ptr,$r_ptr

	bl	__mul_by_1_mont_384
	// restore the return address overwritten by bl
	ldr	x30,[x29,#8]

	// bit 0: lowest bit of the value
	and	$r_ptr,@a[0],#1
	// double the value, keeping the carry-out as a seventh word
	adds	@a[0],@a[0],@a[0]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$bi,xzr,xzr

	// subtract the modulus; the top word ends up 0 when the doubled
	// value is at least the modulus and all-ones when it is smaller
	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$bi,$bi,xzr

	// invert and keep bit 1: set exactly when no borrow occurred
	mvn	$bi,$bi
	and	$bi,$bi,#2
	orr	$r_ptr,$r_ptr,$bi

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sgn0_pty_mont_384,.-sgn0_pty_mont_384

// sgn0_pty_mont_384x: two-component variant of sgn0_pty_mont_384 for a
// value a = (re, im) stored as two consecutive 48-byte halves.
//   x0 - handed to __mul_by_1_mont_384 in x1 (presumably the address of
//        a->re, with a->im 48 bytes further -- confirm against the helper)
//   x1 - pointer to the six modulus words
//   x2 - n0 word, moved to x4 for the helper
// Result in x0: bit 0 is the parity bit of a->re, or of a->im when a->re
// is zero; bit 1 is the sign bit of a->im, or of a->re when a->im is zero.
// The per-component bits are computed exactly as in sgn0_pty_mont_384.
// NOTE(review): the values kept in x1, x2 and x3 across the calls to
// __mul_by_1_mont_384 are used afterwards, so that helper is expected to
// leave them intact -- confirm.
.globl	sgn0_pty_mont_384x
.hidden	sgn0_pty_mont_384x
.type	sgn0_pty_mont_384x,%function
.align	5
sgn0_pty_mont_384x:
	paciasp
	// 128-byte frame; the slots above +96 are not written here
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	// shuffle the three arguments into the registers the helper uses
	mov	$n0,$b_ptr
	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]
	mov	$a_ptr,$r_ptr

	// first half (a->re)
	bl	__mul_by_1_mont_384
	// advance the input pointer to the second half for the next call
	add	$a_ptr,$a_ptr,#48

	// b_ptr collects the two flag bits of a->re, n_ptr the OR of all its
	// words (zero exactly when a->re is zero); the doubling carry chain
	// is interleaved with the OR chain
	and	$b_ptr,@a[0],#1
	 orr	$n_ptr,@a[0],@a[1]
	adds	@a[0],@a[0],@a[0]
	 orr	$n_ptr,$n_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	 orr	$n_ptr,$n_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	 orr	$n_ptr,$n_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	 orr	$n_ptr,$n_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$bi,xzr,xzr

	// doubled value minus modulus: top word is 0 when no borrow occurs
	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$bi,$bi,xzr

	mvn	$bi,$bi
	and	$bi,$bi,#2
	orr	$b_ptr,$b_ptr,$bi

	// second half (a->im)
	bl	__mul_by_1_mont_384
	// restore the return address overwritten by the bl instructions
	ldr	x30,[x29,#8]

	// r_ptr collects the two flag bits of a->im, a_ptr the OR of its words
	and	$r_ptr,@a[0],#1
	 orr	$a_ptr,@a[0],@a[1]
	adds	@a[0],@a[0],@a[0]
	 orr	$a_ptr,$a_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	 orr	$a_ptr,$a_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	 orr	$a_ptr,$a_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	 orr	$a_ptr,$a_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$bi,xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$bi,$bi,xzr

	mvn	$bi,$bi
	and	$bi,$bi,#2
	orr	$r_ptr,$r_ptr,$bi

	// branch-free selection between the two sets of flag bits
	cmp	$n_ptr,#0
	csel	$n_ptr,$r_ptr,$b_ptr,eq	// a->re==0? prty(a->im) : prty(a->re)

	cmp	$a_ptr,#0
	csel	$a_ptr,$r_ptr,$b_ptr,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)

	and	$n_ptr,$n_ptr,#1
	and	$a_ptr,$a_ptr,#2
	orr	$r_ptr,$a_ptr,$n_ptr		// pack sign and parity

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	autiasp
	ret
.size	sgn0_pty_mont_384x,.-sgn0_pty_mont_384x
___

# Disabled alternative (the condition is a literal 0, so nothing below is
# ever appended to $code): __mul_384_comba, a column-wise 6x6-word
# multiplication that stores the twelve product words at $r_ptr.
#
# Every heredoc below finishes one output word ("column" k, the sum of all
# a[i]*b[j] with i+j == k), stores it, and already issues the first product
# of the following column.  Products alternate between the register pairs
# @acc[0..1] and @acc[2..3].
if (0) {
# second operand: $bi plus the first five @mod registers
my @b = ($bi, @mod[0..4]);
# three-word column accumulator; rotated by one register after every store
# so that @comba[0] is always the word being completed
my @comba = @acc[4..6];

# column 0: a0*b0
$code.=<<___;
.type	__mul_384_comba,%function
.align	5
__mul_384_comba:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@b[0],@b[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@b[2],@b[3],[$b_ptr,#16]
	ldp	@b[4],@b[5],[$b_ptr,#32]

	mul	@comba[0],@a[0],@b[0]
	umulh	@comba[1],@a[0],@b[0]
	 mul	@acc[0],@a[1],@b[0]
	 umulh	@acc[1],@a[1],@b[0]
	str	@comba[0],[$r_ptr]
___
	# rotate the accumulator; column 1: a1*b0 + a0*b1
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[2],@a[0],@b[1]
	umulh	@acc[3],@a[0],@b[1]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],xzr,      @acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[2],@b[0]
	umulh	@acc[1],@a[2],@b[0]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#8]
___
	# column 2: a2*b0 + a1*b1 + a0*b2
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[2],@a[1],@b[1]
	umulh	@acc[3],@a[1],@b[1]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[0],@b[2]
	umulh	@acc[1],@a[0],@b[2]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[2],@a[3],@b[0]
	 umulh	@acc[3],@a[3],@b[0]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#16]
___
	# column 3: a3*b0 + a2*b1 + a1*b2 + a0*b3
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[0],@a[2],@b[1]
	umulh	@acc[1],@a[2],@b[1]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[1],@b[2]
	umulh	@acc[3],@a[1],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[0],@b[3]
	umulh	@acc[1],@a[0],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[2],@a[4],@b[0]
	 umulh	@acc[3],@a[4],@b[0]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#24]
___
	# column 4: a4*b0 + a3*b1 + a2*b2 + a1*b3 + a0*b4
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[0],@a[3],@b[1]
	umulh	@acc[1],@a[3],@b[1]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[2],@b[2]
	umulh	@acc[3],@a[2],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[1],@b[3]
	umulh	@acc[1],@a[1],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[0],@b[4]
	umulh	@acc[3],@a[0],@b[4]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[0],@a[5],@b[0]
	 umulh	@acc[1],@a[5],@b[0]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#32]
___
	# column 5: a5*b0 + a4*b1 + a3*b2 + a2*b3 + a1*b4 + a0*b5
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[2],@a[4],@b[1]
	umulh	@acc[3],@a[4],@b[1]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[3],@b[2]
	umulh	@acc[1],@a[3],@b[2]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[2],@b[3]
	umulh	@acc[3],@a[2],@b[3]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[1],@b[4]
	umulh	@acc[1],@a[1],@b[4]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[0],@b[5]
	umulh	@acc[3],@a[0],@b[5]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[0],@a[5],@b[1]
	 umulh	@acc[1],@a[5],@b[1]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#40]
___
	# column 6: a5*b1 + a4*b2 + a3*b3 + a2*b4 + a1*b5
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[2],@a[4],@b[2]
	umulh	@acc[3],@a[4],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	mul	@acc[0],@a[3],@b[3]
	umulh	@acc[1],@a[3],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[2],@a[2],@b[4]
	umulh	@acc[3],@a[2],@b[4]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[1],@b[5]
	umulh	@acc[1],@a[1],@b[5]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[2],@a[5],@b[2]
	 umulh	@acc[3],@a[5],@b[2]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#48]
___
	# column 7: a5*b2 + a4*b3 + a3*b4 + a2*b5
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[0],@a[4],@b[3]
	umulh	@acc[1],@a[4],@b[3]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[3],@b[4]
	umulh	@acc[3],@a[3],@b[4]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	mul	@acc[0],@a[2],@b[5]
	umulh	@acc[1],@a[2],@b[5]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[2],@a[5],@b[3]
	 umulh	@acc[3],@a[5],@b[3]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#56]
___
	# column 8: a5*b3 + a4*b4 + a3*b5
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[0],@a[4],@b[4]
	umulh	@acc[1],@a[4],@b[4]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],xzr,xzr
	mul	@acc[2],@a[3],@b[5]
	umulh	@acc[3],@a[3],@b[5]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],@comba[2],xzr
	 mul	@acc[0],@a[5],@b[4]
	 umulh	@acc[1],@a[5],@b[4]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#64]
___
	# column 9: a5*b4 + a4*b5
	push(@comba,shift(@comba));
$code.=<<___;
	mul	@acc[2],@a[4],@b[5]
	umulh	@acc[3],@a[4],@b[5]
	adds	@comba[0],@comba[0],@acc[0]
	adcs	@comba[1],@comba[1],@acc[1]
	adc	@comba[2],xzr,xzr
	 mul	@acc[0],@a[5],@b[5]
	 umulh	@acc[1],@a[5],@b[5]
	adds	@comba[0],@comba[0],@acc[2]
	adcs	@comba[1],@comba[1],@acc[3]
	adc	@comba[2],@comba[2],xzr
	str	@comba[0],[$r_ptr,#72]
___
	# columns 10 and 11: a5*b5 plus the remaining carry, stored as a pair
	push(@comba,shift(@comba));
$code.=<<___;
	adds	@comba[0],@comba[0],@acc[0]
	adc	@comba[1],@comba[1],@acc[1]
	stp	@comba[0],@comba[1],[$r_ptr,#80]

	ret
.size	__mul_384_comba,.-__mul_384_comba
___
}
# Emit the accumulated assembly.  STDOUT is either the output file or a pipe
# into arm-xlate.pl (see the open calls at the top of the script).  Buffered
# write errors and a failing translator are only reported when the handle is
# closed, so both the write and the close are checked; otherwise a truncated
# or empty output file would go unnoticed by the build.
print $code or die "error writing to STDOUT: $!";

# For a pipe, close also fails when the child exits non-zero; $! is then 0
# and the child's status is in $?.
close STDOUT or die "error closing STDOUT: " . ($! || "child exit status $?");
