#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command line: <flavour> [<output>].  Any flavour other than "void" sends
# the generated assembly through arm-xlate.pl; "void" (or no flavour at all)
# writes it out untranslated.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path including the trailing separator;
    # empty when the script was started by bare file name.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look next to this script first, then in the shared perlasm directory.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # The translator path is quoted the same way as the interpreter path so
    # that a checkout living under a directory with spaces survives the shell.
    open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined $output && length $output) {
    # Untranslated output straight into the named file.
    open STDOUT,'>',$output
        or die "can't open $output: $!";
}
# With neither a translator nor an output name, STDOUT stays as inherited.

# Register allocation shared by every routine emitted below.
# x0-x3 carry the incoming arguments; the names reflect their most common
# role (result, first operand, second operand, modulus), although several
# routines use $b_ptr for a count, a flag or the modulus instead.
($r_ptr,$a_ptr,$b_ptr,$n_ptr) = qw(x0 x1 x2 x3);

@mod = map { "x$_" } 4..9;              # six limbs of the modulus
@a   = map { "x$_" } 10..15;            # six limbs of the accumulator/result
@b   = map { "x$_" } 16, 17, 19..22;    # second operand/scratch; x18 is skipped
                                        # (presumably as the platform register)
# x3 doubles as the carry/borrow scratch once the modulus has been loaded.
$carry = $n_ptr;

# Emit the 384-bit (six 64-bit limbs) modular arithmetic routines.
#
# Everything between the heredoc markers is AArch64 assembly; registers are
# spelled through $r_ptr/$a_ptr/$b_ptr/$n_ptr, @mod, @a, @b and $carry.
#
# Exported routines share one shape: sign the return address (paciasp), push
# a 48-byte frame holding x29/x30 and x19-x22 (the upper four @b registers),
# load the modulus into @mod, run one or more leaf helpers with "bl", reload
# x30 from the frame, store @a through $r_ptr and unwind.  The sgn0_pty_*
# routines are frameless and return their answer in x0.
#
# Leaf helpers (no frame, result in @a, @b and $carry are scratch):
#   __add_mod_384     load a and b, add, then conditionally subtract @mod
#                     once; __add_mod_384_ab_are_loaded is a second entry
#                     point for callers that already filled @a and @b
#   __sub_mod_384     load a and b, subtract, add @mod back on borrow
#   __lshift_mod_384  double @a, then conditionally subtract @mod once
#   __rshift_mod_384  add @mod when @a is odd, then shift right by one bit
# Each of them performs a single conditional correction, so results are
# fully reduced only for inputs that are already below the modulus.
#
# Where the modulus pointer comes from:
#   $n_ptr (4th arg)  add/sub_mod_384[x], lshift/rshift_mod_384, cneg_mod_384
#   $b_ptr (3rd arg)  div_by_2, mul_by_3[x], mul_by_8[x], mul_by_1_plus_i
#   $a_ptr (2nd arg)  sgn0_pty_mod_384[x], whose input value is in $r_ptr
# The "x" variants process two consecutive 48-byte halves (offsets 0 and 48).
#
# lshift_mod_384 and rshift_mod_384 take their shift count in $b_ptr.  The
# loop decrements before it tests, so a zero count would wrap around and
# iterate about 2^64 times; a cbz in front of each loop makes a zero count
# store the input unchanged instead.  Non-zero counts behave as before.
$code.=<<___;
.text

.globl	add_mod_384
.hidden	add_mod_384
.type	add_mod_384,%function
.align	5
add_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__add_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	add_mod_384,.-add_mod_384

.type	__add_mod_384,%function
.align	5
__add_mod_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@b[0],@b[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@b[2],@b[3],[$b_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@b[4],@b[5],[$b_ptr,#32]

__add_mod_384_ab_are_loaded:
	adds	@a[0],@a[0],@b[0]
	adcs	@a[1],@a[1],@b[1]
	adcs	@a[2],@a[2],@b[2]
	adcs	@a[3],@a[3],@b[3]
	adcs	@a[4],@a[4],@b[4]
	adcs	@a[5],@a[5],@b[5]
	adc	$carry,xzr,xzr

	subs	@b[0],@a[0],@mod[0]
	sbcs	@b[1],@a[1],@mod[1]
	sbcs	@b[2],@a[2],@mod[2]
	sbcs	@b[3],@a[3],@mod[3]
	sbcs	@b[4],@a[4],@mod[4]
	sbcs	@b[5],@a[5],@mod[5]
	sbcs	xzr,$carry,xzr

	csel	@a[0],@a[0],@b[0],lo
	csel	@a[1],@a[1],@b[1],lo
	csel	@a[2],@a[2],@b[2],lo
	csel	@a[3],@a[3],@b[3],lo
	csel	@a[4],@a[4],@b[4],lo
	csel	@a[5],@a[5],@b[5],lo

	ret
.size	__add_mod_384,.-__add_mod_384

.globl	add_mod_384x
.hidden	add_mod_384x
.type	add_mod_384x,%function
.align	5
add_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__add_mod_384

	stp	@a[0],@a[1],[$r_ptr]
	add	$a_ptr,$a_ptr,#48
	stp	@a[2],@a[3],[$r_ptr,#16]
	add	$b_ptr,$b_ptr,#48
	stp	@a[4],@a[5],[$r_ptr,#32]

	bl	__add_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	add_mod_384x,.-add_mod_384x

.globl	rshift_mod_384
.hidden	rshift_mod_384
.type	rshift_mod_384,%function
.align	5
rshift_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	cbz	$b_ptr,.Ldone_rshift_mod_384
.Loop_rshift_mod_384:
	sub	$b_ptr,$b_ptr,#1
	bl	__rshift_mod_384
	cbnz	$b_ptr,.Loop_rshift_mod_384

.Ldone_rshift_mod_384:
	ldr	x30,[sp,#8]
	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	rshift_mod_384,.-rshift_mod_384

.type	__rshift_mod_384,%function
.align	5
__rshift_mod_384:
	sbfx	@b[5],@a[0],#0,#1
	 and	@b[0],@b[5],@mod[0]
	 and	@b[1],@b[5],@mod[1]
	adds	@a[0],@a[0],@b[0]
	 and	@b[2],@b[5],@mod[2]
	adcs	@a[1],@a[1],@b[1]
	 and	@b[3],@b[5],@mod[3]
	adcs	@a[2],@a[2],@b[2]
	 and	@b[4],@b[5],@mod[4]
	adcs	@a[3],@a[3],@b[3]
	 and	@b[5],@b[5],@mod[5]
	adcs	@a[4],@a[4],@b[4]
	 extr	@a[0],@a[1],@a[0],#1	// a[0:5] >>= 1
	adcs	@a[5],@a[5],@b[5]
	 extr	@a[1],@a[2],@a[1],#1
	adc	@b[5],xzr,xzr
	 extr	@a[2],@a[3],@a[2],#1
	 extr	@a[3],@a[4],@a[3],#1
	 extr	@a[4],@a[5],@a[4],#1
	 extr	@a[5],@b[5],@a[5],#1
	ret
.size	__rshift_mod_384,.-__rshift_mod_384

.globl	div_by_2_mod_384
.hidden	div_by_2_mod_384
.type	div_by_2_mod_384,%function
.align	5
div_by_2_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__rshift_mod_384

	ldr	x30,[sp,#8]
	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	div_by_2_mod_384,.-div_by_2_mod_384

.globl	lshift_mod_384
.hidden	lshift_mod_384
.type	lshift_mod_384,%function
.align	5
lshift_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	cbz	$b_ptr,.Ldone_lshift_mod_384
.Loop_lshift_mod_384:
	sub	$b_ptr,$b_ptr,#1
	bl	__lshift_mod_384
	cbnz	$b_ptr,.Loop_lshift_mod_384

.Ldone_lshift_mod_384:
	ldr	x30,[sp,#8]
	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	lshift_mod_384,.-lshift_mod_384

.type	__lshift_mod_384,%function
.align	5
__lshift_mod_384:
	adds	@a[0],@a[0],@a[0]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$carry,xzr,xzr

	subs	@b[0],@a[0],@mod[0]
	sbcs	@b[1],@a[1],@mod[1]
	sbcs	@b[2],@a[2],@mod[2]
	sbcs	@b[3],@a[3],@mod[3]
	sbcs	@b[4],@a[4],@mod[4]
	sbcs	@b[5],@a[5],@mod[5]
	sbcs	xzr,$carry,xzr

	csel	@a[0],@a[0],@b[0],lo
	csel	@a[1],@a[1],@b[1],lo
	csel	@a[2],@a[2],@b[2],lo
	csel	@a[3],@a[3],@b[3],lo
	csel	@a[4],@a[4],@b[4],lo
	csel	@a[5],@a[5],@b[5],lo

	ret
.size	__lshift_mod_384,.-__lshift_mod_384

.globl	mul_by_3_mod_384
.hidden	mul_by_3_mod_384
.type	mul_by_3_mod_384,%function
.align	5
mul_by_3_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384

	ldp	@b[0],@b[1],[$a_ptr]
	ldp	@b[2],@b[3],[$a_ptr,#16]
	ldp	@b[4],@b[5],[$a_ptr,#32]

	bl	__add_mod_384_ab_are_loaded
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_3_mod_384,.-mul_by_3_mod_384

.globl	mul_by_8_mod_384
.hidden	mul_by_8_mod_384
.type	mul_by_8_mod_384,%function
.align	5
mul_by_8_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384
	bl	__lshift_mod_384
	bl	__lshift_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_8_mod_384,.-mul_by_8_mod_384

.globl	mul_by_3_mod_384x
.hidden	mul_by_3_mod_384x
.type	mul_by_3_mod_384x,%function
.align	5
mul_by_3_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384

	ldp	@b[0],@b[1],[$a_ptr]
	ldp	@b[2],@b[3],[$a_ptr,#16]
	ldp	@b[4],@b[5],[$a_ptr,#32]

	bl	__add_mod_384_ab_are_loaded

	stp	@a[0],@a[1],[$r_ptr]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#32]
	ldp	@a[4],@a[5],[$a_ptr,#80]

	bl	__lshift_mod_384

	ldp	@b[0],@b[1],[$a_ptr,#48]
	ldp	@b[2],@b[3],[$a_ptr,#64]
	ldp	@b[4],@b[5],[$a_ptr,#80]

	bl	__add_mod_384_ab_are_loaded
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_3_mod_384x,.-mul_by_3_mod_384x

.globl	mul_by_8_mod_384x
.hidden	mul_by_8_mod_384x
.type	mul_by_8_mod_384x,%function
.align	5
mul_by_8_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]

	bl	__lshift_mod_384
	bl	__lshift_mod_384
	bl	__lshift_mod_384

	stp	@a[0],@a[1],[$r_ptr]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#32]
	ldp	@a[4],@a[5],[$a_ptr,#80]

	bl	__lshift_mod_384
	bl	__lshift_mod_384
	bl	__lshift_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_8_mod_384x,.-mul_by_8_mod_384x

.globl	cneg_mod_384
.hidden	cneg_mod_384
.type	cneg_mod_384,%function
.align	5
cneg_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]

	subs	@b[0],@mod[0],@a[0]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]
	 orr	$carry,@a[0],@a[1]
	sbcs	@b[1],@mod[1],@a[1]
	 orr	$carry,$carry,@a[2]
	sbcs	@b[2],@mod[2],@a[2]
	 orr	$carry,$carry,@a[3]
	sbcs	@b[3],@mod[3],@a[3]
	 orr	$carry,$carry,@a[4]
	sbcs	@b[4],@mod[4],@a[4]
	 orr	$carry,$carry,@a[5]
	sbc	@b[5],@mod[5],@a[5]

	cmp	$carry,#0
	csetm	$carry,ne
	ands	$b_ptr,$b_ptr,$carry

	csel	@a[0],@a[0],@b[0],eq
	csel	@a[1],@a[1],@b[1],eq
	csel	@a[2],@a[2],@b[2],eq
	csel	@a[3],@a[3],@b[3],eq
	stp	@a[0],@a[1],[$r_ptr]
	csel	@a[4],@a[4],@b[4],eq
	stp	@a[2],@a[3],[$r_ptr,#16]
	csel	@a[5],@a[5],@b[5],eq
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	cneg_mod_384,.-cneg_mod_384

.globl	sub_mod_384
.hidden	sub_mod_384
.type	sub_mod_384,%function
.align	5
sub_mod_384:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__sub_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr]
	stp	@a[2],@a[3],[$r_ptr,#16]
	stp	@a[4],@a[5],[$r_ptr,#32]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	sub_mod_384,.-sub_mod_384

.type	__sub_mod_384,%function
.align	5
__sub_mod_384:
	ldp	@a[0],@a[1],[$a_ptr]
	ldp	@b[0],@b[1],[$b_ptr]
	ldp	@a[2],@a[3],[$a_ptr,#16]
	ldp	@b[2],@b[3],[$b_ptr,#16]
	ldp	@a[4],@a[5],[$a_ptr,#32]
	ldp	@b[4],@b[5],[$b_ptr,#32]

	subs	@a[0],@a[0],@b[0]
	sbcs	@a[1],@a[1],@b[1]
	sbcs	@a[2],@a[2],@b[2]
	sbcs	@a[3],@a[3],@b[3]
	sbcs	@a[4],@a[4],@b[4]
	sbcs	@a[5],@a[5],@b[5]
	sbc	$carry,xzr,xzr

	 and	@b[0],@mod[0],$carry
	 and	@b[1],@mod[1],$carry
	adds	@a[0],@a[0],@b[0]
	 and	@b[2],@mod[2],$carry
	adcs	@a[1],@a[1],@b[1]
	 and	@b[3],@mod[3],$carry
	adcs	@a[2],@a[2],@b[2]
	 and	@b[4],@mod[4],$carry
	adcs	@a[3],@a[3],@b[3]
	 and	@b[5],@mod[5],$carry
	adcs	@a[4],@a[4],@b[4]
	adc	@a[5],@a[5],@b[5]

	ret
.size	__sub_mod_384,.-__sub_mod_384

.globl	sub_mod_384x
.hidden	sub_mod_384x
.type	sub_mod_384x,%function
.align	5
sub_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$n_ptr]
	ldp	@mod[2],@mod[3],[$n_ptr,#16]
	ldp	@mod[4],@mod[5],[$n_ptr,#32]

	bl	__sub_mod_384

	stp	@a[0],@a[1],[$r_ptr]
	add	$a_ptr,$a_ptr,#48
	stp	@a[2],@a[3],[$r_ptr,#16]
	add	$b_ptr,$b_ptr,#48
	stp	@a[4],@a[5],[$r_ptr,#32]

	bl	__sub_mod_384
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	sub_mod_384x,.-sub_mod_384x

.globl	mul_by_1_plus_i_mod_384x
.hidden	mul_by_1_plus_i_mod_384x
.type	mul_by_1_plus_i_mod_384x,%function
.align	5
mul_by_1_plus_i_mod_384x:
	paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	ldp	@mod[0],@mod[1],[$b_ptr]
	ldp	@mod[2],@mod[3],[$b_ptr,#16]
	ldp	@mod[4],@mod[5],[$b_ptr,#32]
	add	$b_ptr,$a_ptr,#48

	bl	__sub_mod_384			// a->re - a->im

	ldp	@b[0],@b[1],[$a_ptr]
	ldp	@b[2],@b[3],[$a_ptr,#16]
	ldp	@b[4],@b[5],[$a_ptr,#32]
	stp	@a[0],@a[1],[$r_ptr]
	ldp	@a[0],@a[1],[$a_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[2],@a[3],[$a_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#32]
	ldp	@a[4],@a[5],[$a_ptr,#80]

	bl	__add_mod_384_ab_are_loaded	// a->re + a->im
	ldr	x30,[sp,#8]

	stp	@a[0],@a[1],[$r_ptr,#48]
	stp	@a[2],@a[3],[$r_ptr,#64]
	stp	@a[4],@a[5],[$r_ptr,#80]

	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldr	x29,[sp],#48
	autiasp
	ret
.size	mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x

.globl	sgn0_pty_mod_384
.hidden	sgn0_pty_mod_384
.type	sgn0_pty_mod_384,%function
.align	5
sgn0_pty_mod_384:
	ldp	@a[0],@a[1],[$r_ptr]
	ldp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[4],@a[5],[$r_ptr,#32]

	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]

	and	$r_ptr,@a[0],#1
	adds	@a[0],@a[0],@a[0]
	adcs	@a[1],@a[1],@a[1]
	adcs	@a[2],@a[2],@a[2]
	adcs	@a[3],@a[3],@a[3]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	$carry,xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	$carry,$carry,xzr

	mvn	$carry,$carry
	and	$carry,$carry,#2
	orr	$r_ptr,$r_ptr,$carry

	ret
.size	sgn0_pty_mod_384,.-sgn0_pty_mod_384

.globl	sgn0_pty_mod_384x
.hidden	sgn0_pty_mod_384x
.type	sgn0_pty_mod_384x,%function
.align	5
sgn0_pty_mod_384x:
	ldp	@a[0],@a[1],[$r_ptr]
	ldp	@a[2],@a[3],[$r_ptr,#16]
	ldp	@a[4],@a[5],[$r_ptr,#32]

	ldp	@mod[0],@mod[1],[$a_ptr]
	ldp	@mod[2],@mod[3],[$a_ptr,#16]
	ldp	@mod[4],@mod[5],[$a_ptr,#32]

	and	$b_ptr,@a[0],#1
	 orr	$n_ptr,@a[0],@a[1]
	adds	@a[0],@a[0],@a[0]
	 orr	$n_ptr,$n_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	 orr	$n_ptr,$n_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	 orr	$n_ptr,$n_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	 orr	$n_ptr,$n_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	@b[0],xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	@b[0],@b[0],xzr

	ldp	@a[0],@a[1],[$r_ptr,#48]
	ldp	@a[2],@a[3],[$r_ptr,#64]
	ldp	@a[4],@a[5],[$r_ptr,#80]

	mvn	@b[0],@b[0]
	and	@b[0],@b[0],#2
	orr	$b_ptr,$b_ptr,@b[0]

	and	$r_ptr,@a[0],#1
	 orr	$a_ptr,@a[0],@a[1]
	adds	@a[0],@a[0],@a[0]
	 orr	$a_ptr,$a_ptr,@a[2]
	adcs	@a[1],@a[1],@a[1]
	 orr	$a_ptr,$a_ptr,@a[3]
	adcs	@a[2],@a[2],@a[2]
	 orr	$a_ptr,$a_ptr,@a[4]
	adcs	@a[3],@a[3],@a[3]
	 orr	$a_ptr,$a_ptr,@a[5]
	adcs	@a[4],@a[4],@a[4]
	adcs	@a[5],@a[5],@a[5]
	adc	@b[0],xzr,xzr

	subs	@a[0],@a[0],@mod[0]
	sbcs	@a[1],@a[1],@mod[1]
	sbcs	@a[2],@a[2],@mod[2]
	sbcs	@a[3],@a[3],@mod[3]
	sbcs	@a[4],@a[4],@mod[4]
	sbcs	@a[5],@a[5],@mod[5]
	sbc	@b[0],@b[0],xzr

	mvn	@b[0],@b[0]
	and	@b[0],@b[0],#2
	orr	$r_ptr,$r_ptr,@b[0]

	cmp	$n_ptr,#0
	csel	$n_ptr,$r_ptr,$b_ptr,eq	// a->re==0? prty(a->im) : prty(a->re)

	cmp	$a_ptr,#0
	csel	$a_ptr,$r_ptr,$b_ptr,ne	// a->im!=0? sgn0(a->im) : sgn0(a->re)

	and	$n_ptr,$n_ptr,#1
	and	$a_ptr,$a_ptr,#2
	orr	$r_ptr,$a_ptr,$n_ptr	// pack sign and parity

	ret
.size	sgn0_pty_mod_384x,.-sgn0_pty_mod_384x
___
if (1) {
# vec_select($sz) - append the routine vec_select_$sz to $code.
#
# The emitted routine broadcasts $n_ptr into v6 and turns it into an
# all-ones mask when that argument is zero (dup + cmeq #0).  Each 48-byte
# chunk is loaded from $a_ptr and from $b_ptr, "bit" then overwrites the
# $a_ptr data with the $b_ptr data wherever the mask is set, and the chunk
# is stored through $r_ptr.  Net effect: the $b_ptr source is stored when
# $n_ptr is zero, the $a_ptr source otherwise, with no branch on the selector.
#
# $sz is the byte count and must be a positive multiple of 48, because data
# only ever moves in whole 48-byte chunks; any other size would yield a
# routine that touches a different number of bytes than its name says.
#
# Two banks of six vector registers (v0-v5, v16-v21) are rotated on every
# chunk so that the loads for the next chunk overlap the select/store of the
# current one.  v8-v15 are not used (presumably to stay clear of the
# callee-saved vector registers - confirm against the ABI in use).
sub vec_select {
my $sz = shift;
my @v=map("v$_",(0..5,16..21));

die "vec_select: size must be a positive multiple of 48, got '"
    . (defined $sz ? $sz : "undef") . "'"
    unless defined $sz && $sz =~ /^\d+$/ && $sz > 0 && $sz % 48 == 0;

$code.=<<___;
.globl	vec_select_$sz
.hidden	vec_select_$sz
.type	vec_select_$sz,%function
.align	5
vec_select_$sz:
	dup	v6.2d, $n_ptr
	ld1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48
	cmeq	v6.2d, v6.2d, #0
	ld1	{@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48
___
# One pass per chunk except the last: select the chunk already in @v[0..5]
# while fetching the following one into @v[6..11].
for (my $i=0; $i<$sz-48; $i+=48) {
$code.=<<___;
	bit	@v[0].16b, @v[3].16b, v6.16b
	ld1	{@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48
	bit	@v[1].16b, @v[4].16b, v6.16b
	ld1	{@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48
	bit	@v[2].16b, @v[5].16b, v6.16b
	st1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48
___
	@v = @v[6..11,0..5];	# swap the two register banks
}
# Final chunk: nothing left to prefetch.
$code.=<<___;
	bit	@v[0].16b, @v[3].16b, v6.16b
	bit	@v[1].16b, @v[4].16b, v6.16b
	bit	@v[2].16b, @v[5].16b, v6.16b
	st1	{@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr]
	ret
.size	vec_select_$sz,.-vec_select_$sz
___
}
vec_select(48);
vec_select(96);
vec_select(192);
vec_select(144);
vec_select(288);
}

{
my ($inp, $end, $step) = map("x$_", (0..2));

# vec_prefetch: issue seven "prfm pldl1keep" hints starting at $inp.
# On entry $end is added to $inp and decremented, i.e. it becomes the
# address of the last byte when x1 carries a length (presumably a byte
# count - confirm with the caller).  After every hint $inp advances by
# $step (64) and is clamped to $end; once clamped, $step is zeroed so all
# remaining hints stay on that last address.

# Hint at the current address, then advance and clamp.
my $probe = <<___;
	prfm	pldl1keep, [$inp]
	add	$inp, $inp, $step
	cmp	$inp, $end
	csel	$inp, $end, $inp, hi
___
# Stop advancing once the clamp has kicked in (same flags as the csel above).
my $freeze = <<___;
	csel	$step, xzr, $step, hi
___

$code.=<<___;
.globl	vec_prefetch
.hidden	vec_prefetch
.type	vec_prefetch,%function
.align	5
vec_prefetch:
	add	$end, $end, $inp
	sub	$end, $end, #1
	mov	$step, #64
___
$code .= ($probe . $freeze) x 5;
# The sixth advance needs no freeze: only one hint follows it.
$code .= $probe;
$code.=<<___;
	prfm	pldl1keep, [$inp]
	ret
.size	vec_prefetch,.-vec_prefetch
___
}

print $code;

# STDOUT may be a pipe into arm-xlate.pl.  close() is where buffered write
# errors and a non-zero exit status of the translator become visible, so it
# is checked instead of exiting successfully with truncated or missing
# output.  For a piped handle $! is 0 when the only problem was the child's
# exit status, which is then found in $?.
close STDOUT
    or die( $! ? "error closing STDOUT: $!"
               : "error closing STDOUT: child exited with status $?" );
