#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Both constant-time and fast quadratic residue test as suggested in
# https://eprint.iacr.org/2020/972. Performance is >12x better [on
# Cortex cores] than modulus-specific Legendre symbol addition chain...
#
# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod);
#
# Executable specification of the algorithm, written in Python and kept
# as a non-interpolated string. It documents, step by step, what the
# generated subroutines (named in its inline comments) compute. Nothing
# in the visible part of this script prints or evaluates the string.
$python_ref.=<<'___';
def ct_is_square_mod_384(inp, mod):
    a = inp
    b = mod
    L = 0   # only least significant bit, adding 1 makes up for sign change

    k = 30
    w = 32
    mask = (1 << w) - 1

    for i in range(0, 768 // k - 1):
        # __ab_approximation_30
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            a_, b_ = a, b
        else:
            a_ = (a & mask) | ((a >> (n-w)) << w)
            b_ = (b & mask) | ((b >> (n-w)) << w)

        # __inner_loop_30
        f0, g0, f1, g1 = 1, 0, 0, 1
        for j in range(0, k):
            if a_ & 1:
                if a_ < b_:
                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
                    L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits
                                        # tell the whole story
                a_, f0, g0 = a_-b_, f0-f1, g0-g1
            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
            L += (b_ + 2) >> 2          # if |b|%8 is 3 or 5 [out of 1,3,5,7]

        # __smulq_384_n_shift_by_30
        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
        if b < 0:
            b = -b
        if a < 0:
            a = -a
            L += (b % 4) >> 1           # |b| is always odd, the second bit
                                        # tells the whole story

    if True:
        for j in range(0, 768 % k + k):
            if a & 1:
                if a < b:
                    a, b = b, a
                    L += (a & b) >> 1   # |a| and |b| are both odd, second bits
                                        # tell the whole story
                a = a-b
            a = a >> 1
            L += (b + 2) >> 2           # if |b|%8 is 3 or 5 [out of 1,3,5,7]

    return (L & 1) ^ 1
___

# Command line: [<flavour> [<output>]].
#
# With a flavour other than "void" the generated text is piped through
# arm-xlate.pl, which is looked up first next to this script and then
# in ../../perlasm/ relative to it. Otherwise the text is written
# verbatim to <output>.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Directory part of the script path, trailing separator included.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Run the translator with the same perl binary that runs this
    # script; failing to launch it must not go unnoticed.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # NOTE(review): intentionally left unchecked. When no output name is
    # given this open fails and the script presumably keeps writing to
    # the inherited STDOUT; confirm that before adding an error check.
    open STDOUT,">$output";
}

# Register allocation for the generated code.
#   x0..x2         source pointer, destination pointer, accumulator |L|
#   x3..x14        twelve 64-bit working limbs, @acc
#   x15..x17,x19,x20  loop counter and the |f0|, |g0|, |f1|, |g1| factors
#                  (x18 is deliberately left out of the list)
#   x21..x28       temporaries, @t
my ($in_ptr, $out_ptr, $L)    = map { "x$_" } 0 .. 2;
my @acc                       = map { "x$_" } 3 .. 14;
my ($cnt, $f0, $g0, $f1, $g1) = map { "x$_" } 15 .. 17, 19 .. 20;
my @t                         = map { "x$_" } 21 .. 28;

# |a_| and |b_| live in the 6th and the 12th working limb respectively.
my ($a_, $b_) = ($acc[5], $acc[11]);

# Bytes of stack reserved by the entry point below the saved registers.
$frame = 512;

# Entry point: bool ct_is_square_mod_384(const vec384 inp, const vec384 mod).
#
# Generated code outline:
#  - save x29/x30 and x19..x28, reserve $frame bytes of stack and pick a
#    256-byte-aligned spot in it;
#  - copy the modulus to offset 0 (|b|) and the input to offset 8*6 (|a|);
#  - run 24 rounds of __ab_approximation_30 (which tail-calls
#    __inner_loop_30) followed by two __smul_384_n_shift_by_30 calls,
#    the first producing the new |b|, the second the new |a|; source and
#    destination alternate between two areas 128 bytes apart by
#    toggling bit 7 of the pointer;
#  - finish with 48 steps of __inner_loop_48 on the exact values;
#  - return (L & 1) ^ 1 in x0.
#
# Fix: |a_| is @acc[5]. After the second __smul_384_n_shift_by_30 call
# @acc[0..5] hold the new |a|, so @acc[5] is its most significant limb
# rather than the exact value the last iteration needs. The least
# significant limb, @acc[0], is now copied into |a_| in the same way
# |b_| is taken from @acc[6]. The trailing lines of this heredoc open
# the definition of __smul_384_n_shift_by_30, whose body is appended
# right after.
$code.=<<___;
.text

.globl	ct_is_square_mod_384
.type	ct_is_square_mod_384, %function
.align	5
ct_is_square_mod_384:
	paciasp
	stp	x29, x30, [sp,#-128]!
	add	x29, sp, #0
	stp	x19, x20, [sp,#16]
	stp	x21, x22, [sp,#32]
	stp	x23, x24, [sp,#48]
	stp	x25, x26, [sp,#64]
	stp	x27, x28, [sp,#80]
	sub	sp, sp, #$frame

	ldp	@acc[0], @acc[1], [x0,#8*0]		// load input
	ldp	@acc[2], @acc[3], [x0,#8*2]
	ldp	@acc[4], @acc[5], [x0,#8*4]

	add	$in_ptr, sp, #255	// find closest 256-byte-aligned spot
	and	$in_ptr, $in_ptr, #-256	// in the frame...

	ldp	@acc[6], @acc[7], [x1,#8*0]		// load modulus
	ldp	@acc[8], @acc[9], [x1,#8*2]
	ldp	@acc[10], @acc[11], [x1,#8*4]

	stp	@acc[0], @acc[1], [$in_ptr,#8*6]	// copy input to |a|
	stp	@acc[2], @acc[3], [$in_ptr,#8*8]
	stp	@acc[4], @acc[5], [$in_ptr,#8*10]
	stp	@acc[6], @acc[7], [$in_ptr,#8*0]	// copy modulus to |b|
	stp	@acc[8], @acc[9], [$in_ptr,#8*2]
	stp	@acc[10], @acc[11], [$in_ptr,#8*4]

	eor	$L, $L, $L			// init the Legendre symbol
	mov	$cnt, #24			// 24 is 768/30-1
	b	.Loop_is_square

.align	4
.Loop_is_square:
	bl	__ab_approximation_30
	sub	$cnt, $cnt, #1

	eor	$out_ptr, $in_ptr, #128		// pointer to dst |b|
	bl	__smul_384_n_shift_by_30

	mov	$f1, $f0			// |f0|
	mov	$g1, $g0			// |g0|
	add	$out_ptr, $out_ptr, #8*6	// pointer to dst |a|
	bl	__smul_384_n_shift_by_30

	ldp	@acc[6], @acc[7], [$out_ptr,#-8*6]
	eor	$in_ptr, $in_ptr, #128		// flip-flop src |a|b|
	and	@t[6], @t[6], @acc[6]		// if |a| was negative,
	add	$L, $L, @t[6], lsr#1		// adjust |L|

	cbnz	$cnt, .Loop_is_square

	////////////////////////////////////////// last iteration
	//bl	__ab_approximation_30		// |a| and |b| are exact,
	//ldr	$a_, [$in_ptr,#8*6]		// just load
	mov	$a_, @acc[0]			// ldr	$a_, [$in_ptr,#8*6]
	mov	$b_, @acc[6]			// ldr	$b_, [$in_ptr,#8*0]
	mov	$cnt, #48			// 48 is 768%30 + 30
	bl	__inner_loop_48
	ldr	x30, [x29,#8]

	and	x0, $L, #1
	eor	x0, x0, #1

	add	sp, sp, #$frame
	ldp	x19, x20, [x29,#16]
	ldp	x21, x22, [x29,#32]
	ldp	x23, x24, [x29,#48]
	ldp	x25, x26, [x29,#64]
	ldp	x27, x28, [x29,#80]
	ldr	x29, [sp],#128
	autiasp
	ret
.size	ct_is_square_mod_384,.-ct_is_square_mod_384

.type	__smul_384_n_shift_by_30, %function
.align	5
__smul_384_n_shift_by_30:
___
# Emit the two multiplication halves of __smul_384_n_shift_by_30.
#
#   pass 0: six limbs at [$in_ptr+0]   go to @acc[0..5],  factor is |g1|
#   pass 1: six limbs at [$in_ptr+8*6] go to @acc[6..11], factor is |f1|
#
# Each pass turns the signed factor into its absolute value, conditionally
# negates the multi-limb operand with the same sign mask, and multiplies
# the two. The topmost word of the product lands in @t[5] for pass 0 and
# in @t[6] for pass 1.
for($j=0; $j<2; $j++) {
# Factor register for this pass.
my $fx = $g1;   $fx = $f1           if ($j);
# Pass-local view of the limbs; the file-scope @acc stays untouched.
my @acc = @acc; @acc = @acc[6..11]  if ($j);
# Byte offset of the operand within the source area.
my $k = 8*6*$j;
$code.=<<___;
	ldp	@acc[0], @acc[1], [$in_ptr,#8*0+$k]	// load |b| (or |a|)
	asr	@t[6], $fx, #63		// |g1|'s sign as mask (or |f1|'s)
	ldp	@acc[2], @acc[3], [$in_ptr,#8*2+$k]
	eor	$fx, $fx, @t[6]		// conditionally negate |g1| (or |f1|)
	ldp	@acc[4], @acc[5], [$in_ptr,#8*4+$k]

	eor	@acc[0], @acc[0], @t[6]	// conditionally negate |b| (or |a|)
	sub	$fx, $fx, @t[6]
	eor	@acc[1], @acc[1], @t[6]
	adds	@acc[0], @acc[0], @t[6], lsr#63
	eor	@acc[2], @acc[2], @t[6]
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], @t[6]
	adcs	@acc[2], @acc[2], xzr
	eor	@acc[4], @acc[4], @t[6]
	 umulh	@t[0], @acc[0], $fx
	adcs	@acc[3], @acc[3], xzr
	 umulh	@t[1], @acc[1], $fx
	eor	@acc[5], @acc[5], @t[6]
	 umulh	@t[2], @acc[2], $fx
	adcs	@acc[4], @acc[4], xzr
	 umulh	@t[3], @acc[3], $fx
	adc	@acc[5], @acc[5], xzr

	umulh	@t[4], @acc[4], $fx
	and	@t[7], $fx, @t[6]
	umulh	@t[5+$j], @acc[5], $fx
	neg	@t[7], @t[7]
	mul	@acc[0], @acc[0], $fx
	mul	@acc[1], @acc[1], $fx
	mul	@acc[2], @acc[2], $fx
	adds	@acc[1], @acc[1], @t[0]
	mul	@acc[3], @acc[3], $fx
	adcs	@acc[2], @acc[2], @t[1]
	mul	@acc[4], @acc[4], $fx
	adcs	@acc[3], @acc[3], @t[2]
	mul	@acc[5], @acc[5], $fx
	adcs	@acc[4], @acc[4], @t[3]
	adcs	@acc[5], @acc[5] ,@t[4]
	adc	@t[5+$j], @t[5+$j], @t[7]
___
}
# Tail of __smul_384_n_shift_by_30: add the product held in @acc[6..11]
# (top word @t[6]) to the one held in @acc[0..5] (top word @t[5]), shift
# the sum right by 30 bits, replace it with its absolute value and store
# the six resulting limbs at [$out_ptr]. On return @t[6] holds the sign
# mask of the shifted sum, all ones if it was negative and zero
# otherwise.
$code.=<<___;
	adds	@acc[0], @acc[0], @acc[6]
	adcs	@acc[1], @acc[1], @acc[7]
	adcs	@acc[2], @acc[2], @acc[8]
	adcs	@acc[3], @acc[3], @acc[9]
	adcs	@acc[4], @acc[4], @acc[10]
	adcs	@acc[5], @acc[5], @acc[11]
	adc	@acc[6], @t[5],   @t[6]

	extr	@acc[0], @acc[1], @acc[0], #30
	extr	@acc[1], @acc[2], @acc[1], #30
	extr	@acc[2], @acc[3], @acc[2], #30
	asr	@t[6], @acc[6], #63
	extr	@acc[3], @acc[4], @acc[3], #30
	extr	@acc[4], @acc[5], @acc[4], #30
	extr	@acc[5], @acc[6], @acc[5], #30

	eor	@acc[0], @acc[0], @t[6]
	eor	@acc[1], @acc[1], @t[6]
	adds	@acc[0], @acc[0], @t[6], lsr#63
	eor	@acc[2], @acc[2], @t[6]
	adcs	@acc[1], @acc[1], xzr
	eor	@acc[3], @acc[3], @t[6]
	adcs	@acc[2], @acc[2], xzr
	eor	@acc[4], @acc[4], @t[6]
	adcs	@acc[3], @acc[3], xzr
	eor	@acc[5], @acc[5], @t[6]
	stp	@acc[0], @acc[1], [$out_ptr,#8*0]
	adcs	@acc[4], @acc[4], xzr
	stp	@acc[2], @acc[3], [$out_ptr,#8*2]
	adc	@acc[5], @acc[5], xzr
	stp	@acc[4], @acc[5], [$out_ptr,#8*4]

	ret
.size	__smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30
___

# __ab_approximation_30 and __inner_loop_30.
#
# __ab_approximation_30 condenses |a| and |b| into single 64-bit values
# |a_| and |b_|: the upper part comes from the most significant non-zero
# limbs, aligned to the left with a common shift, and the lower 32 bits
# are copied from the least significant limbs. It expects all limbs of
# |a| and the two lowest limbs of |b| in registers and loads the other
# four limbs of |b| from [$in_ptr] itself. It then branches straight to
# __inner_loop_30, so the "ret" following that branch is never reached.
#
# __inner_loop_30 performs 30 steps on |a_| and |b_|, updates |L| and
# returns the four factors in |f0|, |g0|, |f1|, |g1|. While iterating,
# each factor pair shares one register: f in the lower and g in the
# upper 32 bits, both offset by 0x7FFFFFFF so that the halves do not
# interfere with each other.
{
# Limb views used by the approximation code.
my @a = @acc[0..5];
my @b = @acc[6..11];
# Packed factor pairs reuse the |g0| and |g1| registers. $cnt is
# redeclared here as a temporary so that the 30-step counter does not
# disturb the counter register used by the entry point.
my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]);

$code.=<<___;
.type	__ab_approximation_30, %function
.align	4
__ab_approximation_30:
	ldp	@b[4], @b[5], [$in_ptr,#8*4]	// |a| is still in registers
	ldp	@b[2], @b[3], [$in_ptr,#8*2]

	orr	@t[0], @a[5], @b[5]	// check top-most limbs, ...
	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[3], ne
	orr	@t[0], @a[5], @b[5]	// ... ones before top-most, ...
	csel	@b[4], @b[4], @b[3], ne

	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[2], ne
	orr	@t[0], @a[5], @b[5]	// ... and ones before that ...
	csel	@b[4], @b[4], @b[2], ne

	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[1], ne
	orr	@t[0], @a[5], @b[5]	// and one more, ...
	csel	@b[4], @b[4], @b[1], ne

	cmp	@t[0], #0
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	csel	@a[4], @a[4], @a[0], ne
	orr	@t[0], @a[5], @b[5]
	csel	@b[4], @b[4], @b[0], ne

	clz	@t[0], @t[0]
	cmp	@t[0], #64
	csel	@t[0], @t[0], xzr, ne
	csel	@a[5], @a[5], @a[4], ne
	csel	@b[5], @b[5], @b[4], ne
	neg	@t[1], @t[0]

	lslv	@a[5], @a[5], @t[0]	// align high limbs to the left
	lslv	@b[5], @b[5], @t[0]
	lsrv	@a[4], @a[4], @t[1]
	lsrv	@b[4], @b[4], @t[1]
	and	@a[4], @a[4], @t[1], asr#6
	and	@b[4], @b[4], @t[1], asr#6
	orr	$a_, @a[5], @a[4]
	orr	$b_, @b[5], @b[4]

	bfxil	$a_, @a[0], #0, #32
	bfxil	$b_, @b[0], #0, #32

	b	__inner_loop_30
	ret
.size	__ab_approximation_30,.-__ab_approximation_30

.type	__inner_loop_30, %function
.align	4
__inner_loop_30:
	mov	$cnt, #30
	mov	$fg0, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
	mov	$fg1, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
	mov	$bias,#0x7FFFFFFF7FFFFFFF

.Loop_30:
	sbfx	@t[3], $a_, #0, #1	// if |a_| is odd, then we'll be subtracting
	 and	@t[4], $a_, $b_
	sub	$cnt, $cnt, #1
	and	@t[0], $b_, @t[3]

	sub	@t[1], $b_, $a_		// |b_|-|a_|
	subs	@t[2], $a_, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	 add	@t[4], $L, @t[4], lsr#1	// L + (a_ & b_) >> 1
	mov	@t[0], $fg1
	csel	$b_, $b_, $a_, hs	// |b_| = |a_|
	csel	$a_, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	$fg1, $fg1, $fg0,  hs	// exchange |fg0| and |fg1|
	csel	$fg0, $fg0, @t[0], hs
	 csel	$L,   $L,   @t[4], hs
	lsr	$a_, $a_, #1
	and	@t[0], $fg1, @t[3]
	and	@t[1], $bias, @t[3]
	 add	$t[2], $b_, #2
	sub	$fg0, $fg0, @t[0]	// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	add	$fg1, $fg1, $fg1	// |f1|<<=1
	 add	$L, $L, $t[2], lsr#2	// "negate" |L| if |b|%8 is 3 or 5
	add	$fg0, $fg0, @t[1]
	sub	$fg1, $fg1, $bias

	cbnz	$cnt, .Loop_30

	mov	$bias, #0x7FFFFFFF
	ubfx	$f0, $fg0, #0, #32
	ubfx	$g0, $fg0, #32, #32
	ubfx	$f1, $fg1, #0, #32
	ubfx	$g1, $fg1, #32, #32
	sub	$f0, $f0, $bias		// remove the bias
	sub	$g0, $g0, $bias
	sub	$f1, $f1, $bias
	sub	$g1, $g1, $bias

	ret
.size	__inner_loop_30,.-__inner_loop_30
___
}

# __inner_loop_48: the same step as in __inner_loop_30 but without any
# |f|/|g| bookkeeping; only |a_|, |b_| and |L| are updated. The number
# of steps is taken from the file-scope $cnt register, which the caller
# has to set up before the call.
$code.=<<___;
.type	__inner_loop_48, %function
.align	4
__inner_loop_48:
.Loop_48:
	sbfx	@t[3], $a_, #0, #1	// if |a_| is odd, then we'll be subtracting
	 and	@t[4], $a_, $b_
	sub	$cnt, $cnt, #1
	and	@t[0], $b_, @t[3]
	sub	@t[1], $b_, $a_		// |b_|-|a_|
	subs	@t[2], $a_, @t[0]	// |a_|-|b_| (or |a_|-0 if |a_| was even)
	 add	@t[4], $L, @t[4], lsr#1
	csel	$b_, $b_, $a_, hs	// |b_| = |a_|
	csel	$a_, @t[2], @t[1], hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	 csel	$L,   $L,   @t[4], hs
	 add	$t[2], $b_, #2
	lsr	$a_, $a_, #1
	 add	$L, $L, $t[2], lsr#2	// "negate" |L| if |b|%8 is 3 or 5

	cbnz	$cnt, .Loop_48

	ret
.size	__inner_loop_48,.-__inner_loop_48
___

# Write out everything accumulated in $code. STDOUT may be a pipe to the
# translator script, in which case close() also waits for that process
# and reports its failure; a truncated or untranslated result must stop
# the build instead of passing silently.
print $code;
close STDOUT or die "error closing STDOUT: $!";