#!/usr/bin/env perl
#
# Copyright Supranational LLC
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Command line is "[flavour] output". A lone argument that contains a dot
# is taken to be the output file name, which leaves the flavour undefined.
($flavour, $output) = (shift, shift);
($flavour, $output) = (undef, $flavour) if ($flavour =~ /\./);

# Windows-style target if the flavour or the output file extension says so.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Directory part of this script's path, including the trailing separator
# (undefined when the script was invoked without any directory component).
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;

# Look for the translator next to this script first, then fall back to
# the shared perlasm directory.
$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../perlasm/x86_64-xlate.pl" unless -f $xlate;
die "can't locate x86_64-xlate.pl" unless -f $xlate;

# From here on everything printed to STDOUT is piped through the
# translator, which is run with the same perl binary as this script.
open(STDOUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"")
    or die "can't call $xlate: $!";

# Reference C implementation of div_3_limbs, kept as documentation for the
# assembly that follows. It is stored in $c_ref through a non-interpolating
# heredoc; nothing visible in this file reads $c_ref, and only $code is
# printed at the end of the script.
#
# The algorithm produces one quotient bit per iteration: compare R with D,
# record the result as the next bit of Q, subtract D from R if the bit is
# set, then halve D. After LIMB_BITS iterations one more bit is computed;
# if the top bit of Q was already set before that final shift, the result
# is forced to all ones.
$c_ref=<<'___';
/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *   +-------+-------+-------+
 * R |       |       |   0   |
 *   +-------+-------+-------+
 *           +-------+-------+
 * D         |       |       |
 *           +-------+-------+
 */
limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi)
{
    llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0];
    llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo;
    limb_t Q = 0, mask;
    size_t i;

    for (i = 0; i < LIMB_BITS; i++) {
        Q <<= 1;
        mask = (R >= D);
        Q |= mask;
        R -= (D & ((llimb_t)0 - mask));
        D >>= 1;
    }

    mask = 0 - (Q >> (LIMB_BITS - 1));   /* does it overflow? */

    Q <<= 1;
    Q |= (R >= D);

    return (Q | mask);
}
___

# Assembly for div_3_limbs, following the C reference in $c_ref with
# LIMB_BITS = 64 (the loop counter is loaded with 64).
#
# Register use, as established by the instructions below:
#   %rdi      - pointer to the dividend's top limbs; reused as scratch
#               inside the loop once both limbs have been loaded
#   %r9:%r8   - R (hi:lo), with %r11:%r10 holding the copy taken before
#               each trial subtraction
#   %rdx:%rsi - D (hi:lo), shifted right by one bit per iteration
#   %rax      - Q, and the return value
#   %ecx      - loop counter; %rcx holds the final quotient word afterwards
#
# Each iteration subtracts D from R unconditionally and uses cmovc to put
# the saved R back if the subtraction borrowed, so the loop body has no
# data-dependent branch. The quotient bit is formed by computing 2*Q+1
# with lea and then removing the borrow with "sbb \$0"; this relies on
# lea, mov and cmovc leaving the carry flag untouched in between.
#
# The heredoc is interpolating, hence "\$" and "\@" are escaped so that
# they reach the translator as literal "$" and "@".
#
# NOTE(review): the "\@function,3" annotation presumably tells
# x86_64-xlate.pl that the routine takes three arguments so it can adapt
# to other calling conventions - confirm against the translator.
$code.=<<___;
.text

.globl	div_3_limbs
.hidden	div_3_limbs
.type	div_3_limbs,\@function,3
.align	32
div_3_limbs:
	mov	(%rdi),%r8		# load R.lo
	mov	8(%rdi),%r9		# load R.hi
	xor	%rax,%rax		# Q = 0
	mov	\$64,%ecx		# loop counter

.Loop:
	 mov	%r8,%r10		# put aside R
	sub	%rsi,%r8		# R -= D
	 mov	%r9,%r11
	sbb	%rdx,%r9
	lea	1(%rax,%rax),%rax	# Q <<= 1 + speculative bit
	 mov	%rdx,%rdi
	cmovc	%r10,%r8		# restore R if R - D borrowed
	cmovc	%r11,%r9
	sbb	\$0,%rax		# subtract speculative bit
	 shl	\$63,%rdi
	 shr	\$1,%rsi
	 shr	\$1,%rdx
	 or	%rdi,%rsi		# D >>= 1
	sub	\$1,%ecx
	jnz	.Loop

	lea	1(%rax,%rax),%rcx	# Q <<= 1 + speculative bit
	sar	\$63,%rax		# top bit -> mask

	sub	%rsi,%r8		# R -= D
	sbb	%rdx,%r9
	sbb	\$0,%rcx		# subtract speculative bit

	or	%rcx,%rax		# all ones if overflow

	ret
.size	div_3_limbs,.-div_3_limbs
___
########################################################################
# Calculate remainder and adjust the quotient, which can be off-by-one.
# Then save quotient in limb next to top limb of the remainder. There is
# room for it, because the remainder/next-iteration-dividend gets shorter
# by one limb.
{
# Symbolic register names interpolated into the heredoc below; the bare
# block keeps these lexicals private to the two quot_rem_* routines.
#
# $div_rem and $divisor are the pointer arguments the routines dereference.
# The quotient argument arrives in %rdx, which mulq overwrites, so
# quot_rem_128 copies it to $quotient (%rcx) on entry.
my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx");
# Three-limb product divisor[0:1] * quotient. @acc[2] is deliberately
# %rdx: the high half of the second mulq is left there, with the carry
# folded in by adc.
my @acc = ("%r8", "%r9", "%rdx");
# Dividend/remainder limbs. @tmp[2] is %rax, so it is loaded only after
# the last mulq has finished using %rax.
my @tmp = ("%r10", "%r11", "%rax");

# Inside the interpolating heredoc "@acc[0]" and friends are one-element
# array slices that expand to the register name. quot_rem_128 subtracts
# the product from three dividend limbs; if that borrows, the resulting
# all-ones mask decrements the quotient and adds the divisor back.
# quot_rem_64 is the single-limb variant and uses only $div_rem, $divisor
# and @tmp[0].
$code.=<<___;
.globl	quot_rem_128
.hidden	quot_rem_128
.type	quot_rem_128,\@function,3
.align	32
quot_rem_128:
	mov	%rdx, %rax
	mov	%rdx, $quotient

	mulq	0($divisor)		# divisor[0:1] * quotient
	mov	%rax, @acc[0]
	mov	$quotient, %rax
	mov	%rdx, @acc[1]

	mulq	8($divisor)
	add	%rax, @acc[1]
	adc	\$0, %rdx		# %rdx is @acc[2]

	mov	0($div_rem), @tmp[0]	# load 3 limbs of the dividend
	mov	8($div_rem), @tmp[1]
	mov	16($div_rem), @tmp[2]

	sub	@acc[0], @tmp[0]	# dividend - divisor * quotient
	sbb	@acc[1], @tmp[1]
	sbb	@acc[2], @tmp[2]
	sbb	@acc[0], @acc[0]	# borrow -> mask

	add	@acc[0], $quotient	# if borrowed, adjust the quotient ...
	mov	@acc[0], @acc[1]
	and	0($divisor), @acc[0]
	and	8($divisor), @acc[1]
	add	@acc[0], @tmp[0]	# ... and add divisor
	adc	@acc[1], @tmp[1]

	mov	@tmp[0], 0($div_rem)	# save 2 limbs of the remainder ...
	mov	@tmp[1], 8($div_rem)
	mov	$quotient, 16($div_rem)	# ... and 1 limb of the quotient

	mov	$quotient, %rax		# return adjusted quotient

	ret
.size	quot_rem_128,.-quot_rem_128

########################################################################
# Unlike 128-bit case above, quotient is exact. As result just one limb
# of the dividend is sufficient to calculate the remainder...

.globl	quot_rem_64
.hidden	quot_rem_64
.type	quot_rem_64,\@function,3
.align	32
quot_rem_64:
	mov	%rdx, %rax		# return quotient
	imulq	0($divisor), %rdx	# divisor[0] * quotient

	mov	0($div_rem), @tmp[0]	# load 1 limb of the dividend

	sub	%rdx, @tmp[0]		# dividend - divisor * quotient

	mov	@tmp[0], 0($div_rem)	# save 1 limb of the remainder ...
	mov	%rax, 8($div_rem)	# ... and 1 limb of the quotient

	ret
.size	quot_rem_64,.-quot_rem_64
___
}

# Hand the accumulated assembly to the translator through the STDOUT pipe.
print $code;
# STDOUT is a piped open: close() flushes the buffered output, waits for
# the translator and returns false if the flush failed or the child exited
# with a non-zero status. Ignoring that would let a failed translation
# pass as success, so abort instead. $! is 0 when the only problem was the
# child's exit status, hence $? is reported as well.
close STDOUT or die "error closing STDOUT: $! (child status $?)";