TITLE	../openssl/crypto/bn/asm/x86-mont.asm
; Syntax: MASM (Intel operand order), 32-bit flat model.
; Older assemblers (version below 8.00) only get a warning; the file
; still tries to assemble with the fallbacks further down.
IF @Version LT 800
ECHO MASM version 8.00 or later is strongly recommended.
ENDIF
; Enable the Pentium Pro instruction set and the MMX/SSE register
; mnemonics used by the procedure in this file (mm0-mm7, pmuludq, ...).
.686
.XMM
; Assemblers older than 8.00 get a 16-byte-aligned XMMWORD structure
; definition here (presumably because they lack the built-in type).
IF @Version LT 800
XMMWORD STRUCT 16
DQ	2 dup (?)
XMMWORD	ENDS
ENDIF

.MODEL	FLAT
; DOTNAME permits identifiers that start with a dot, such as the
; segment name ".text$" opened below.
OPTION	DOTNAME
; Open the code segment: page alignment on old assemblers, otherwise
; 64-byte alignment.
IF @Version LT 800
.text$	SEGMENT PAGE 'CODE'
ELSE
.text$	SEGMENT ALIGN(64) 'CODE'
ENDIF
; The EXTERN declaration below is intentionally left commented out; in
; this file the symbol is instead declared with COMM in the .bss
; segment near the end of the file.
;EXTERN	_OPENSSL_ia32cap_P:NEAR
; Start the procedure that follows on a 16-byte boundary.
ALIGN	16
;-----------------------------------------------------------------------
; _bn_mul_mont -- word-by-word Montgomery multiplication, 32-bit x86.
;
; Arguments are read from the stack; the routine ends with a plain
; "ret", so the caller removes them. Offsets below are relative to esp
; after the four register pushes. The names describe how each argument
; is used by this code (they match OpenSSL's C prototype for
; bn_mul_mont as far as this file shows -- confirm against the header):
;   20[esp]  rp   result vector, num dwords (written)
;   24[esp]  ap   first multiplicand vector (read)
;   28[esp]  bp   second multiplicand vector (read)
;   32[esp]  np   modulus vector (read)
;   36[esp]  n0   pointer; only the dword at [n0] is read
;   40[esp]  num  vector length in dwords (compared as signed)
;
; Returns: eax = 0 without touching memory when num < 4,
;          eax = 1 after the result has been stored to rp.
; Saved/restored: ebp, ebx, esi, edi.
; Clobbered: eax, ecx, edx, flags; the SSE2 path also uses mm0-mm7 and
;          executes emms before leaving that path.
;
; Private frame built below the caller's stack (offsets from new esp):
;    0[esp]  num-1 (written and read only by the squaring path)
;    4[esp]  rp
;    8[esp]  ap
;   12[esp]  bp   (multiply path: advancing pointer &bp[i];
;                  squaring path: outer index i)
;   16[esp]  np
;   20[esp]  n0[0] (the value, not the pointer)
;   24[esp]  esp as it was after the four pushes (restored on exit)
;   28[esp]  &bp[num], end marker for the generic multiply path
;   32[esp]  tp[0..num+1], temporary accumulator vector
;
; NOTE(review): esp is lowered in one step by 4*(num+2)+32 bytes plus up
; to 2047+2048+63 bytes of alignment padding, and nothing in between is
; touched. On systems that grow the stack through guard pages this may
; require probing -- confirm for the target platform.
;-----------------------------------------------------------------------
_bn_mul_mont	PROC PUBLIC
$L_bn_mul_mont_begin::
	push	ebp
	push	ebx
	push	esi
	push	edi
	; eax = 0 is the return value for the early exit below.
	xor	eax,eax
	; edi = num; bail out (returning 0) when num < 4.
	mov	edi,DWORD PTR 40[esp]
	cmp	edi,4
	jl	$L000just_leave
	; esi = address of the argument block (slot of rp).
	lea	esi,DWORD PTR 20[esp]
	; edx = address of the ap argument slot, used only by the stack
	; placement arithmetic below.
	; NOTE(review): this is the address of the slot (lea), not the ap
	; pointer stored in it -- confirm that this is intended.
	lea	edx,DWORD PTR 24[esp]
	; ebp = stack pointer to restore on exit.
	mov	ebp,esp
	; Reserve 32 bytes of frame plus (num+2) dwords for tp; edi is
	; negated for the lea and negated back, leaving edi = num+2.
	add	edi,2
	neg	edi
	lea	esp,DWORD PTR [edi*4+esp-32]
	neg	edi
	; Lower esp so that (esp - edx) is a multiple of 2048 ...
	mov	eax,esp
	sub	eax,edx
	and	eax,2047
	sub	esp,eax
	; ... then, if esp and edx agree in bit 11, lower esp by another
	; 2048 so that they differ in that bit.
	xor	edx,esp
	and	edx,2048
	xor	edx,2048
	sub	esp,edx
	; Round esp down to a 64-byte boundary.
	and	esp,-64
	; Load the argument block: eax=rp, ebx=ap, ecx=bp, edx=np,
	; esi = n0 pointer and then the dword it points to.
	mov	eax,DWORD PTR [esi]
	mov	ebx,DWORD PTR 4[esi]
	mov	ecx,DWORD PTR 8[esi]
	mov	edx,DWORD PTR 12[esi]
	mov	esi,DWORD PTR 16[esi]
	mov	esi,DWORD PTR [esi]
	; Save a private copy of the arguments in the new frame.
	mov	DWORD PTR 4[esp],eax
	mov	DWORD PTR 8[esp],ebx
	mov	DWORD PTR 12[esp],ecx
	mov	DWORD PTR 16[esp],edx
	mov	DWORD PTR 20[esp],esi
	; edi holds num+2 here, so ebx = num-1 (loop bound for all paths).
	lea	ebx,DWORD PTR [edi-3]
	; Save the stack pointer to restore on exit.
	mov	DWORD PTR 24[esp],ebp
	; Dispatch on bit 26 of the first dword of _OPENSSL_ia32cap_P
	; (presumably the SSE2 capability flag, judging by the label name):
	; bit clear -> integer-only code at non_sse2.
	lea	eax,DWORD PTR _OPENSSL_ia32cap_P
	bt	DWORD PTR [eax],26
	jnc	$L001non_sse2
	;---------------------------------------------------------------
	; SSE2 path (pmuludq/paddq on MMX registers).
	;   mm7 = 0x00000000FFFFFFFF mask     esi = ap   edi = bp   ebp = np
	;   mm4 = bp[i]                       edx = i    ecx = j    ebx = num-1
	;   mm5 = m, the per-row Montgomery factor (low dword is used)
	;   mm2 = running carry of the ap[j]*bp[i] chain
	;   mm3 = running carry of the np[j]*m chain
	;   mm0/mm1 = next ap[]/np[] words and scratch, mm6 = tp[] word
	;---------------------------------------------------------------
	mov	eax,-1
	movd	mm7,eax
	mov	esi,DWORD PTR 8[esp]
	mov	edi,DWORD PTR 12[esp]
	mov	ebp,DWORD PTR 16[esp]
	; i = 0, j = 0
	xor	edx,edx
	xor	ecx,ecx
	; First row (i = 0): mm5 = ap[0]*bp[0]; keep its low word in mm0
	; and a full copy in mm2.
	movd	mm4,DWORD PTR [edi]
	movd	mm5,DWORD PTR [esi]
	movd	mm3,DWORD PTR [ebp]
	pmuludq	mm5,mm4
	movq	mm2,mm5
	movq	mm0,mm5
	pand	mm0,mm7
	; mm5 = low32(ap[0]*bp[0]) * n0. The memory operand is 8 bytes wide
	; and also covers 24[esp], but pmuludq multiplies only the low
	; dwords of its operands.
	pmuludq	mm5,QWORD PTR 20[esp]
	; mm3 = np[0]*m + low word of ap[0]*bp[0]; only the upper half of
	; the sum is carried forward (psrlq below).
	pmuludq	mm3,mm5
	paddq	mm3,mm0
	; Preload np[1] and ap[1]; reduce both chains to their carries.
	movd	mm1,DWORD PTR 4[ebp]
	movd	mm0,DWORD PTR 4[esi]
	psrlq	mm2,32
	psrlq	mm3,32
	; j = 1
	inc	ecx
ALIGN	16
$L0021st:
	; Row 0, columns j = 1 .. num-2:
	;   mm2 += ap[j]*bp[0];  mm3 += np[j]*m + low32(mm2);
	;   tp[j-1] = low32(mm3); both registers then keep their carries.
	; np[j+1] and ap[j+1] are loaded ahead for the next step.
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	movd	mm1,DWORD PTR 4[ecx*4+ebp]
	paddq	mm3,mm0
	movd	mm0,DWORD PTR 4[ecx*4+esi]
	psrlq	mm2,32
	movd	DWORD PTR 28[ecx*4+esp],mm3
	psrlq	mm3,32
	lea	ecx,DWORD PTR 1[ecx]
	cmp	ecx,ebx
	jl	$L0021st
	; Last column of row 0 (j = num-1): same step, storing tp[num-2].
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	paddq	mm3,mm0
	movd	DWORD PTR 28[ecx*4+esp],mm3
	psrlq	mm2,32
	psrlq	mm3,32
	; Sum of both carries is stored as one 64-bit value: tp[num-1]
	; (low dword) and tp[num] (high dword).
	paddq	mm3,mm2
	movq	QWORD PTR 32[ebx*4+esp],mm3
	; i = 1
	inc	edx
$L003outer:
	; Rows i = 1 .. num-1. Same scheme as row 0, with the previous tp[]
	; contents added in.
	xor	ecx,ecx
	movd	mm4,DWORD PTR [edx*4+edi]
	movd	mm5,DWORD PTR [esi]
	movd	mm6,DWORD PTR 32[esp]
	movd	mm3,DWORD PTR [ebp]
	; mm5 = ap[0]*bp[i] + tp[0]
	pmuludq	mm5,mm4
	paddq	mm5,mm6
	movq	mm0,mm5
	movq	mm2,mm5
	pand	mm0,mm7
	; mm5 = low32(ap[0]*bp[i] + tp[0]) * n0, i.e. m for this row.
	pmuludq	mm5,QWORD PTR 20[esp]
	pmuludq	mm3,mm5
	paddq	mm3,mm0
	; Preload tp[1], np[1], ap[1]; fold tp[1] into the ap*bp carry.
	movd	mm6,DWORD PTR 36[esp]
	movd	mm1,DWORD PTR 4[ebp]
	movd	mm0,DWORD PTR 4[esi]
	psrlq	mm2,32
	psrlq	mm3,32
	paddq	mm2,mm6
	; j = 1; ebx becomes the inner-loop down-counter (num-2).
	inc	ecx
	dec	ebx
$L004inner:
	; Columns j = 1 .. num-2 of row i:
	;   mm2 += ap[j]*bp[i] (tp[j] was already folded in);
	;   mm3 += np[j]*m + low32(mm2); tp[j-1] = low32(mm3);
	;   then tp[j+1] is added to the ap*bp carry for the next column.
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	movd	mm6,DWORD PTR 36[ecx*4+esp]
	pand	mm0,mm7
	movd	mm1,DWORD PTR 4[ecx*4+ebp]
	paddq	mm3,mm0
	movd	mm0,DWORD PTR 4[ecx*4+esi]
	psrlq	mm2,32
	movd	DWORD PTR 28[ecx*4+esp],mm3
	psrlq	mm3,32
	paddq	mm2,mm6
	; The branch uses ZF from dec; the lea in between leaves flags alone.
	dec	ebx
	lea	ecx,DWORD PTR 1[ecx]
	jnz	$L004inner
	; ecx = num-1 here; restore ebx = num-1.
	mov	ebx,ecx
	; Last column (j = num-1) of row i, storing tp[num-2].
	pmuludq	mm0,mm4
	pmuludq	mm1,mm5
	paddq	mm2,mm0
	paddq	mm3,mm1
	movq	mm0,mm2
	pand	mm0,mm7
	paddq	mm3,mm0
	movd	DWORD PTR 28[ecx*4+esp],mm3
	psrlq	mm2,32
	psrlq	mm3,32
	; Both carries plus the old tp[num] are stored as one 64-bit value:
	; tp[num-1] (low dword) and tp[num] (high dword).
	movd	mm6,DWORD PTR 36[ebx*4+esp]
	paddq	mm3,mm2
	paddq	mm3,mm6
	movq	QWORD PTR 32[ebx*4+esp],mm3
	; i++ and loop while i <= num-1.
	lea	edx,DWORD PTR 1[edx]
	cmp	edx,ebx
	jle	$L003outer
	; Leave MMX state before the shared integer tail.
	emms
	jmp	$L005common_tail
ALIGN	16
$L001non_sse2:
	;---------------------------------------------------------------
	; Integer-only path.
	;   esi = current vector (ap or np)   edi = current multiplier word
	;   ecx = j   ebx = num-1   ebp = accumulator   edx:eax = mul result
	;---------------------------------------------------------------
	mov	esi,DWORD PTR 8[esp]
	lea	ebp,DWORD PTR 1[ebx]
	mov	edi,DWORD PTR 12[esp]
	xor	ecx,ecx
	mov	edx,esi
	; ebp = num & 1, edx = ap - bp; their OR is zero only when num is
	; even and ap == bp, which selects the squaring code.
	and	ebp,1
	sub	edx,edi
	; eax = &bp[num], the end marker for the generic path.
	lea	eax,DWORD PTR 4[ebx*4+edi]
	or	ebp,edx
	; edi = bp[0]; mov leaves the flags from "or" intact for the jz.
	mov	edi,DWORD PTR [edi]
	jz	$L006bn_sqr_mont
	mov	DWORD PTR 28[esp],eax
	mov	eax,DWORD PTR [esi]
	xor	edx,edx
ALIGN	16
$L007mull:
	; Row 0 of the product: tp[j] = low(ap[j]*bp[0] + carry) for
	; j = 0 .. num-2; the carry travels in edx. ap[j+1] is loaded ahead.
	mov	ebp,edx
	mul	edi
	add	ebp,eax
	lea	ecx,DWORD PTR 1[ecx]
	adc	edx,0
	mov	eax,DWORD PTR [ecx*4+esi]
	cmp	ecx,ebx
	mov	DWORD PTR 28[ecx*4+esp],ebp
	jl	$L007mull
	; Last word ap[num-1]*bp[0]; interleaved with it: edi = n0,
	; esi = np, then edi = n0*tp[0] (the factor m for this row).
	mov	ebp,edx
	mul	edi
	mov	edi,DWORD PTR 20[esp]
	add	eax,ebp
	mov	esi,DWORD PTR 16[esp]
	adc	edx,0
	imul	edi,DWORD PTR 32[esp]
	; tp[num-1] = low word, tp[num] = high word, tp[num+1] = 0.
	mov	DWORD PTR 32[ebx*4+esp],eax
	xor	ecx,ecx
	mov	DWORD PTR 36[ebx*4+esp],edx
	mov	DWORD PTR 40[ebx*4+esp],ecx
	; np[0]*m + tp[0]: the low word of the sum is discarded (eax is
	; overwritten with np[1]); only its carry is kept, in edx.
	mov	eax,DWORD PTR [esi]
	mul	edi
	add	eax,DWORD PTR 32[esp]
	mov	eax,DWORD PTR 4[esi]
	adc	edx,0
	; j = 1
	inc	ecx
	jmp	$L0082ndmadd
ALIGN	16
$L0091stmadd:
	; Rows i >= 1, multiply pass: tp[j] = low(tp[j] + ap[j]*bp[i] + carry)
	; for j = 0 .. num-2.
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD PTR 32[ecx*4+esp]
	lea	ecx,DWORD PTR 1[ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD PTR [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD PTR 28[ecx*4+esp],ebp
	jl	$L0091stmadd
	; Last word: ap[num-1]*bp[i] + tp[num-1] + carry; interleaved with
	; it: edi = n0, esi = np, edi = n0*tp[0] (m for this row).
	mov	ebp,edx
	mul	edi
	add	eax,DWORD PTR 32[ebx*4+esp]
	mov	edi,DWORD PTR 20[esp]
	adc	edx,0
	mov	esi,DWORD PTR 16[esp]
	add	ebp,eax
	adc	edx,0
	imul	edi,DWORD PTR 32[esp]
	; tp[num-1] = low word; tp[num] = high word + old tp[num];
	; tp[num+1] = carry out of that addition (0 or 1).
	xor	ecx,ecx
	add	edx,DWORD PTR 36[ebx*4+esp]
	mov	DWORD PTR 32[ebx*4+esp],ebp
	adc	ecx,0
	mov	eax,DWORD PTR [esi]
	mov	DWORD PTR 36[ebx*4+esp],edx
	mov	DWORD PTR 40[ebx*4+esp],ecx
	; np[0]*m + tp[0]: keep only the carry (see the same step above).
	mul	edi
	add	eax,DWORD PTR 32[esp]
	mov	eax,DWORD PTR 4[esi]
	adc	edx,0
	mov	ecx,1
ALIGN	16
$L0082ndmadd:
	; Reduction pass: tp[j-1] = low(tp[j] + np[j]*m + carry) for
	; j = 1 .. num-2, i.e. the vector moves down by one word.
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD PTR 32[ecx*4+esp]
	lea	ecx,DWORD PTR 1[ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD PTR [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD PTR 24[ecx*4+esp],ebp
	jl	$L0082ndmadd
	; Last word: tp[num-2] = low(tp[num-1] + np[num-1]*m + carry).
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD PTR 32[ebx*4+esp]
	adc	edx,0
	add	ebp,eax
	adc	edx,0
	mov	DWORD PTR 28[ebx*4+esp],ebp
	; tp[num-1] = carry + tp[num]; tp[num] = tp[num+1] + carry-out.
	; Interleaved: advance the saved bp pointer by one word and compare
	; it with the end marker &bp[num].
	xor	eax,eax
	mov	ecx,DWORD PTR 12[esp]
	add	edx,DWORD PTR 36[ebx*4+esp]
	adc	eax,DWORD PTR 40[ebx*4+esp]
	lea	ecx,DWORD PTR 4[ecx]
	mov	DWORD PTR 32[ebx*4+esp],edx
	cmp	ecx,DWORD PTR 28[esp]
	mov	DWORD PTR 36[ebx*4+esp],eax
	je	$L005common_tail
	; Next row: edi = next bp word, esi = ap, store the advanced bp
	; pointer, j = 0, carry = 0, eax = ap[0].
	mov	edi,DWORD PTR [ecx]
	mov	esi,DWORD PTR 8[esp]
	mov	DWORD PTR 12[esp],ecx
	xor	ecx,ecx
	xor	edx,edx
	mov	eax,DWORD PTR [esi]
	jmp	$L0091stmadd
ALIGN	16
$L006bn_sqr_mont:
	;---------------------------------------------------------------
	; Squaring path (taken only when ap == bp and num is even).
	; Cross products ap[j]*ap[i] are doubled on the fly; ebx carries
	; the bit shifted out of the previous doubled word.
	;---------------------------------------------------------------
	; 0[esp] = num-1; 12[esp] now holds the outer index i = 0 (ecx is 0).
	mov	DWORD PTR [esp],ebx
	mov	DWORD PTR 12[esp],ecx
	; tp[0] = low(ap[0]^2); the high word is split into edx = high>>1
	; (carry into the doubled chain) and ebx = high & 1.
	mov	eax,edi
	mul	edi
	mov	DWORD PTR 32[esp],eax
	mov	ebx,edx
	shr	edx,1
	and	ebx,1
	; j = 1
	inc	ecx
ALIGN	16
$L010sqr:
	; tp[j] = 2*low(ap[j]*ap[0] + carry) + ebx for j = 1 .. num-2;
	; ebx then takes the top bit that the doubling shifted out.
	mov	eax,DWORD PTR [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	lea	ecx,DWORD PTR 1[ecx]
	adc	edx,0
	lea	ebp,DWORD PTR [eax*2+ebx]
	shr	eax,31
	cmp	ecx,DWORD PTR [esp]
	mov	ebx,eax
	mov	DWORD PTR 28[ecx*4+esp],ebp
	jl	$L010sqr
	; Last word (j = num-1); interleaved: edi = n0, esi = np,
	; edi = n0*tp[0] (m for this row).
	mov	eax,DWORD PTR [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	mov	edi,DWORD PTR 20[esp]
	adc	edx,0
	mov	esi,DWORD PTR 16[esp]
	lea	ebp,DWORD PTR [eax*2+ebx]
	imul	edi,DWORD PTR 32[esp]
	shr	eax,31
	; tp[num-1] = doubled low word; tp[num] = doubled high word plus
	; the shifted-out bit; tp[num+1] = top bit of the high word.
	mov	DWORD PTR 32[ecx*4+esp],ebp
	lea	ebp,DWORD PTR [edx*2+eax]
	mov	eax,DWORD PTR [esi]
	shr	edx,31
	mov	DWORD PTR 36[ecx*4+esp],ebp
	mov	DWORD PTR 40[ecx*4+esp],edx
	; np[0]*m + tp[0]: only the carry is kept (eax is reloaded with
	; np[1]); ebx = num-1 again, j = 1.
	mul	edi
	add	eax,DWORD PTR 32[esp]
	mov	ebx,ecx
	adc	edx,0
	mov	eax,DWORD PTR 4[esi]
	mov	ecx,1
ALIGN	16
$L0113rdmadd:
	; Reduction pass for the squaring path, unrolled two columns per
	; iteration: tp[j-1] = low(tp[j] + np[j]*m + carry), then the same
	; for column j+1 (stored to tp[j]); j advances by 2.
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD PTR 32[ecx*4+esp]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD PTR 4[ecx*4+esi]
	adc	edx,0
	mov	DWORD PTR 28[ecx*4+esp],ebp
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD PTR 36[ecx*4+esp]
	lea	ecx,DWORD PTR 2[ecx]
	adc	edx,0
	add	ebp,eax
	mov	eax,DWORD PTR [ecx*4+esi]
	adc	edx,0
	cmp	ecx,ebx
	mov	DWORD PTR 24[ecx*4+esp],ebp
	jl	$L0113rdmadd
	; Last word: tp[num-2] = low(tp[num-1] + np[num-1]*m + carry).
	mov	ebp,edx
	mul	edi
	add	ebp,DWORD PTR 32[ebx*4+esp]
	adc	edx,0
	add	ebp,eax
	adc	edx,0
	mov	DWORD PTR 28[ebx*4+esp],ebp
	; ecx = i, esi = ap; tp[num-1] = carry + tp[num];
	; tp[num] = tp[num+1] + carry-out. Done when i == num-1.
	mov	ecx,DWORD PTR 12[esp]
	xor	eax,eax
	mov	esi,DWORD PTR 8[esp]
	add	edx,DWORD PTR 36[ebx*4+esp]
	adc	eax,DWORD PTR 40[ebx*4+esp]
	mov	DWORD PTR 32[ebx*4+esp],edx
	cmp	ecx,ebx
	mov	DWORD PTR 36[ebx*4+esp],eax
	je	$L005common_tail
	; Next row: i++, edi = ap[i]; tp[i] += low(ap[i]^2), with the high
	; word (plus carry) left in edx.
	mov	edi,DWORD PTR 4[ecx*4+esi]
	lea	ecx,DWORD PTR 1[ecx]
	mov	eax,edi
	mov	DWORD PTR 12[esp],ecx
	mul	edi
	add	eax,DWORD PTR 32[ecx*4+esp]
	adc	edx,0
	mov	DWORD PTR 32[ecx*4+esp],eax
	xor	ebp,ebp
	; If i == num-1 there are no cross products left. ecx becomes i+1;
	; lea keeps the flags from cmp for the je.
	cmp	ecx,ebx
	lea	ecx,DWORD PTR 1[ecx]
	je	$L012sqrlast
	; Split the pending high word as before: edx = high>>1,
	; ebx = high & 1 (this overwrites ebx = num-1).
	mov	ebx,edx
	shr	edx,1
	and	ebx,1
ALIGN	16
$L013sqradd:
	; tp[j] = tp[j] + 2*low(ap[j]*ap[i] + carry) + ebx for
	; j = i+1 .. num-1; the new ebx collects the bit shifted out by the
	; doubling plus the carries of the two additions. The loop bound is
	; the num-1 saved at 0[esp], since ebx is in use.
	mov	eax,DWORD PTR [ecx*4+esi]
	mov	ebp,edx
	mul	edi
	add	eax,ebp
	lea	ebp,DWORD PTR [eax*1+eax]
	adc	edx,0
	shr	eax,31
	add	ebp,DWORD PTR 32[ecx*4+esp]
	lea	ecx,DWORD PTR 1[ecx]
	adc	eax,0
	add	ebp,ebx
	adc	eax,0
	cmp	ecx,DWORD PTR [esp]
	mov	DWORD PTR 28[ecx*4+esp],ebp
	mov	ebx,eax
	jle	$L013sqradd
	; Double the final high word: edx = 2*edx + ebx, ebp = bits carried
	; out of that (top bit of edx plus the addition carry).
	mov	ebp,edx
	add	edx,edx
	shr	ebp,31
	add	edx,ebx
	adc	ebp,0
$L012sqrlast:
	; edi = n0*tp[0] (m for this row), esi = np. ecx is num here, so
	; the stores below update tp[num] and tp[num+1].
	mov	edi,DWORD PTR 20[esp]
	mov	esi,DWORD PTR 16[esp]
	imul	edi,DWORD PTR 32[esp]
	add	edx,DWORD PTR 32[ecx*4+esp]
	mov	eax,DWORD PTR [esi]
	adc	ebp,0
	mov	DWORD PTR 32[ecx*4+esp],edx
	mov	DWORD PTR 36[ecx*4+esp],ebp
	; np[0]*m + tp[0]: only the carry is kept; ebx = num-1 again,
	; j = 1, eax = np[1]; continue with the reduction pass.
	mul	edi
	add	eax,DWORD PTR 32[esp]
	lea	ebx,DWORD PTR [ecx-1]
	adc	edx,0
	mov	ecx,1
	mov	eax,DWORD PTR 4[esi]
	jmp	$L0113rdmadd
ALIGN	16
$L005common_tail:
	;---------------------------------------------------------------
	; Shared tail; every path arrives with ebx = num-1.
	;   ebp = np, edi = rp, esi = tp, ecx = down-counter, edx = index
	;---------------------------------------------------------------
	mov	ebp,DWORD PTR 16[esp]
	mov	edi,DWORD PTR 4[esp]
	lea	esi,DWORD PTR 32[esp]
	mov	eax,DWORD PTR [esi]
	mov	ecx,ebx
	; edx = 0; the xor also clears CF for the first sbb.
	xor	edx,edx
ALIGN	16
$L014sub:
	; rp[i] = tp[i] - np[i] - borrow for i = 0 .. num-1. CF is carried
	; from one sbb to the next: dec, mov and lea do not modify it. The
	; loop branch tests the sign of ecx as set by dec.
	sbb	eax,DWORD PTR [edx*4+ebp]
	mov	DWORD PTR [edx*4+edi],eax
	dec	ecx
	mov	eax,DWORD PTR 4[edx*4+esi]
	lea	edx,DWORD PTR 1[edx]
	jge	$L014sub
	; eax = tp[num] - borrow, used as a mask; then
	; esi = (tp AND mask) OR (rp AND NOT mask). With the mask all ones
	; the copy source is tp, with the mask zero it is rp itself.
	; NOTE(review): the selection is made on the source pointer, so the
	; addresses read by the copy loop depend on the borrow -- review if
	; a data-independent memory access pattern is required.
	sbb	eax,0
	and	esi,eax
	not	eax
	mov	ebp,edi
	and	ebp,eax
	or	esi,ebp
ALIGN	16
$L015copy:
	; For ebx = num-1 down to 0: rp[ebx] = source[ebx], and tp[ebx] is
	; overwritten with ecx (left at -1 by the subtraction loop) so the
	; intermediate values do not remain on the stack.
	mov	eax,DWORD PTR [ebx*4+esi]
	mov	DWORD PTR [ebx*4+edi],eax
	mov	DWORD PTR 32[ebx*4+esp],ecx
	dec	ebx
	jge	$L015copy
	; Drop the private frame and report success (eax = 1).
	mov	esp,DWORD PTR 24[esp]
	mov	eax,1
$L000just_leave:
	; Common exit; the early-exit path arrives here with eax = 0.
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
_bn_mul_mont ENDP
; NUL-terminated ASCII identification string placed in the code segment:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
DB	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
DB	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
DB	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
DB	111,114,103,62,0
.text$	ENDS
; _OPENSSL_ia32cap_P is declared as a communal variable of four DWORDs.
; _bn_mul_mont tests bit 26 of its first dword to choose between the
; SSE2 and integer-only code paths.
.bss	SEGMENT 'BSS'
COMM	_OPENSSL_ia32cap_P:DWORD:4
.bss	ENDS
END
