/* Copyright (C) 1996 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David Mosberger <davidm@cs.arizona.edu>, 1996.
   Based on public-domain C source by Linus Torvalds.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU C Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA. */

/* Faster 1 ulp error version by removing final last-bit fiddling, by 
   Joachim Wesner <joachim.wesner@frankfurt.netsurf.de>, July 1998 */

/* Modified and re-scheduled by Kazushige Goto <goto@statabo.rim.or.jp> */

/* Added handling of exceptional inputs (NaN, infinities, negative values,
   zero and subnormal numbers) by Kazushige Goto <goto@statabo.rim.or.jp> */
     
	/* The code in this file is scheduled by hand: keep the assembler from
	   using $at on its own and from reordering instructions.  */
	.set noat
	.set noreorder

	/* The lookup table below goes into the read-only data section.  */
#ifdef __ELF__
	.section .rodata
#else
	.rdata
#endif

	.align 5        # align to cache line

/* sqrtdata: 64 32-bit correction terms (256 bytes) used by sqrt.
   sqrt forms a first 32-bit estimate, derives a byte offset into this
   table from it with (estimate >> 12) & 0xfc, and subtracts the entry
   found there from the estimate before using it as the high word of the
   starting value for its iteration.  The symbol is not exported.  */
sqrtdata:
.long   0x1500, 0x2ef8,   0x4d67,  0x6b02,  0x87be,  0xa395,  0xbe7a,  0xd866
.long   0xf14a, 0x1091b, 0x11fcd, 0x13552, 0x14999, 0x15c98, 0x16e34, 0x17e5f
.long  0x18d03, 0x19a01, 0x1a545, 0x1ae8a, 0x1b5c4, 0x1bb01, 0x1bfde, 0x1c28d
.long  0x1c2de, 0x1c0db, 0x1ba73, 0x1b11c, 0x1a4b5, 0x1953d, 0x18266, 0x16be0
.long  0x1683e, 0x179d8, 0x18a4d, 0x19992, 0x1a789, 0x1b445, 0x1bf61, 0x1c989
.long  0x1d16d, 0x1d77b, 0x1dddf, 0x1e2ad, 0x1e5bf, 0x1e6e8, 0x1e654, 0x1e3cd
.long  0x1df2a, 0x1d635, 0x1cb16, 0x1be2c, 0x1ae4e, 0x19bde, 0x1868e, 0x16e2e
.long  0x1527f, 0x1334a, 0x11051,  0xe951,  0xbe01,  0x8e0d,  0x5924,  0x1edd

	.text

/*
 * double sqrt (double x)
 *
 * In:    $f16 = x
 *        $27  = address of sqrt (base register of the GP load)
 * Out:   $f0  = square root of x
 * Uses:  $0-$5, $18-$25 ($28 too when PROF is defined),
 *        $f1, $f10-$f13, $f16-$f21, and a 32-byte stack frame:
 *           0($30)  bit image of x; reused for the NaN image and, on the
 *                   subnormal path, for the rescaled x and the scale factor
 *           8($30)  0.5
 *          16($30)  0x4007fffffff00000, i.e. 3.0 - 2^-31
 *          24($30)  initial estimate y of 1/sqrt(x)
 *
 * Method: the high word of an estimate y of 1/sqrt(x) is formed as
 * 0x5fe80000 - (x >> 33) minus a correction read from sqrtdata.  Two
 * steps y = 0.5 * y * (3 - x*y*y) refine y, then z = x*y is corrected
 * once with z + 0.5 * z * (1 - z*y).  The note at the top of this file
 * describes this as the 1 ulp error version.
 *
 * Special inputs:
 *   exponent field all ones, fraction non-zero (NaN)  -> all-ones NaN
 *   +infinity                                         -> +infinity
 *   -infinity, and any x for which fblt is taken      -> all-ones NaN
 *   x for which fbeq is taken (zero)                  -> x unchanged
 *   subnormal x  -> the fraction is normalised with integer code, the
 *                   root of the rescaled value is computed as above and
 *                   the result is multiplied by the matching power of two
 */
	.globl  sqrt
	.align 5
	.ent  sqrt
sqrt:
	.frame $30 ,  32, $26
	/* The GP load has to be the first two instructions of the function
	   and relative to the entry address in $27: that is what the
	   ".prologue 1" below declares, and tools are allowed to enter the
	   function just past it.  The frame is allocated only afterwards.  */
	ldgp	$29 , 0($27)
	lda	$30 ,    -32($30)
	stt	$f16, 0($30)			# spill x so its bits can be read

#ifdef PROF
	lda	$28, _mcount
	jsr	$28, ($28), _mcount
	unop
	unop
#endif
	.prologue 1

	lda	$4 , sqrtdata			# load base address into $4
	ldah	$2 , 0x5fe8			# $2 = 0x5fe80000, seed of the estimate
	lda	$18, 0x3fe0
	lda	$21, 0x7ff			# mask for the exponent field

	ldq	$3 ,  0($30)			# $3 = bit image of x
	sll	$18, 48, $18			# $18 = 0x3fe0000000000000
	lda	$19, 0x4008
	stq	$18, 0x08($30)			# 0.5

	srl	$3 , 33, $5
	nop
	subl	$2 , $5 , $2			# first guess = seed - (x >> 33)
	srl	$3,   52, $20

	srl	$2 , 12, $1
	ldt	$f10, 0x08($30)			# $f10 = 0.5
	and	$1 , 0xfc, $1			# byte offset into sqrtdata
	sll	$19, 48, $19			# $19 = 0x4008000000000000 (3.0)

	addq	$1 , $4 , $1
	cmpteq	$f31, $f31, $f11		# 2.0
	and	$21, $20, $23			# $23 = exponent field of x
	addt	$f10, $f10, $f12		# 0.5 + 0.5 = 1.0

	ldl	$1 , 0x0 ($1)			# correction from the table
	cmpeq	$21, $23, $24			# exponent field all ones?
	ldah	$20,-0x10($19)			# $20 = 0x4007fffffff00000
	bne	$24, $Inf_and_Nan

	subl	$2 , $1 , $2			# corrected guess
	stq	$20, 0x10($30)			# 3.0 - 2^-31
	sll	$2 , 32, $2			# guess becomes the high word of y
	fblt	$f16, $NaN			# negative input

	stq	$2 ,  0x18($30)
	addt	$f11, $f12, $f11		# $f11 = 3.0
	beq	$23, $SubNormal			# exponent field zero
	nop

	ldt	$f13, 0x10($30)
	ldt	$f17, 0x18($30)			# $f17 = y

	/* $f10 : 0.5
	   $f11 : 3.0
	   $f12 : 1.0
	   $f13 : 3.0 - 2^-31 */

	mult	$f16, $f17, $f18		# x * y
	mult	$f10, $f17, $f19		# 0.5 * y
	mult	$f18, $f17, $f21		# x * y * y
	subt	$f11, $f21, $f18		# 3. - x * y * y
	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f16, $f17, $f18		# x * y
	mult	$f10, $f17, $f19		# 0.5 * y
	mult	$f18, $f17, $f18		# x * y * y
	subt	$f13, $f18, $f18		# (3. - 2^-31) - x * y * y
	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f16, $f17, $f18		# z = x * y
	mult	$f18, $f17, $f20		# z * y
	mult	$f10, $f18, $f19		# z * 0.5
	subt	$f12, $f20, $f20		# 1.0 - z * y
	mult	$f19, $f20, $f20		# z * 0.5 *(1.0-z*y)
	addt	$f18, $f20, $f0			# z +z * 0.5 *(1.0-z*y)

	addq	$30 , 32, $30
	ret
	.align 4

	/* Exceptional Handling */
	/* sqrt(NaN) = NaN, sqrt(+inf) = +inf, sqrt(-inf) = NaN */
$Inf_and_Nan:
	lda	$4, 0xfff0
	sll	$4, 48, $4			# $4 = 0xfff0000000000000
	andnot	$3, $4, $0			# $0 = fraction bits of x
	bne	$0, $NaN			# fraction non-zero: x is a NaN
	.align 4

$Infinity:
	blt	$3, $NaN			# sign bit set: -infinity
	fmov	$f16, $f0			# +infinity is returned as is
	addq	$30 , 32, $30
	ret
	.align 4

$NaN:
	lda	$0, -1				# all bits set
	stq	$0,  0($30)
	ldt	$f0, 0($30)			# result = 0xffffffffffffffff
	addq	$30 , 32, $30
	ret
	.align 4

$SubNormal:
	lda	$20, 0x7ff
	ldah	$23, 0x0010
	sll	$20,  52, $20			# $20 = 0x7ff0000000000000
	fbeq	$f16, $Zero

	andnot	$3,  $20, $22		# $22 = fraction of x
	sll	$23, 32, $23		# generate 0x00100000 00000000
	lda	$21, 2			# $21 = shift count, starting at 2
	.align 4

	/* Coarse normalisation: shift the fraction left by 20, 20, 12 and
	   4 bits while the tested upper bits are still clear, adding each
	   shift to the count in $21.  */
	zap	$22, 0x0f, $25		# anything in bits 32..63?
	bne	$25, $SubNext1
	sll	$22, 20, $22
	addq	$21, 20, $21

	zap	$22, 0x0f, $25
	bne	$25, $SubNext1
	sll	$22, 20, $22
	addq	$21, 20, $21
	.align 4

$SubNext1:
	zap	$22, 0x1f, $25		# anything in bits 40..63?
	bne	$25, $SubNext2
	sll	$22, 12, $22
	addq	$21, 12, $21

$SubNext2:
	zap	$22, 0x3f, $25		# anything in bits 48..63?
	bne	$25, $SubLoop
	sll	$22, 4, $22
	addq	$21, 4, $21
	.align 4

	/* Fine normalisation: shift one bit at a time until bit 52 is set.  */
$SubLoop:
	sll	$22,   1, $22		# fp << 1
	and	$22, $23, $0

	addq	$21,   1, $21		# exp++
	beq	$0, $SubLoop
	.align 4

	/* Build the rescaled input: the normalised fraction with exponent
	   field 3 when the count is even and 2 when it is odd.  The root of
	   x is then the root of that value times 2^-(count >> 1).  */
	s4subq	$23, $23, $25		# 0x0030....
	addq	$23, $23, $24		# 0x0020....
	cmovlbc	$21, $25, $24		# count even: use 0x0030....
	andnot	$22, $20, $22		# drop bit 52, keep the fraction

	or	$22, $24, $3		# $3 = rescaled x
	srl	$21,   1, $21		# exp >>= 1
	stq	$3,   0($30)
	ldah	$2 , 0x5fe8

	/* Same initial estimate as on the main path, for the rescaled x.  */
	srl	$3 , 33, $5
	subl	$2 , $5 , $2
	srl	$2 , 12, $1
	and	$1 , 0xfc, $1
	addq	$1 , $4 , $1
	ldl	$1 , 0x0 ($1)
	subl	$2 , $1 , $2
	sll	$2 , 32, $2
	stq	$2 ,  0x18($30)

	ldt	$f16, 0x00($30)			# $f16 = rescaled x
	ldt	$f13, 0x10($30)
	ldt	$f17, 0x18($30)

	mult	$f16, $f17, $f18		# x * y
	mult	$f10, $f17, $f19		# 0.5 * y
	mult	$f18, $f17, $f21		# x * y * y
	subt	$f11, $f21, $f18		# 3. - x * y * y
	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)

	mult	$f16, $f17, $f18		# x * y
	lda	$22, 1023
	mult	$f10, $f17, $f19		# 0.5 * y
	subq	$22, $21, $21			# exponent field of the scale factor
	mult	$f18, $f17, $f18		# x * y * y
	sll	$21,  52, $21			# $21 = bits of 2^-(count >> 1)
	subt	$f13, $f18, $f18		# (3. - 2^-31) - x * y * y
	mult	$f19, $f18, $f17		# 0.5 * y * ( 3.0 - x * y * y)
	stq	$21, 0($30)
	mult	$f16, $f17, $f18		# z = x * y
	mult	$f18, $f17, $f20		# z * y
	mult	$f10, $f18, $f19		# z * 0.5
	subt	$f12, $f20, $f20		# 1.0 - z * y
	ldt	$f1, 0($30)			# $f1 = scale factor
	mult	$f19, $f20, $f20		# z * 0.5 *(1.0-z*y)
	addt	$f18, $f20, $f0			# z +z * 0.5 *(1.0-z*y)
	mult	$f0,  $f1,  $f0			# undo the rescaling

	addq	$30 , 32, $30
	ret
	.align 4

$Zero:
	fmov	$f16, $f0			# zero is returned unchanged
	addq	$30 , 32, $30
	ret
	.end sqrt
